Added icon.
[planet-ucc.git] / XMLParse.py
1 #
2 # XMLParse.py
3 #
4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
6 #
7 # (c) 2004, Davyd Madeley <[email protected]>
8 #
9
10 import sys, time
11 from xml.dom.minidom import parseString
12
class Blog:
    """Container for one feed (channel) and the entries collected from it."""

    def __init__(self):
        # feed-level metadata; every field stays None until a parser sets it
        self.blogTitle = self.blogURL = None
        self.imageURL = self.imageLink = None
        # entries belonging to this feed, in the order they were appended
        self.items = []
20
class BlogItem:
    """Container for a single entry (post) of a feed."""

    def __init__(self):
        # all fields start out unset; the parsers assign the ones they find
        self.itemTitle = self.itemDate = None
        self.itemURL = self.contents = None
27
class XMLParse:
    """Detect the format of an XML news stream and delegate parsing.

    The document is parsed with minidom on construction.  Depending on the
    root element, ``self.news`` is set to an ``RSS2Parse`` (``<rss
    version="2.0">``) or an ``RDFParse`` (``<rdf:RDF>``) instance.  For any
    other document a DEBUG line is written to stderr and ``self.news``
    stays ``None``.
    """

    def __init__(self, XMLString):
        """Parse ``XMLString`` and pick the matching feed parser.

        Raises whatever ``parseString`` raises for malformed XML.
        """
        # parse our XML file
        self.dom = parseString(XMLString)
        self.bloglist = None
        # stays None when the format is not recognised, so that parse() can
        # return an empty result instead of failing on a missing attribute
        self.news = None

        root = self.dom.documentElement
        # find out what sort of XML format we're dealing with
        if root.tagName == 'rss':
            # this is some sort of RSS feed; find out what version
            if root.hasAttribute('version'):
                version = root.getAttribute('version')
                if version == '2.0':
                    # this is an RSS2 document
                    self.news = RSS2Parse(self.dom)
                else:
                    sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
            else:
                sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
        elif root.tagName == 'rdf:RDF':
            # this is an RDF document
            self.news = RDFParse(self.dom)
        else:
            sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % root.tagName)

    def parse(self):
        """Return a list of Blog objects from the XML file we parsed.

        The result is computed once and cached in ``self.bloglist``.  An
        unrecognised document yields an empty list.
        """
        # compare against None so that an empty result is cached as well
        if self.bloglist is None:
            if self.news is None:
                self.bloglist = []
            else:
                self.bloglist = self.news.parse()
        return self.bloglist
60
class RDFParse:
    """Parser for RDF feeds, i.e. documents whose root element is rdf:RDF.

    ``channel`` and ``item`` elements are read as direct children of the
    root; everything is collected into a single Blog object.
    """

    # DOM nodeType values: 1 is an element, 3 is text, 4 is a CDATA section
    ELEMENT_NODE = 1
    TEXT_NODE_TYPES = (3, 4)

    def __init__(self, dom):
        self.dom = dom
        self.root = dom.documentElement

    def __retrieve_value__(self, fromNode):
        """Return the text held directly inside fromNode, or None.

        Direct text and CDATA children are concatenated; text nested inside
        child elements is not included.  When fromNode has children but
        none of them carries text, a DEBUG line is written to stderr.
        """
        parts = [child.nodeValue for child in fromNode.childNodes
                 if child.nodeType in self.TEXT_NODE_TYPES]
        if parts:
            return ''.join(parts)
        if fromNode.childNodes:
            sys.stderr.write('DEBUG: RDFParse: Asked to retrieve value from wrong part of tree\n')
        return None

    def __parse_date__(self, date):
        """Convert a ``YYYY-MM-DDThh:mm:ss[.fff][Z|+hh:mm]`` string to epoch seconds.

        A stated UTC offset is honoured.  A string without any offset is
        interpreted as local time.  Raises ValueError when the string does
        not have the expected shape.
        """
        import calendar
        import re

        if not date:
            raise ValueError('empty date')
        match = re.match(
            r'^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?\s*(Z|[+-]\d{2}:?\d{2})?$',
            date.strip())
        if match is None:
            raise ValueError('unsupported date format: %r' % (date,))
        stamp = time.strptime(match.group(1), '%Y-%m-%dT%H:%M:%S')
        zone = match.group(2)
        if zone is None:
            # no offset in the string: the best we can do is assume local time
            return time.mktime(stamp)
        offset = 0
        if zone != 'Z':
            digits = zone[1:].replace(':', '')
            offset = int(digits[:2]) * 3600 + int(digits[2:]) * 60
            if zone[0] == '-':
                offset = -offset
        # timegm() reads the fields as UTC; subtracting the offset turns the
        # wall-clock time of the stated zone into real UTC seconds
        return float(calendar.timegm(stamp) - offset)

    def __parse_item__(self, fromNode):
        """Return a BlogItem built from the children of an item element."""
        item = BlogItem()
        for node in fromNode.childNodes:
            if node.nodeType != self.ELEMENT_NODE:
                continue
            tag = node.tagName
            if tag == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif tag == 'link':
                item.itemURL = self.__retrieve_value__(node)
            elif tag == 'dc:date':
                date = self.__retrieve_value__(node)
                try:
                    item.itemDate = self.__parse_date__(date)
                except (ValueError, OverflowError):
                    # leave itemDate unset rather than abort the whole feed
                    sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
            elif tag == 'description':
                item.contents = self.__retrieve_value__(node)
        return item

    def parse(self):
        """Return a one-element list holding the Blog read from the document."""
        channel = Blog()
        for node in self.root.childNodes:
            if node.nodeType != self.ELEMENT_NODE:
                continue
            if node.tagName == 'channel':
                for node2 in node.childNodes:
                    if node2.nodeType != self.ELEMENT_NODE:
                        continue
                    if node2.tagName == 'title':
                        channel.blogTitle = self.__retrieve_value__(node2)
                    elif node2.tagName == 'link':
                        channel.blogURL = self.__retrieve_value__(node2)
            elif node.tagName == 'item':
                channel.items.append(self.__parse_item__(node))
        return [channel]
100
class RSS2Parse:
    """Parser for RSS 2.0 feeds; returns one Blog per channel element."""

    # DOM nodeType values: 1 is an element, 3 is text, 4 is a CDATA section
    ELEMENT_NODE = 1
    TEXT_NODE_TYPES = (3, 4)

    def __init__(self, dom):
        self.dom = dom
        self.root = dom.documentElement

    def __retrieve_value__(self, fromNode):
        """Returns the value from between two nodes, ie <node>text</node>.

        Direct text and CDATA children are concatenated; text nested inside
        child elements is not included.  Returns None when there is no such
        text, writing a DEBUG line to stderr if fromNode has other children.
        """
        parts = [child.nodeValue for child in fromNode.childNodes
                 if child.nodeType in self.TEXT_NODE_TYPES]
        if parts:
            # this is the information contained within our node
            return ''.join(parts)
        if fromNode.childNodes:
            sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
        return None

    def __parse_date__(self, date):
        """Convert an RFC 822 style date string to epoch seconds.

        Numeric offsets and the zone names known to email.utils are applied.
        A date whose zone cannot be determined is interpreted as local time.
        Raises ValueError when the string cannot be parsed.
        """
        from email.utils import mktime_tz, parsedate_tz

        fields = parsedate_tz(date) if date else None
        if fields is None:
            raise ValueError('unparseable date: %r' % (date,))
        try:
            return float(mktime_tz(fields))
        except (OverflowError, TypeError):
            raise ValueError('date out of range: %r' % (date,))

    def __parse_item__(self, fromNode):
        """Returns a BlogItem collected from fromNode."""
        item = BlogItem()
        for node in fromNode.childNodes:
            if node.nodeType != self.ELEMENT_NODE:
                continue
            tag = node.tagName
            if tag == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif tag == 'pubDate':
                date = self.__retrieve_value__(node)
                try:
                    item.itemDate = self.__parse_date__(date)
                except ValueError:
                    # leave itemDate unset rather than abort the whole feed
                    sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % date)
            elif tag == 'link':
                item.itemURL = self.__retrieve_value__(node)
            elif tag == 'description':
                item.contents = self.__retrieve_value__(node)
        return item

    def __parse_image__(self, fromNode, channel):
        """Copy the url and link children of an image element onto channel."""
        for node in fromNode.childNodes:
            if node.nodeType != self.ELEMENT_NODE:
                continue
            if node.tagName == 'url':
                channel.imageURL = self.__retrieve_value__(node)
            elif node.tagName == 'link':
                channel.imageLink = self.__retrieve_value__(node)

    def parse(self):
        """Returns a list of Blog objects for parsing into an arbitrary data format."""
        channellist = []
        for node in self.root.childNodes:
            if node.nodeType != self.ELEMENT_NODE or node.tagName != 'channel':
                continue
            channel = Blog()
            channellist.append(channel)
            # populate channel with information from the blog
            for node2 in node.childNodes:
                if node2.nodeType != self.ELEMENT_NODE:
                    continue
                tag = node2.tagName
                if tag == 'title':
                    channel.blogTitle = self.__retrieve_value__(node2)
                elif tag == 'link':
                    channel.blogURL = self.__retrieve_value__(node2)
                elif tag == 'image':
                    self.__parse_image__(node2, channel)
                elif tag == 'item':
                    channel.items.append(self.__parse_item__(node2))
        return channellist

UCC git Repository :: git.ucc.asn.au