CSS tweak
[planet-ucc.git] / XMLParse.py
1 #
2 # XMLParse.py
3 #
4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
6 #
7 # (c) 2004, Davyd Madeley <[email protected]>
8 #
9
10 import sys, time
11 from xml.dom.minidom import parseString
12
class Blog:
        "Plain record describing one feed and the items collected from it"
        def __init__(self):
                # each instance gets its own list; the parsers append BlogItem objects to it
                self.items      = []
                # descriptive fields stay None until a parser fills them in
                self.blogTitle = self.blogURL = None
                self.imageURL = self.imageLink = None
20
class BlogItem:
        "Plain record describing a single entry of a feed"
        def __init__(self):
                # all fields stay None until a parser fills them in
                self.itemTitle = self.itemURL = None
                self.itemDate = self.contents = None
27
class XMLParse:
        """Work out which feed format an XML string uses and parse it accordingly.

        The root element decides the parser: 'rss' with version="2.0" uses
        RSS2Parse, 'rdf:RDF' uses RDFParse and 'feed' uses AtomParse.  Anything
        else is reported on stderr and leaves the object without a parser, in
        which case parse() returns an empty list.
        """
        def __init__(self, XMLString):
                # parse our XML file (parseString raises on malformed XML)
                self.dom        = parseString(XMLString)
                self.bloglist   = None
                # stays None when the document format is not recognised
                self.news       = None
                root            = self.dom.documentElement
                # find out what sort of XML format we're dealing with
                if root.tagName == 'rss':
                        # this is some sort of RSS feed, find out what version
                        if root.hasAttribute('version'):
                                version = root.getAttribute('version')
                                if version == '2.0':
                                        # this is an RSS2 document
                                        self.news       = RSS2Parse(self.dom)
                                else:
                                        sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
                        else:
                                sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
                elif root.tagName == 'rdf:RDF':
                        # this is an RDF document
                        self.news       = RDFParse(self.dom)
                elif root.tagName == 'feed':
                        # this seems to be an Atom feed
                        self.news       = AtomParse(self.dom)
                else:
                        sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % root.tagName)

        def parse(self):
                "Return a list of Blog objects from the XML file we parsed (empty if the format was not recognised)"
                # quick cache for XML parsing; compare against None so that an
                # empty result is cached too
                if self.bloglist is None:
                        if self.news is None:
                                self.bloglist   = []
                        else:
                                self.bloglist   = self.news.parse()
                return self.bloglist
63
class Parse:
        "Generic class for parsing XML feeds"
        def __init__(self, dom):
                # keep both the document and its root element for the subclasses
                self.dom        = dom
                self.root       = dom.documentElement

        def __retrieve_value__(self, fromNode):
                """Return the text stored directly inside fromNode.

                The run of text and CDATA children at the start of fromNode is
                joined into one string, so a value wrapped in <![CDATA[ ]]> is
                returned like plain text.  Returns None when fromNode has no
                children.  When the first child is neither text nor CDATA a
                debug message is written to stderr and None is returned.
                """
                pieces  = []
                for node in fromNode.childNodes:
                        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
                                pieces.append(node.nodeValue)
                                continue
                        if not pieces:
                                sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
                                return None
                        # only the leading run of character data counts
                        break
                if pieces:
                        return ''.join(pieces)
                return None
78
class AtomParse(Parse):
        "Parser for Atom feeds (documents whose root element is 'feed')"

        def __alternate_href__(self, node):
                "Return the href of a link element, or None (after a debug message) if it has none"
                if node.hasAttribute('href'):
                        return node.getAttribute('href')
                sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
                return None

        def __parse_entry__(self, fromNode):
                "Returns a BlogItem collected from the 'entry' element fromNode"
                item    = BlogItem()
                for node in fromNode.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        tag     = node.tagName
                        if tag == 'created':
                                date    = self.__retrieve_value__(node)
                                try:
                                        # 28800 s = 8 h.  NOTE(review): presumably compensates for
                                        # mktime() reading the 'Z' (UTC) stamp as local time on a
                                        # UTC+8 host -- confirm before changing
                                        item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800
                                except (TypeError, ValueError, OverflowError):
                                        # TypeError: no date text at all; ValueError: wrong format
                                        sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
                        elif tag == 'link' and node.getAttribute('rel') == 'alternate':
                                href    = self.__alternate_href__(node)
                                if href is not None:
                                        item.itemURL    = href
                        elif tag == 'title':
                                item.itemTitle  = self.__retrieve_value__(node)
                        elif tag == 'summary':
                                # the text is taken from a 'div' directly inside the summary
                                for child in node.childNodes:
                                        if child.nodeType == child.ELEMENT_NODE and child.tagName == 'div':
                                                item.contents   = self.__retrieve_value__(child)
                return item

        def parse(self):
                "Returns a list holding the single Blog described by the feed"
                channel = Blog()
                for node in self.root.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        tag     = node.tagName
                        if tag == 'title':
                                channel.blogTitle       = self.__retrieve_value__(node)
                        elif tag == 'link' and node.getAttribute('rel') == 'alternate':
                                # getAttribute() gives '' for a missing rel, so only
                                # explicit rel="alternate" links are used
                                href    = self.__alternate_href__(node)
                                if href is not None:
                                        channel.blogURL = href
                        elif tag == 'entry':
                                channel.items.append(self.__parse_entry__(node))
                return [channel]
114
class RDFParse(Parse):
        "Parser for RDF feeds (documents whose root element is 'rdf:RDF')"

        def __parse_item__(self, fromNode):
                "Returns a BlogItem collected from the 'item' element fromNode"
                item    = BlogItem()
                for node in fromNode.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        tag     = node.tagName
                        if tag == 'title':
                                item.itemTitle  = self.__retrieve_value__(node)
                        elif tag == 'link':
                                item.itemURL    = self.__retrieve_value__(node)
                        elif tag == 'dc:date':
                                date    = self.__retrieve_value__(node)
                                try:
                                        # only stamps carrying a literal +08:00 offset match this
                                        # format; anything else is reported and the date left unset
                                        item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00'))
                                except (TypeError, ValueError, OverflowError):
                                        # TypeError: no date text at all; ValueError: wrong format
                                        sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
                        elif tag == 'description':
                                item.contents   = self.__retrieve_value__(node)
                return item

        def parse(self):
                "Returns a list holding the single Blog described by the feed"
                channel = Blog()
                for node in self.root.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        if node.tagName == 'channel':
                                # the channel element only supplies the blog's title and link
                                for node2 in node.childNodes:
                                        if node2.nodeType != node2.ELEMENT_NODE:
                                                continue
                                        if node2.tagName == 'title':
                                                channel.blogTitle       = self.__retrieve_value__(node2)
                                        elif node2.tagName == 'link':
                                                channel.blogURL         = self.__retrieve_value__(node2)
                        elif node.tagName == 'item':
                                # items are read from the root's children, not from inside the channel
                                channel.items.append(self.__parse_item__(node))
                return [channel]
142
class RSS2Parse(Parse):
        "Parser for RSS 2.0 feeds (root element 'rss' with version 2.0)"

        def __parse_date__(self, date):
                "Return the timestamp for a pubDate string, or None (after a debug message) if it is unparseable"
                for format in ('%a, %d %b %Y %H:%M:%S GMT', '%a, %d %b %Y %H:%M:%S +0000'):
                        try:
                                # 28800 s = 8 h.  NOTE(review): presumably compensates for
                                # mktime() reading the GMT stamp as local time on a UTC+8
                                # host -- confirm before changing
                                return time.mktime(time.strptime(date, format)) + 28800
                        except (TypeError, ValueError, OverflowError):
                                # TypeError: no date text at all; ValueError: wrong format
                                continue
                sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % date)
                return None

        def __parse_item__(self, fromNode):
                "Returns a BlogItem collected from fromNode"
                item    = BlogItem()
                for node in fromNode.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        tag     = node.tagName
                        if tag == 'title':
                                item.itemTitle  = self.__retrieve_value__(node)
                        elif tag == 'pubDate':
                                # fetch the text once and try each known format on it
                                stamp   = self.__parse_date__(self.__retrieve_value__(node))
                                if stamp is not None:
                                        item.itemDate   = stamp
                        elif tag == 'link':
                                item.itemURL    = self.__retrieve_value__(node)
                        elif tag == 'description':
                                item.contents   = self.__retrieve_value__(node)
                return item

        def __parse_channel__(self, fromNode):
                "Returns a Blog collected from the 'channel' element fromNode"
                channel = Blog()
                for node in fromNode.childNodes:
                        if node.nodeType != node.ELEMENT_NODE:
                                continue
                        tag     = node.tagName
                        if tag == 'title':
                                channel.blogTitle       = self.__retrieve_value__(node)
                        elif tag == 'link':
                                channel.blogURL         = self.__retrieve_value__(node)
                        elif tag == 'image':
                                for node2 in node.childNodes:
                                        if node2.nodeType != node2.ELEMENT_NODE:
                                                continue
                                        if node2.tagName == 'url':
                                                channel.imageURL        = self.__retrieve_value__(node2)
                                        elif node2.tagName == 'link':
                                                channel.imageLink       = self.__retrieve_value__(node2)
                        elif tag == 'item':
                                channel.items.append(self.__parse_item__(node))
                return channel

        def parse(self):
                "Returns a list of Blog objects for parsing into an arbitrary data format."
                channellist     = []
                for node in self.root.childNodes:
                        # one Blog per 'channel' element directly under the root
                        if node.nodeType == node.ELEMENT_NODE and node.tagName == 'channel':
                                channellist.append(self.__parse_channel__(node))
                return channellist

UCC git Repository :: git.ucc.asn.au