Initial Upload
[planet-ucc.git] / XMLParse.py
1 #
2 # XMLParse.py
3 #
4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
6 #
7 # (c) 2004, Davyd Madeley <[email protected]>
8 #
9
10 import sys, time
11 from xml.dom.minidom import parseString
12
class Blog:
    """One news feed (an RSS <channel>) together with its entries.

    Attributes:
      blogTitle -- text of the channel's <title>
      blogURL   -- text of the channel's <link>
      imageURL  -- text of <url> inside the channel's <image>
      imageLink -- text of <link> inside the channel's <image>
      items     -- BlogItem objects, in document order
    """

    def __init__(self):
        # nothing is known about the feed until a parser fills these in
        self.blogTitle = self.blogURL = None
        self.imageURL = self.imageLink = None
        # a fresh list per instance, so feeds never share their entries
        self.items = []
20
class BlogItem:
    """A single entry (an RSS <item>) belonging to a Blog.

    Attributes:
      itemTitle -- text of the item's <title>
      itemDate  -- the item's <pubDate> as seconds since the epoch
      itemURL   -- text of the item's <link>
      contents  -- text of the item's <description>
    """

    def __init__(self):
        # every field starts out unknown; the parser assigns what it finds
        for field in ('itemTitle', 'itemDate', 'itemURL', 'contents'):
            setattr(self, field, None)
27
class XMLParse:
    """Work out what kind of news feed an XML string is and parse it.

    XMLString is handed to xml.dom.minidom.parseString, so malformed XML
    raises whatever that function raises.  Only RSS 2.0 is understood;
    for anything else a DEBUG line is written to stderr and parse()
    returns an empty list.
    """

    def __init__(self, XMLString):
        # parse our XML file
        self.dom = parseString(XMLString)
        self.bloglist = None
        # format-specific parser; stays None when the format is unsupported
        self.news = None
        root = self.dom.documentElement
        # find out what sort of XML format we're dealing with
        if root.tagName != 'rss':
            sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % root.tagName)
            return
        # this is some sort of RSS feed, find out what version
        if not root.hasAttribute('version'):
            sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
            return
        version = root.getAttribute('version')
        if version == '2.0':
            # this is an RSS2 document
            self.news = RSS2Parse(self.dom)
        else:
            sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)

    def parse(self):
        "Return a list of Blog objects from the XML file we parsed"
        # quick cache for XML parsing; compare against None so that an
        # empty result is cached too instead of being parsed again
        if self.bloglist is None:
            if self.news is None:
                # unsupported document: there is nothing to report
                self.bloglist = []
            else:
                self.bloglist = self.news.parse()
        return self.bloglist
57
class RSS2Parse:
    """Turn an RSS 2.0 DOM document into Blog and BlogItem objects.

    The constructor keeps the document (self.dom) and its root element
    (self.root); parse() returns one Blog per <channel> under the root.
    """

    # DOM nodeType codes: 1 is an element, 3 is text, 4 is a CDATA section
    _ELEMENT = 1
    _TEXT_TYPES = (3, 4)

    # pubDate layouts, tried in order; the second matches a literal +0000
    _DATE_FORMATS = (
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y %H:%M:%S +0000',
    )

    def __init__(self, dom):
        self.dom = dom
        self.root = dom.documentElement

    def __elements__(self, parent):
        "Returns the element children of parent, in document order"
        return [child for child in parent.childNodes
                if child.nodeType == self._ELEMENT]

    def __retrieve_value__(self, fromNode):
        "Returns the value from between two nodes, ie <node>text</node>"
        children = fromNode.childNodes
        # <![CDATA[...]]> arrives as its own node type and may sit next to
        # ordinary text, so collect both kinds instead of only looking at
        # the first child
        pieces = [child.nodeValue for child in children
                  if child.nodeType in self._TEXT_TYPES]
        if pieces:
            return ''.join(pieces)
        if children:
            # the node only holds other elements, comments and the like
            sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
        # an empty element (<node/>) simply has no value
        return None

    def __parse_date__(self, text):
        "Returns a pubDate string as seconds since the epoch, or None"
        # NOTE(review): time.mktime reads the parsed fields as local time,
        # so the zone named in the string is not applied -- confirm that
        # callers expect this before changing it
        if text is not None:
            stamp = text.strip()
            for layout in self._DATE_FORMATS:
                try:
                    return time.mktime(time.strptime(stamp, layout))
                except (ValueError, OverflowError):
                    # wrong layout or out-of-range date: try the next one
                    continue
        sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % text)
        return None

    def __parse_item__(self, fromNode):
        "Returns a BlogItem collected from fromNode"
        item = BlogItem()
        for node in self.__elements__(fromNode):
            tag = node.tagName
            if tag == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif tag == 'pubDate':
                stamp = self.__parse_date__(self.__retrieve_value__(node))
                # leave the date untouched when this pubDate is unusable
                if stamp is not None:
                    item.itemDate = stamp
            elif tag == 'link':
                item.itemURL = self.__retrieve_value__(node)
            elif tag == 'description':
                item.contents = self.__retrieve_value__(node)
        return item

    def parse(self):
        "Returns a list of Blog objects for parsing into an arbitrary data format."
        channellist = []
        for node in self.__elements__(self.root):
            if node.tagName != 'channel':
                continue
            channel = Blog()
            channellist.append(channel)
            # populate channel with information from the blog
            for node2 in self.__elements__(node):
                tag = node2.tagName
                if tag == 'title':
                    channel.blogTitle = self.__retrieve_value__(node2)
                elif tag == 'link':
                    channel.blogURL = self.__retrieve_value__(node2)
                elif tag == 'image':
                    for node3 in self.__elements__(node2):
                        if node3.tagName == 'url':
                            channel.imageURL = self.__retrieve_value__(node3)
                        elif node3.tagName == 'link':
                            channel.imageLink = self.__retrieve_value__(node3)
                elif tag == 'item':
                    channel.items.append(self.__parse_item__(node2))
        return channellist

UCC git Repository :: git.ucc.asn.au