4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
11 from xml.dom.minidom import parseString
29 def __init__(self, XMLString):
31 self.dom = parseString(XMLString)
33 # find out what sort of XML format we're dealing with
34 if self.dom.documentElement.tagName == 'rss':
35 # this is some sort of RSS feed
36 # find out what version
37 if self.dom.documentElement.attributes.has_key('version'):
38 version = self.dom.documentElement.attributes['version'].value
40 # this is an RSS2 document
41 self.news = RSS2Parse(self.dom)
43 sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
45 sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
47 sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % dom.documentElement.tagname)
50 "Return a list of Blog objects from the XML file we parsed"
51 # quick cache for XML parsing
55 self.bloglist = self.news.parse()
59 def __init__(self, dom):
61 self.root = dom.documentElement
63 def __retrieve_value__(self, fromNode):
64 "Returns the value from between two nodes, ie <node>text</node>"
65 for node in fromNode.childNodes:
66 if node.nodeType == 3:
67 # this is the information contained within our node
70 sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
def __parse_item__(self, fromNode):
    """Return a BlogItem collected from fromNode.

    fromNode -- the DOM element for a single feed item.

    The text of the title, link and description children is stored on
    itemTitle, itemURL and contents.  pubDate is parsed with the formats
    below and stored on itemDate as a time.mktime() timestamp; when no
    format matches, a debug line goes to stderr and itemDate is not
    assigned here.
    """
    # pubDate layouts, tried in order
    date_formats = (
        '%a, %d %b %Y %H:%M:%S %Z',
        '%a, %d %b %Y %H:%M:%S +0000',
    )

    item = BlogItem()
    for node in fromNode.childNodes:
        if node.nodeType != node.ELEMENT_NODE:
            continue

        tag = node.tagName
        if tag == 'title':
            item.itemTitle = self.__retrieve_value__(node)
        elif tag == 'pubDate':
            # Extract the text once: re-reading it for every attempt and
            # again for the error message repeated any debug output the
            # helper produces.
            stamp = self.__retrieve_value__(node)
            for fmt in date_formats:
                try:
                    # NOTE(review): time.mktime() treats the struct as
                    # local time, so a +0000 stamp is shifted by the local
                    # UTC offset.  Kept as-is because the consumers of
                    # itemDate are not visible here -- confirm whether
                    # calendar.timegm() is wanted.
                    item.itemDate = time.mktime(time.strptime(stamp, fmt))
                except (TypeError, ValueError, OverflowError):
                    # TypeError: stamp is None; ValueError: layout mismatch.
                    # NOTE(review): confirm nothing relied on other
                    # exception types being swallowed at this point.
                    continue
                break
            else:
                sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % (stamp,))
        elif tag == 'link':
            item.itemURL = self.__retrieve_value__(node)
        elif tag == 'description':
            item.contents = self.__retrieve_value__(node)
    return item
94 "Returns a list of Blog objects for parsing into an arbitrary data format."
96 for node in self.root.childNodes:
97 if node.nodeType == 1 and node.tagName == 'channel':
99 channellist.append(channel)
100 # populate channel with information from the blog
101 for node2 in node.childNodes:
102 if node2.nodeType == 1 and node2.tagName == 'title':
103 channel.blogTitle = self.__retrieve_value__(node2)
104 elif node2.nodeType == 1 and node2.tagName == 'link':
105 channel.blogURL = self.__retrieve_value__(node2)
106 elif node2.nodeType == 1 and node2.tagName == 'image':
107 for node3 in node2.childNodes:
108 if node3.nodeType == 1 and node3.tagName == 'url':
109 channel.imageURL = self.__retrieve_value__(node3)
110 elif node3.nodeType == 1 and node3.tagName == 'link':
111 channel.imageLink = self.__retrieve_value__(node3)
112 elif node2.nodeType == 1 and node2.tagName == 'item':
113 item = self.__parse_item__(node2)
114 channel.items.append(item)