4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
11 from xml.dom.minidom import parseString
def __init__(self, XMLString):
    """Parse XMLString and select the format-specific news parser.

    XMLString -- the complete text of an RSS or RDF document.

    The parsed DOM is kept in self.dom.  For an RSS 2.0 or an RDF
    document, self.news is set to an RSS2Parse or RDFParse instance built
    from that DOM.  Any other document is reported on stderr and
    self.news is left unassigned.  Whatever parseString raises for
    malformed input propagates to the caller.
    """
    self.dom = parseString(XMLString)
    root = self.dom.documentElement

    # find out what sort of XML format we're dealing with
    if root.tagName == 'rss':
        # this is some sort of RSS feed; find out what version.
        # hasAttribute()/getAttribute() are used because the attribute
        # map's has_key() only exists on Python 2.
        if root.hasAttribute('version'):
            version = root.getAttribute('version')
            # NOTE(review): the version test itself was not visible in the
            # reviewed excerpt; '2.0' is assumed -- confirm.
            if version == '2.0':
                # this is an RSS2 document
                self.news = RSS2Parse(self.dom)
            else:
                sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
        else:
            sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
    elif root.tagName == 'rdf:RDF':
        # this is an RDF document
        self.news = RDFParse(self.dom)
    else:
        # Report the unrecognised root element.  The name has to come from
        # self.dom (there is no local called "dom") and the DOM attribute
        # is spelled tagName.
        sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % root.tagName)
53 "Return a list of Blog objects from the XML file we parsed"
54 # quick cache for XML parsing
# Delegate to the format-specific parser stored in self.news and keep its
# result on the instance as self.bloglist.
# NOTE(review): the enclosing def line and the cache test are not part of this
# excerpt.  The visible __init__ assigns self.news only for RSS 2 and RDF
# documents, so for any other input this call presumably raises
# AttributeError unless a default is defined elsewhere -- confirm.
58 self.bloglist = self.news.parse()
def __init__(self, dom):
    "Keep the document element of dom in self.root for later traversal"
    document_element = dom.documentElement
    self.root = document_element
def __retrieve_value__(self, fromNode):
    """Return the text held directly inside fromNode, ie <node>text</node>.

    fromNode -- a DOM node whose direct children are inspected.

    The value of the first text or CDATA child is returned.  When
    fromNode has no such child, one debug line is written to stderr and
    None is returned.
    """
    for node in fromNode.childNodes:
        # CDATA sections are separate DOM nodes, so they must be accepted
        # alongside plain text or CDATA-wrapped content is lost.
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            return node.nodeValue
    sys.stderr.write('DEBUG: RDFParse: Asked to retrieve value from wrong part of tree\n')
    return None
# Walk the direct children of the root element.  A <channel> element supplies
# the blog title and URL; every <item> element found at the same level
# supplies one entry that is appended to channel.items.
# nodeType == 1 selects element nodes, so text and comment nodes are skipped.
# NOTE(review): the enclosing def line and the statements that create
# `channel` and `item` are not part of this excerpt; presumably this is the
# body of RDFParse.parse() -- confirm.
76 for node in self.root.childNodes:
77 if node.nodeType == 1 and node.tagName == 'channel':
78 for node2 in node.childNodes:
79 if node2.nodeType == 1 and node2.tagName == 'title':
80 channel.blogTitle = self.__retrieve_value__(node2)
81 elif node2.nodeType == 1 and node2.tagName == 'link':
82 channel.blogURL = self.__retrieve_value__(node2)
83 elif node.nodeType == 1 and node.tagName == 'item':
# Copy title, link, dc:date and description of this <item> onto `item`.
85 for node2 in node.childNodes:
86 if node2.nodeType == 1 and node2.tagName == 'title':
87 item.itemTitle = self.__retrieve_value__(node2)
88 elif node2.nodeType == 1 and node2.tagName == 'link':
89 item.itemURL = self.__retrieve_value__(node2)
90 elif node2.nodeType == 1 and node2.tagName == 'dc:date':
91 date = self.__retrieve_value__(node2)
# NOTE(review): the pattern contains the literal text "+07:00", so only
# timestamps carrying exactly that offset can match; anything else ends up in
# the "unparseable" message below.  time.mktime() then reads the fields as
# local time -- confirm whether other offsets need to be supported.
93 item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+07:00'))
95 sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
96 elif node2.nodeType == 1 and node2.tagName == 'description':
97 item.contents = self.__retrieve_value__(node2)
98 channel.items.append(item)
def __init__(self, dom):
    "Keep the document element of dom in self.root for later traversal"
    document_element = dom.documentElement
    self.root = document_element
def __retrieve_value__(self, fromNode):
    """Returns the value from between two nodes, ie <node>text</node>

    fromNode -- a DOM node whose direct children are inspected.

    The value of the first text or CDATA child is returned.  When
    fromNode has no such child, one debug line is written to stderr and
    None is returned.
    """
    for node in fromNode.childNodes:
        # this is the information contained within our node.  CDATA
        # sections are separate DOM nodes, so they must be accepted
        # alongside plain text or CDATA-wrapped content is lost.
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            return node.nodeValue
    sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
    return None
# Copy the title, pubDate, link and description children of fromNode onto
# `item` (itemTitle, itemDate, itemURL, contents).  Only element children
# (nodeType == 1) are inspected; other child elements are ignored.
# NOTE(review): the statement that creates `item`, the try/except lines around
# the date parsing and the return are not part of this excerpt.
116 def __parse_item__(self, fromNode):
117 "Returns a BlogItem collected from fromNode"
119 for node in fromNode.childNodes:
120 if node.nodeType == 1 and node.tagName == 'title':
121 item.itemTitle = self.__retrieve_value__(node)
122 elif node.nodeType == 1 and node.tagName == 'pubDate':
# Two patterns are attempted for the date text: one ending in a zone name (%Z)
# and one ending in the literal "+0000".  The text is re-read from the node
# for each attempt and again for the debug message.
# NOTE(review): time.mktime() reads the parsed fields as local time, so the
# zone in the string does not shift the result -- confirm this is intended.
124 item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S %Z'))
127 item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000'))
129 sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % self.__retrieve_value__(node))
130 elif node.nodeType == 1 and node.tagName == 'link':
131 item.itemURL = self.__retrieve_value__(node)
132 elif node.nodeType == 1 and node.tagName == 'description':
133 item.contents = self.__retrieve_value__(node)
# Walk the direct children of the root element.  Each <channel> element
# becomes one entry in channellist, filled from its title, link and image
# children; every <item> inside the channel is converted by __parse_item__
# and appended to channel.items.
# NOTE(review): the enclosing def line and the statements that create
# `channellist` and `channel` are not part of this excerpt; presumably this is
# the body of RSS2Parse.parse() -- confirm.
137 "Returns a list of Blog objects for parsing into an arbitrary data format."
139 for node in self.root.childNodes:
140 if node.nodeType == 1 and node.tagName == 'channel':
# The channel is added to the list first and populated afterwards.
142 channellist.append(channel)
143 # populate channel with information from the blog
144 for node2 in node.childNodes:
145 if node2.nodeType == 1 and node2.tagName == 'title':
146 channel.blogTitle = self.__retrieve_value__(node2)
147 elif node2.nodeType == 1 and node2.tagName == 'link':
148 channel.blogURL = self.__retrieve_value__(node2)
149 elif node2.nodeType == 1 and node2.tagName == 'image':
# <image> is a container: its <url> and <link> children are stored separately
# as imageURL and imageLink.
150 for node3 in node2.childNodes:
151 if node3.nodeType == 1 and node3.tagName == 'url':
152 channel.imageURL = self.__retrieve_value__(node3)
153 elif node3.nodeType == 1 and node3.tagName == 'link':
154 channel.imageLink = self.__retrieve_value__(node3)
155 elif node2.nodeType == 1 and node2.tagName == 'item':
156 item = self.__parse_item__(node2)
157 channel.items.append(item)