4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
11 from xml.dom.minidom import parseString
29 def __init__(self, XMLString):
# Parse XMLString with minidom and choose a feed parser from the tag name of
# the document's root element. The chosen parser object is stored in self.news.
#   XMLString -- XML text handed straight to xml.dom.minidom.parseString
# Problems with the document type are reported as DEBUG lines on stderr.
31 self.dom = parseString(XMLString)
33 # find out what sort of XML format we're dealing with
34 if self.dom.documentElement.tagName == 'rss':
35 # this is some sort of RSS feed
36 # find out what version
# NOTE(review): has_key() is the Python 2 mapping API; on Python 3 this
# lookup would need `in` or Element.hasAttribute() -- confirm the target
# interpreter before porting.
37 if self.dom.documentElement.attributes.has_key('version'):
38 version = self.dom.documentElement.attributes['version'].value
40 # this is an RSS2 document
41 self.news = RSS2Parse(self.dom)
# diagnostic: the version attribute exists but is not one that is handled
43 sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
# diagnostic: the <rss> root element carries no version attribute
45 sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
46 elif self.dom.documentElement.tagName == 'rdf:RDF':
47 # this is an RDF document
48 self.news = RDFParse(self.dom)
49 elif self.dom.documentElement.tagName == 'feed':
50 # this seems to be an Atom feed
51 self.news = AtomParse(self.dom)
# diagnostic: root tag is none of 'rss', 'rdf:RDF' or 'feed'
53 sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % self.dom.documentElement.tagName)
56 "Return a list of Blog objects from the XML file we parsed"
57 # quick cache for XML parsing
# Delegates to the format-specific parser held in self.news and keeps the
# result on self.bloglist so it can be reused.
# NOTE(review): self.news is only assigned for recognised document types in
# the visible constructor -- verify what happens here for unknown documents.
61 self.bloglist = self.news.parse()
65 "Generic class for parsing XML feeds"
66 def __init__(self, dom):
# dom -- a parsed DOM document; only its root element is kept, as self.root,
# which is what the format-specific subclasses iterate over.
68 self.root = dom.documentElement
70 def __retrieve_value__(self, fromNode):
71 "Retrieve a value from between two nodes"
# fromNode -- the DOM node whose direct children are scanned.
72 for node in fromNode.childNodes:
# nodeType 3 is the DOM TEXT_NODE constant, so this looks for a text child
73 if node.nodeType == 3:
# diagnostic for the situation the message describes: the caller asked for
# a value from a part of the tree that does not hold one
76 sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
79 class AtomParse(Parse):
# Atom feed parser: walks the direct children of the feed's root element
# (self.root) and copies what it finds onto `channel` and its items.
# nodeType 1 is the DOM ELEMENT_NODE constant; every tag test is guarded by it
# because non-element children have no tagName to compare.
82 for node in self.root.childNodes:
83 if node.nodeType == 1 and node.tagName == 'title':
# feed-level <title> becomes the blog title
84 channel.blogTitle = self.__retrieve_value__(node)
# feed-level <link rel="alternate"> supplies the blog URL via its href
# NOTE(review): has_key() is Python 2 only; Python 3 would need `in` or
# hasAttribute() -- confirm the target interpreter.
85 elif node.nodeType == 1 and node.tagName == 'link' and node.attributes.has_key('rel') and node.attributes['rel'].value == "alternate":
86 if node.attributes.has_key('href'):
87 channel.blogURL = node.attributes['href'].value
89 sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
90 elif node.nodeType == 1 and node.tagName == 'entry':
91 # create an item and add it to the list
93 channel.items.append(item)
# the item is appended first and then filled in from the <entry>'s children
95 for node2 in node.childNodes:
96 if node2.nodeType == 1 and node2.tagName == 'created':
97 date = self.__retrieve_value__(node2)
# Only the fixed pattern YYYY-MM-DDTHH:MM:SSZ is accepted; time.mktime()
# interprets the parsed struct as local time.
# NOTE(review): 28800 s = 8 h -- presumably a UTC -> UTC+8 shift; confirm
# against the timezone this is deployed in.
99 item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800
101 sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
# entry-level <link rel="alternate"> supplies the item URL via its href
102 elif node2.nodeType == 1 and node2.tagName == 'link' and node2.attributes.has_key('rel') and node2.attributes['rel'].value == 'alternate':
103 if node2.attributes.has_key('href'):
104 item.itemURL = node2.attributes['href'].value
106 sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
107 elif node2.nodeType == 1 and node2.tagName == 'title':
108 item.itemTitle = self.__retrieve_value__(node2)
109 elif node2.nodeType == 1 and node2.tagName == 'summary':
# the value retrieved from a <div> child of <summary> becomes the contents
110 for node3 in node2.childNodes:
111 if node3.nodeType == 1 and node3.tagName == 'div':
112 item.contents = self.__retrieve_value__(node3)
115 class RDFParse(Parse):
# RDF feed parser: <channel> and <item> are both matched in the same loop over
# the root element's direct children, i.e. items are siblings of the channel.
# nodeType 1 is the DOM ELEMENT_NODE constant.
118 for node in self.root.childNodes:
119 if node.nodeType == 1 and node.tagName == 'channel':
# channel metadata: <title> -> blogTitle, <link> -> blogURL
120 for node2 in node.childNodes:
121 if node2.nodeType == 1 and node2.tagName == 'title':
122 channel.blogTitle = self.__retrieve_value__(node2)
123 elif node2.nodeType == 1 and node2.tagName == 'link':
124 channel.blogURL = self.__retrieve_value__(node2)
125 elif node.nodeType == 1 and node.tagName == 'item':
# per-item fields: <title>, <link>, <dc:date>, <description>
127 for node2 in node.childNodes:
128 if node2.nodeType == 1 and node2.tagName == 'title':
129 item.itemTitle = self.__retrieve_value__(node2)
130 elif node2.nodeType == 1 and node2.tagName == 'link':
131 item.itemURL = self.__retrieve_value__(node2)
132 elif node2.nodeType == 1 and node2.tagName == 'dc:date':
133 date = self.__retrieve_value__(node2)
# The pattern ends in the literal text "+08:00", so only timestamps carrying
# exactly that offset parse; no extra offset is added to the result here.
# time.mktime() interprets the parsed struct as local time.
135 item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00'))
137 sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
138 elif node2.nodeType == 1 and node2.tagName == 'description':
139 item.contents = self.__retrieve_value__(node2)
# the item joins the channel once its child elements have been scanned
140 channel.items.append(item)
143 class RSS2Parse(Parse):
# RSS 2 feed parser. nodeType 1 below is the DOM ELEMENT_NODE constant.
144 def __parse_item__(self, fromNode):
145 "Returns a BlogItem collected from fromNode"
# fromNode -- an <item> element; its direct children are mapped onto the
# item's itemTitle, itemDate, itemURL and contents attributes.
147 for node in fromNode.childNodes:
148 if node.nodeType == 1 and node.tagName == 'title':
149 item.itemTitle = self.__retrieve_value__(node)
150 elif node.nodeType == 1 and node.tagName == 'pubDate':
# Two spellings of the same timestamp layout are recognised: one ending in
# the literal "GMT", one ending in the literal "+0000".
# NOTE(review): 28800 s = 8 h -- presumably a UTC -> UTC+8 shift; mktime()
# itself interprets the struct as local time, so confirm the intent.
152 item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S GMT')) + 28800
155 item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000')) + 28800
157 sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % self.__retrieve_value__(node))
158 elif node.nodeType == 1 and node.tagName == 'link':
159 item.itemURL = self.__retrieve_value__(node)
160 elif node.nodeType == 1 and node.tagName == 'description':
161 item.contents = self.__retrieve_value__(node)
165 "Returns a list of Blog objects for parsing into an arbitrary data format."
# Every <channel> directly under the root is collected into channellist, so a
# document with several channels yields several entries.
167 for node in self.root.childNodes:
168 if node.nodeType == 1 and node.tagName == 'channel':
170 channellist.append(channel)
171 # populate channel with information from the blog
172 for node2 in node.childNodes:
173 if node2.nodeType == 1 and node2.tagName == 'title':
174 channel.blogTitle = self.__retrieve_value__(node2)
175 elif node2.nodeType == 1 and node2.tagName == 'link':
176 channel.blogURL = self.__retrieve_value__(node2)
177 elif node2.nodeType == 1 and node2.tagName == 'image':
# <image> contributes its <url> and <link> children to the channel
178 for node3 in node2.childNodes:
179 if node3.nodeType == 1 and node3.tagName == 'url':
180 channel.imageURL = self.__retrieve_value__(node3)
181 elif node3.nodeType == 1 and node3.tagName == 'link':
182 channel.imageLink = self.__retrieve_value__(node3)
183 elif node2.nodeType == 1 and node2.tagName == 'item':
# each <item> is converted by __parse_item__ and attached to this channel
184 item = self.__parse_item__(node2)
185 channel.items.append(item)