+++ /dev/null
-#
-# XMLParse.py
-#
-# Parse arbitrary XML news streams into an object type
-# understandable by Planet UCC.
-#
-#
-
import calendar
import sys
import time
from xml.dom.minidom import parseString
-
class Blog:
    "One feed: its title and URL, optional image details, and its entries"
    def __init__(self):
        # each instance owns a fresh list; the feed parsers append BlogItem
        # objects to it
        self.items = []
        # nothing is known about the feed until a parser fills these in
        self.blogTitle = self.blogURL = None
        self.imageURL = self.imageLink = None
-
class BlogItem:
    "A single feed entry: title, date, URL and body text"
    def __init__(self):
        # all fields start out unset; the feed parsers assign whichever ones
        # the entry actually provides
        self.itemTitle = self.itemURL = None
        self.itemDate = self.contents = None
-
class XMLParse:
    """Detect which kind of news feed an XML string holds and parse it.

    The root element selects the parser: 'rss' with version '2.0' uses
    RSS2Parse, 'rdf:RDF' uses RDFParse and 'feed' uses AtomParse.  Any other
    document is reported on stderr and treated as containing no blogs.
    """
    def __init__(self, XMLString):
        """Parse XMLString into a DOM and choose the matching feed parser.

        Malformed XML is not caught here; whatever parseString raises
        propagates to the caller.
        """
        # parse our XML file
        self.dom = parseString(XMLString)
        self.bloglist = None
        # stays None when the format is not recognised, so that parse() can
        # hand back an empty list instead of failing on a missing attribute
        self.news = None
        root = self.dom.documentElement
        # find out what sort of XML format we're dealing with
        if root.tagName == 'rss':
            # this is some sort of RSS feed; find out what version
            if not root.hasAttribute('version'):
                sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
            else:
                version = root.getAttribute('version')
                if version == '2.0':
                    # this is an RSS2 document
                    self.news = RSS2Parse(self.dom)
                else:
                    sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
        elif root.tagName == 'rdf:RDF':
            # this is an RDF document
            self.news = RDFParse(self.dom)
        elif root.tagName == 'feed':
            # this seems to be an Atom feed
            self.news = AtomParse(self.dom)
        else:
            sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % root.tagName)

    def parse(self):
        "Return a list of Blog objects from the XML file we parsed"
        # quick cache for XML parsing; compare against None so that an empty
        # result is cached too rather than re-parsed on every call
        if self.bloglist is None:
            if self.news is None:
                # unrecognised document: nothing to report
                self.bloglist = []
            else:
                self.bloglist = self.news.parse()
        return self.bloglist
-
class Parse:
    "Generic class for parsing XML feeds"
    def __init__(self, dom):
        "Remember the DOM document and its root element for the subclass parsers"
        self.dom = dom
        self.root = dom.documentElement

    def __retrieve_value__(self, fromNode):
        """Retrieve the character data held directly inside fromNode.

        Returns the value of the first child that is a text node or a CDATA
        section (feeds commonly wrap HTML descriptions in CDATA).  If
        fromNode has no such child, a debug line is written to stderr and
        None is returned.
        """
        for node in fromNode.childNodes:
            # the node type constants are available on every DOM node
            if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
                return node.nodeValue
        sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
        return None
-
class AtomParse(Parse):
    """Parser for Atom documents (root element 'feed').

    Reads the feed's 'title' and alternate 'link', and for every 'entry' its
    'created', alternate 'link', 'title' and the 'div' inside 'summary'.
    """
    def _alternate_href(self, node):
        "Return the href attribute of a link element, or None (with a debug line) if absent"
        if node.hasAttribute('href'):
            return node.getAttribute('href')
        sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
        return None

    def _parse_entry(self, entry):
        "Build a BlogItem from the children of one 'entry' element"
        item = BlogItem()
        for node in entry.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'created':
                date = self.__retrieve_value__(node)
                try:
                    # the trailing 'Z' marks the time as UTC, so convert with
                    # timegm, which does not depend on the local timezone;
                    # float() keeps the type mktime used to give
                    item.itemDate = float(calendar.timegm(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')))
                except (TypeError, ValueError):
                    # TypeError: no text at all (date is None);
                    # ValueError: text does not match the format
                    sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
            elif node.tagName == 'link' and node.getAttribute('rel') == 'alternate':
                href = self._alternate_href(node)
                # a link without href must not wipe out an earlier good one
                if href is not None:
                    item.itemURL = href
            elif node.tagName == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif node.tagName == 'summary':
                for child in node.childNodes:
                    if child.nodeType == child.ELEMENT_NODE and child.tagName == 'div':
                        item.contents = self.__retrieve_value__(child)
        return item

    def parse(self):
        "Return a one-element list holding the Blog described by this feed"
        channel = Blog()
        for node in self.root.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'title':
                channel.blogTitle = self.__retrieve_value__(node)
            elif node.tagName == 'link' and node.getAttribute('rel') == 'alternate':
                href = self._alternate_href(node)
                if href is not None:
                    channel.blogURL = href
            elif node.tagName == 'entry':
                # create an item and add it to the list
                channel.items.append(self._parse_entry(node))
        return [channel]
-
class RDFParse(Parse):
    """Parser for RDF documents (root element 'rdf:RDF').

    Reads 'title' and 'link' from the 'channel' element and builds one
    BlogItem per top-level 'item' element from its 'title', 'link',
    'dc:date' and 'description'.
    """
    # seconds east of UTC for the literal '+08:00' suffix that the accepted
    # dc:date format requires
    _DATE_UTC_OFFSET = 8 * 60 * 60

    def _parse_item(self, fromNode):
        "Build a BlogItem from the children of one 'item' element"
        item = BlogItem()
        for node in fromNode.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif node.tagName == 'link':
                item.itemURL = self.__retrieve_value__(node)
            elif node.tagName == 'dc:date':
                date = self.__retrieve_value__(node)
                try:
                    parsed = time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00')
                except (TypeError, ValueError):
                    # TypeError: no text at all (date is None);
                    # ValueError: text does not match the format
                    sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
                else:
                    # the fields are wall-clock time at UTC+8: read them as
                    # UTC and subtract the offset, so the result does not
                    # depend on the timezone this process runs in
                    item.itemDate = float(calendar.timegm(parsed) - self._DATE_UTC_OFFSET)
            elif node.tagName == 'description':
                item.contents = self.__retrieve_value__(node)
        return item

    def parse(self):
        "Return a one-element list holding the Blog described by this document"
        channel = Blog()
        for node in self.root.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'channel':
                for node2 in node.childNodes:
                    if node2.nodeType != node2.ELEMENT_NODE:
                        continue
                    if node2.tagName == 'title':
                        channel.blogTitle = self.__retrieve_value__(node2)
                    elif node2.tagName == 'link':
                        channel.blogURL = self.__retrieve_value__(node2)
            elif node.tagName == 'item':
                channel.items.append(self._parse_item(node))
        return [channel]
-
class RSS2Parse(Parse):
    """Parser for RSS 2.0 documents.

    Every 'channel' element under the root becomes one Blog; its 'item'
    children become BlogItem objects.
    """
    # pubDate layouts that are understood, tried in this order; both describe
    # a time in UTC
    _PUBDATE_FORMATS = (
        '%a, %d %b %Y %H:%M:%S GMT',
        '%a, %d %b %Y %H:%M:%S +0000',
    )

    def __parse_item__(self, fromNode):
        "Returns a BlogItem collected from fromNode"
        item = BlogItem()
        for node in fromNode.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'title':
                item.itemTitle = self.__retrieve_value__(node)
            elif node.tagName == 'pubDate':
                # fetch the text once and reuse it for every attempt
                date = self.__retrieve_value__(node)
                for fmt in self._PUBDATE_FORMATS:
                    try:
                        parsed = time.strptime(date, fmt)
                    except (TypeError, ValueError):
                        # TypeError: no text at all (date is None);
                        # ValueError: text does not match this format
                        continue
                    # the time is UTC, so convert with timegm, which does not
                    # depend on the local timezone; float() keeps the type
                    # mktime used to give
                    item.itemDate = float(calendar.timegm(parsed))
                    break
                else:
                    # no format matched
                    sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % date)
            elif node.tagName == 'link':
                item.itemURL = self.__retrieve_value__(node)
            elif node.tagName == 'description':
                item.contents = self.__retrieve_value__(node)
        return item

    def __parse_image__(self, channel, imageNode):
        "Copy the 'url' and 'link' children of an 'image' element onto channel"
        for node in imageNode.childNodes:
            if node.nodeType != node.ELEMENT_NODE:
                continue
            if node.tagName == 'url':
                channel.imageURL = self.__retrieve_value__(node)
            elif node.tagName == 'link':
                channel.imageLink = self.__retrieve_value__(node)

    def parse(self):
        "Returns a list of Blog objects for parsing into an arbitrary data format."
        channellist = []
        for node in self.root.childNodes:
            if node.nodeType != node.ELEMENT_NODE or node.tagName != 'channel':
                continue
            channel = Blog()
            channellist.append(channel)
            # populate channel with information from the blog
            for node2 in node.childNodes:
                if node2.nodeType != node2.ELEMENT_NODE:
                    continue
                if node2.tagName == 'title':
                    channel.blogTitle = self.__retrieve_value__(node2)
                elif node2.tagName == 'link':
                    channel.blogURL = self.__retrieve_value__(node2)
                elif node2.tagName == 'image':
                    self.__parse_image__(channel, node2)
                elif node2.tagName == 'item':
                    channel.items.append(self.__parse_item__(node2))
        return channellist