X-Git-Url: https://git.ucc.asn.au/?p=planet-ucc.git;a=blobdiff_plain;f=XMLParse.py;h=8ce8e401ea7ef6d33aafb311ad6f8374920142a6;hp=a3ef6ae8a6333599591ba2aa8b3efff68b94d50e;hb=52296aaabfab3fd89036e1acca23cd26d2a00173;hpb=b2d7ed4b39a5a03fb8f264e26b0d04a502928ddf

diff --git a/XMLParse.py b/XMLParse.py
index a3ef6ae..8ce8e40 100644
--- a/XMLParse.py
+++ b/XMLParse.py
@@ -43,8 +43,14 @@ class XMLParse:
                     sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
             else:
                 sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
+        elif self.dom.documentElement.tagName == 'rdf:RDF':
+            # this is an RDF document
+            self.news = RDFParse(self.dom)
+        elif self.dom.documentElement.tagName == 'feed':
+            # this seems to be an Atom feed
+            self.news = AtomParse(self.dom)
         else:
-            sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % dom.documentElement.tagname)
+            sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % self.dom.documentElement.tagName)
 
     def parse(self):
         "Return a list of Blog objects from the XML file we parsed"
@@ -55,21 +61,86 @@ class XMLParse:
         self.bloglist = self.news.parse()
         return self.bloglist
 
-class RSS2Parse:
+class Parse:
+    "Generic class for parsing XML feeds"
     def __init__(self, dom):
         self.dom = dom
         self.root = dom.documentElement
 
     def __retrieve_value__(self, fromNode):
-        "Returns the value from between two nodes, ie text"
+        "Retrieve a value from between two nodes"
         for node in fromNode.childNodes:
             if node.nodeType == 3:
-                # this is the information contained within our node
                 return node.nodeValue
             else:
-                sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
+                sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
         return None
-    
+
+class AtomParse(Parse):
+    def parse(self):
+        channel = Blog()
+        for node in self.root.childNodes:
+            if node.nodeType == 1 and node.tagName == 'title':
+                channel.blogTitle = self.__retrieve_value__(node)
+            elif node.nodeType == 1 and node.tagName == 'link' and node.attributes.has_key('rel') and node.attributes['rel'].value == "alternate":
+                if node.attributes.has_key('href'):
+                    channel.blogURL = node.attributes['href'].value
+                else:
+                    sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
+            elif node.nodeType == 1 and node.tagName == 'entry':
+                # create an item and add it to the list
+                item = BlogItem()
+                channel.items.append(item)
+                # handlers for tags
+                for node2 in node.childNodes:
+                    if node2.nodeType == 1 and node2.tagName == 'created':
+                        date = self.__retrieve_value__(node2)
+                        try:
+                            item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800
+                        except:
+                            sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
+                    elif node2.nodeType == 1 and node2.tagName == 'link' and node2.attributes.has_key('rel') and node2.attributes['rel'].value == 'alternate':
+                        if node2.attributes.has_key('href'):
+                            item.itemURL = node2.attributes['href'].value
+                        else:
+                            sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
+                    elif node2.nodeType == 1 and node2.tagName == 'title':
+                        item.itemTitle = self.__retrieve_value__(node2)
+                    elif node2.nodeType == 1 and node2.tagName == 'summary':
+                        for node3 in node2.childNodes:
+                            if node3.nodeType == 1 and node3.tagName == 'div':
+                                item.contents = self.__retrieve_value__(node3)
+        return [channel]
+
+class RDFParse(Parse):
+    def parse(self):
+        channel = Blog()
+        for node in self.root.childNodes:
+            if node.nodeType == 1 and node.tagName == 'channel':
+                for node2 in node.childNodes:
+                    if node2.nodeType == 1 and node2.tagName == 'title':
+                        channel.blogTitle = self.__retrieve_value__(node2)
+                    elif node2.nodeType == 1 and node2.tagName == 'link':
+                        channel.blogURL = self.__retrieve_value__(node2)
+            elif node.nodeType == 1 and node.tagName == 'item':
+                item = BlogItem()
+                for node2 in node.childNodes:
+                    if node2.nodeType == 1 and node2.tagName == 'title':
+                        item.itemTitle = self.__retrieve_value__(node2)
+                    elif node2.nodeType == 1 and node2.tagName == 'link':
+                        item.itemURL = self.__retrieve_value__(node2)
+                    elif node2.nodeType == 1 and node2.tagName == 'dc:date':
+                        date = self.__retrieve_value__(node2)
+                        try:
+                            item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00'))
+                        except:
+                            sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
+                    elif node2.nodeType == 1 and node2.tagName == 'description':
+                        item.contents = self.__retrieve_value__(node2)
+                channel.items.append(item)
+        return [channel]
+
+class RSS2Parse(Parse):
     def __parse_item__(self, fromNode):
         "Returns a BlogItem collected from fromNode"
         item = BlogItem()
@@ -78,12 +149,12 @@ class RSS2Parse:
                 item.itemTitle = self.__retrieve_value__(node)
             elif node.nodeType == 1 and node.tagName == 'pubDate':
                 try:
-                    item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S %Z'))
+                    item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S GMT')) + 28800
                 except:
                     try:
-                        item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000'))
+                        item.itemDate = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000')) + 28800
                     except:
-                        sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % (self.__retrieve_value__(node)))
+                        sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % self.__retrieve_value__(node))
             elif node.nodeType == 1 and node.tagName == 'link':
                 item.itemURL = self.__retrieve_value__(node)
             elif node.nodeType == 1 and node.tagName == 'description':