X-Git-Url: https://git.ucc.asn.au/?p=planet-ucc.git;a=blobdiff_plain;f=XMLParse.py;h=8ce8e401ea7ef6d33aafb311ad6f8374920142a6;hp=a50925fd1b8fc08ebb615d55e11ab44533b066ff;hb=52296aaabfab3fd89036e1acca23cd26d2a00173;hpb=18f48e511713b1d2d168585d656ebfe5f7c0f2ea diff --git a/XMLParse.py b/XMLParse.py index a50925f..8ce8e40 100644 --- a/XMLParse.py +++ b/XMLParse.py @@ -46,8 +46,11 @@ class XMLParse: elif self.dom.documentElement.tagName == 'rdf:RDF': # this is an RDF document self.news = RDFParse(self.dom) + elif self.dom.documentElement.tagName == 'feed': + # this seems to be an Atom feed + self.news = AtomParse(self.dom) else: - sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % dom.documentElement.tagname) + sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % self.dom.documentElement.tagName) def parse(self): "Return a list of Blog objects from the XML file we parsed" @@ -58,19 +61,58 @@ class XMLParse: self.bloglist = self.news.parse() return self.bloglist -class RDFParse: +class Parse: + "Generic class for parsing XML feeds" def __init__(self, dom): self.dom = dom self.root = dom.documentElement - + def __retrieve_value__(self, fromNode): + "Retrieve a value from between two nodes" for node in fromNode.childNodes: if node.nodeType == 3: return node.nodeValue else: - sys.stderr.write('DEBUG: RDFParse: Asked to retrieve value from wrong part of tree\n') + sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n') return None - + +class AtomParse(Parse): + def parse(self): + channel = Blog() + for node in self.root.childNodes: + if node.nodeType == 1 and node.tagName == 'title': + channel.blogTitle = self.__retrieve_value__(node) + elif node.nodeType == 1 and node.tagName == 'link' and node.attributes.has_key('rel') and node.attributes['rel'].value == "alternate": + if node.attributes.has_key('href'): + channel.blogURL = node.attributes['href'].value + else: + sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n') + elif node.nodeType == 1 and node.tagName == 'entry': + # create an item and add it to the list + item = BlogItem() + channel.items.append(item) + # handlers for tags + for node2 in node.childNodes: + if node2.nodeType == 1 and node2.tagName == 'created': + date = self.__retrieve_value__(node2) + try: + item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800 + except: + sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date) + elif node2.nodeType == 1 and node2.tagName == 'link' and node2.attributes.has_key('rel') and node2.attributes['rel'].value == 'alternate': + if node2.attributes.has_key('href'): + item.itemURL = node2.attributes['href'].value + else: + sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n') + elif node2.nodeType == 1 and node2.tagName == 'title': + item.itemTitle = self.__retrieve_value__(node2) + elif node2.nodeType == 1 and node2.tagName == 'summary': + for node3 in node2.childNodes: + if node3.nodeType == 1 and node3.tagName == 'div': + item.contents = self.__retrieve_value__(node3) + return [channel] + +class RDFParse(Parse): def parse(self): channel = Blog() for node in self.root.childNodes: @@ -90,7 +132,7 @@ class RDFParse: elif node2.nodeType == 1 and node2.tagName == 'dc:date': date = self.__retrieve_value__(node2) try: - item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+07:00')) + 3600 + item.itemDate = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00')) except: sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date) elif node2.nodeType == 1 and node2.tagName == 'description': @@ -98,21 +140,7 @@ class RDFParse: channel.items.append(item) return [channel] -class RSS2Parse: - def __init__(self, dom): - self.dom = dom - self.root = dom.documentElement - - def __retrieve_value__(self, fromNode): - "Returns the value from between two nodes, ie text" - for node in fromNode.childNodes: - if node.nodeType == 3: - # this is the information contained within our node - return node.nodeValue - else: - sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n') - return None - +class RSS2Parse(Parse): def __parse_item__(self, fromNode): "Returns a BlogItem collected from fromNode" item = BlogItem()