yet again
[planet-ucc.git] / XMLParse.py
index a3ef6ae..8ce8e40 100644 (file)
@@ -43,8 +43,14 @@ class XMLParse:
                                        sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
                        else:
                                sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
+               elif self.dom.documentElement.tagName == 'rdf:RDF':
+                       # this is an RDF document
+                       self.news       = RDFParse(self.dom)
+               elif self.dom.documentElement.tagName == 'feed':
+                       # this seems to be an Atom feed
+                       self.news       = AtomParse(self.dom)
                else:
-                       sys.stderr.write('DEBUG: XMLParse: Unknown XML document %s\n' % dom.documentElement.tagname)
+                       sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % self.dom.documentElement.tagName)
                        
        def parse(self):
                "Return a list of Blog objects from the XML file we parsed"
@@ -55,21 +61,86 @@ class XMLParse:
                        self.bloglist   = self.news.parse()
                        return self.bloglist
 
-class RSS2Parse:
+class Parse:
+       "Generic class for parsing XML feeds"
        def __init__(self, dom):
                self.dom        = dom
                self.root       = dom.documentElement
 
        def __retrieve_value__(self, fromNode):
-               "Returns the value from between two nodes, ie <node>text</node>"
+               "Retrieve a value from between two nodes"
                for node in fromNode.childNodes:
                        if node.nodeType == 3:
-                               # this is the information contained within our node
                                return node.nodeValue
                        else:
-                               sys.stderr.write('DEBUG: RSS2Parse: Asked to retrieve value from wrong part of tree\n')
+                               sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
                                return None
-       
+
+class AtomParse(Parse):        
+       def parse(self):
+               channel = Blog()
+               for node in self.root.childNodes:
+                       if node.nodeType == 1 and node.tagName == 'title':
+                               channel.blogTitle       = self.__retrieve_value__(node)
+                       elif node.nodeType == 1 and node.tagName == 'link' and node.attributes.has_key('rel') and node.attributes['rel'].value == "alternate":
+                               if node.attributes.has_key('href'):
+                                       channel.blogURL         = node.attributes['href'].value
+                               else:
+                                       sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
+                       elif node.nodeType == 1 and node.tagName == 'entry':
+                               # create an item and add it to the list
+                               item    = BlogItem()
+                               channel.items.append(item)
+                               # handlers for tags
+                               for node2 in node.childNodes:
+                                       if node2.nodeType == 1 and node2.tagName == 'created':
+                                               date    = self.__retrieve_value__(node2)
+                                               try:
+                                                       item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800
+                                               except:
+                                                       sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
+                                       elif node2.nodeType == 1 and node2.tagName == 'link' and node2.attributes.has_key('rel') and node2.attributes['rel'].value == 'alternate':
+                                               if node2.attributes.has_key('href'):
+                                                       item.itemURL    = node2.attributes['href'].value
+                                               else:
+                                                       sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
+                                       elif node2.nodeType == 1 and node2.tagName == 'title':
+                                               item.itemTitle  = self.__retrieve_value__(node2)
+                                       elif node2.nodeType == 1 and node2.tagName == 'summary':
+                                               for node3 in node2.childNodes:
+                                                       if node3.nodeType == 1 and node3.tagName == 'div':
+                                                               item.contents   = self.__retrieve_value__(node3)
+               return [channel]
+
+class RDFParse(Parse):
+       def parse(self):
+               channel = Blog()
+               for node in self.root.childNodes:
+                       if node.nodeType == 1 and node.tagName == 'channel':
+                               for node2 in node.childNodes:
+                                       if node2.nodeType == 1 and node2.tagName == 'title':
+                                               channel.blogTitle       = self.__retrieve_value__(node2)
+                                       elif node2.nodeType == 1 and node2.tagName == 'link':
+                                               channel.blogURL         = self.__retrieve_value__(node2)
+                       elif node.nodeType == 1 and node.tagName == 'item':
+                               item    = BlogItem()
+                               for node2 in node.childNodes:
+                                       if node2.nodeType == 1 and node2.tagName == 'title':
+                                               item.itemTitle          = self.__retrieve_value__(node2)
+                                       elif node2.nodeType == 1 and node2.tagName == 'link':
+                                               item.itemURL            = self.__retrieve_value__(node2)
+                                       elif node2.nodeType == 1 and node2.tagName == 'dc:date':
+                                               date                    = self.__retrieve_value__(node2)
+                                               try:
+                                                       item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00'))
+                                               except:
+                                                       sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
+                                       elif node2.nodeType == 1 and node2.tagName == 'description':
+                                               item.contents           = self.__retrieve_value__(node2)
+                               channel.items.append(item)
+               return [channel]
+
+class RSS2Parse(Parse):
        def __parse_item__(self, fromNode):
                "Returns a BlogItem collected from fromNode"
                item    = BlogItem()
@@ -78,12 +149,12 @@ class RSS2Parse:
                                item.itemTitle  = self.__retrieve_value__(node)
                        elif node.nodeType == 1 and node.tagName == 'pubDate':
                                try:
-                                       item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S %Z'))
+                                       item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S GMT')) + 28800
                                except:
                                        try:
-                                               item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000'))
+                                               item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000')) + 28800
                                        except:
-                                               sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % (self.__retrieve_value__(node)))
+                                               sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % self.__retrieve_value__(node))
                        elif node.nodeType == 1 and node.tagName == 'link':
                                item.itemURL    = self.__retrieve_value__(node)
                        elif node.nodeType == 1 and node.tagName == 'description':

UCC git Repository :: git.ucc.asn.au