This file is no longer needed.
author: davyd <davyd>
Mon, 15 Mar 2004 02:38:33 +0000 (02:38 +0000)
committer: davyd <davyd>
Mon, 15 Mar 2004 02:38:33 +0000 (02:38 +0000)
XMLParse.py [deleted file]

diff --git a/XMLParse.py b/XMLParse.py
deleted file mode 100644 (file)
index 8ce8e40..0000000
+++ /dev/null
@@ -1,186 +0,0 @@
-#
-# XMLParse.py
-#
-# Parse arbitrary XML news streams into an object type
-# understandable by Planet UCC.
-#
-# (c) 2004, Davyd Madeley <[email protected]>
-#
-
-import sys, time
-from xml.dom.minidom import parseString
-
-class Blog:
-       def __init__(self):
-               self.blogTitle  = None
-               self.blogURL    = None
-               self.imageURL   = None
-               self.imageLink  = None
-               self.items      = []
-
-class BlogItem:
-       def __init__(self):
-               self.itemTitle  = None
-               self.itemDate   = None
-               self.itemURL    = None
-               self.contents   = None
-
-class XMLParse:
-       def __init__(self, XMLString):
-               # parse our XML file
-               self.dom        = parseString(XMLString)
-               self.bloglist   = None
-               # find out what sort of XML format we're dealing with
-               if self.dom.documentElement.tagName == 'rss':
-                       # this is some sort of RSS feed
-                       # find out what version
-                       if self.dom.documentElement.attributes.has_key('version'):
-                               version = self.dom.documentElement.attributes['version'].value
-                               if version == '2.0':
-                                       # this is an RSS2 document
-                                       self.news       = RSS2Parse(self.dom)
-                               else:
-                                       sys.stderr.write('DEBUG: XMLParse: Unknown RSS version %s\n' % version)
-                       else:
-                               sys.stderr.write('DEBUG: XMLParse: RSS document has no version information\n')
-               elif self.dom.documentElement.tagName == 'rdf:RDF':
-                       # this is an RDF document
-                       self.news       = RDFParse(self.dom)
-               elif self.dom.documentElement.tagName == 'feed':
-                       # this seems to be an Atom feed
-                       self.news       = AtomParse(self.dom)
-               else:
-                       sys.stderr.write('DEBUG: XMLParse: Unknown XML document \'%s\'\n' % self.dom.documentElement.tagName)
-                       
-       def parse(self):
-               "Return a list of Blog objects from the XML file we parsed"
-               # quick cache for XML parsing
-               if self.bloglist:
-                       return self.bloglist
-               else:
-                       self.bloglist   = self.news.parse()
-                       return self.bloglist
-
-class Parse:
-       "Generic class for parsing XML feeds"
-       def __init__(self, dom):
-               self.dom        = dom
-               self.root       = dom.documentElement
-
-       def __retrieve_value__(self, fromNode):
-               "Retrieve a value from between two nodes"
-               for node in fromNode.childNodes:
-                       if node.nodeType == 3:
-                               return node.nodeValue
-                       else:
-                               sys.stderr.write('DEBUG: Parse: Asked to retrieve value from wrong part of tree\n')
-                               return None
-
-class AtomParse(Parse):        
-       def parse(self):
-               channel = Blog()
-               for node in self.root.childNodes:
-                       if node.nodeType == 1 and node.tagName == 'title':
-                               channel.blogTitle       = self.__retrieve_value__(node)
-                       elif node.nodeType == 1 and node.tagName == 'link' and node.attributes.has_key('rel') and node.attributes['rel'].value == "alternate":
-                               if node.attributes.has_key('href'):
-                                       channel.blogURL         = node.attributes['href'].value
-                               else:
-                                       sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
-                       elif node.nodeType == 1 and node.tagName == 'entry':
-                               # create an item and add it to the list
-                               item    = BlogItem()
-                               channel.items.append(item)
-                               # handlers for tags
-                               for node2 in node.childNodes:
-                                       if node2.nodeType == 1 and node2.tagName == 'created':
-                                               date    = self.__retrieve_value__(node2)
-                                               try:
-                                                       item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')) + 28800
-                                               except:
-                                                       sys.stderr.write("DEBUG: AtomParse: time string %s is unparseable\n" % date)
-                                       elif node2.nodeType == 1 and node2.tagName == 'link' and node2.attributes.has_key('rel') and node2.attributes['rel'].value == 'alternate':
-                                               if node2.attributes.has_key('href'):
-                                                       item.itemURL    = node2.attributes['href'].value
-                                               else:
-                                                       sys.stderr.write('DEBUG: AtomParse: Could not find href for link, ignoring\n')
-                                       elif node2.nodeType == 1 and node2.tagName == 'title':
-                                               item.itemTitle  = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'summary':
-                                               for node3 in node2.childNodes:
-                                                       if node3.nodeType == 1 and node3.tagName == 'div':
-                                                               item.contents   = self.__retrieve_value__(node3)
-               return [channel]
-
-class RDFParse(Parse):
-       def parse(self):
-               channel = Blog()
-               for node in self.root.childNodes:
-                       if node.nodeType == 1 and node.tagName == 'channel':
-                               for node2 in node.childNodes:
-                                       if node2.nodeType == 1 and node2.tagName == 'title':
-                                               channel.blogTitle       = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'link':
-                                               channel.blogURL         = self.__retrieve_value__(node2)
-                       elif node.nodeType == 1 and node.tagName == 'item':
-                               item    = BlogItem()
-                               for node2 in node.childNodes:
-                                       if node2.nodeType == 1 and node2.tagName == 'title':
-                                               item.itemTitle          = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'link':
-                                               item.itemURL            = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'dc:date':
-                                               date                    = self.__retrieve_value__(node2)
-                                               try:
-                                                       item.itemDate   = time.mktime(time.strptime(date, '%Y-%m-%dT%H:%M:%S+08:00'))
-                                               except:
-                                                       sys.stderr.write("DEBUG: RDFParse: time string %s unparseable\n" % date)
-                                       elif node2.nodeType == 1 and node2.tagName == 'description':
-                                               item.contents           = self.__retrieve_value__(node2)
-                               channel.items.append(item)
-               return [channel]
-
-class RSS2Parse(Parse):
-       def __parse_item__(self, fromNode):
-               "Returns a BlogItem collected from fromNode"
-               item    = BlogItem()
-               for node in fromNode.childNodes:
-                       if node.nodeType == 1 and node.tagName == 'title':
-                               item.itemTitle  = self.__retrieve_value__(node)
-                       elif node.nodeType == 1 and node.tagName == 'pubDate':
-                               try:
-                                       item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S GMT')) + 28800
-                               except:
-                                       try:
-                                               item.itemDate   = time.mktime(time.strptime(self.__retrieve_value__(node), '%a, %d %b %Y %H:%M:%S +0000')) + 28800
-                                       except:
-                                               sys.stderr.write("DEBUG: RSS2Parse: time string %s unparseable\n" % self.__retrieve_value__(node))
-                       elif node.nodeType == 1 and node.tagName == 'link':
-                               item.itemURL    = self.__retrieve_value__(node)
-                       elif node.nodeType == 1 and node.tagName == 'description':
-                               item.contents   = self.__retrieve_value__(node)
-               return item
-
-       def parse(self):
-               "Returns a list of Blog objects for parsing into an arbitrary data format."
-               channellist     = []
-               for node in self.root.childNodes:
-                       if node.nodeType == 1 and node.tagName == 'channel':
-                               channel = Blog()
-                               channellist.append(channel)
-                               # populate channel with information from the blog
-                               for node2 in node.childNodes:
-                                       if node2.nodeType == 1 and node2.tagName == 'title':
-                                               channel.blogTitle       = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'link':
-                                               channel.blogURL         = self.__retrieve_value__(node2)
-                                       elif node2.nodeType == 1 and node2.tagName == 'image':
-                                               for node3 in node2.childNodes:
-                                                       if node3.nodeType == 1 and node3.tagName == 'url':
-                                                               channel.imageURL                = self.__retrieve_value__(node3)
-                                                       elif node3.nodeType == 1 and node3.tagName == 'link':
-                                                               channel.imageLink       = self.__retrieve_value__(node3)
-                                       elif node2.nodeType == 1 and node2.tagName == 'item':
-                                               item    = self.__parse_item__(node2)
-                                               channel.items.append(item)
-               return channellist

UCC git Repository :: git.ucc.asn.au