#
# XMLParse2.py
#
# Parse arbitrary XML news streams into an object type
# understandable by Planet UCC.
# Now uses feedparser to parse 9 different types of RSS _and_ Atom
#
# (c) 2004, Davyd Madeley
#

import sys
import time

import CacheHandler

# feedparser ships in the project's 'extra' directory, not site-packages.
sys.path.insert(0, 'extra')
import feedparser


class Blog:
    """A parsed feed: channel-level metadata plus a list of BlogItem entries."""

    def __init__(self):
        self.blogTitle = None   # channel title, '(Unknown)' if the feed has none
        self.blogURL = None     # human-facing link for the blog
        self.feedURL = None     # URL the feed was fetched from
        self.imageURL = None
        self.imageLink = None
        self.items = []         # list of BlogItem, in feed order
        self.cache = None       # CacheHandler.CacheObject with ETag/date, or None


class BlogItem:
    """A single entry (post) within a Blog."""

    def __init__(self):
        self.itemTitle = None   # '(Untitled)' when the entry has no title
        self.itemDate = None    # seconds since the epoch; 0 when unparseable
        self.itemURL = None     # falls back to the blog URL when absent
        self.contents = None    # entry body/description


class XMLParse:
    """Fetch a feed URL with feedparser and convert it into a Blog."""

    def __init__(self, URL, blogObject):
        self.feedURL = URL
        # Previously parsed Blog (carrying cache validators), or None on a
        # first-time fetch.
        self.blogObject = blogObject

    def parse(self):
        """Return a single Blog object.

        When a cached blogObject with usable validators is available, a
        conditional request is made; if the feed is unchanged the cached
        object is returned as-is. Returns None when the download fails.
        """
        item = Blog()
        # A conditional fetch needs both a previous Blog AND its cache data.
        # BUG FIX: the original only tested self.blogObject, then crashed on
        # blogObject.cache.etag whenever cache was None (which this very
        # method produces for feeds without validators).
        if self.blogObject and self.blogObject.cache:
            sys.stdout.write('Downloading feed %s...' % self.feedURL)
            try:
                data = feedparser.parse(self.feedURL,
                                        self.blogObject.cache.etag,
                                        self.blogObject.cache.date)
                sys.stdout.write('done.\n')
            except Exception:
                sys.stdout.write('failed.\n')
                # BUG FIX: the original did `raise` followed by an
                # unreachable `return None`; returning None matches the
                # no-cache branch and lets callers keep the cached copy.
                return None
            # feedparser reports an HTTP 304 (not modified) as an empty
            # result: no items and an empty channel dict.
            if data['items'] == [] and data['channel'] == {}:
                sys.stdout.write('Feed %s is upto date.\n' % self.feedURL)
                return self.blogObject
        else:
            sys.stdout.write('Downloading feed from %s (no cache)...'
                             % self.feedURL)
            try:
                data = feedparser.parse(self.feedURL)
                sys.stdout.write('done.\n')
            except Exception:
                sys.stdout.write('failed.\n')
                return None
        # Remember the validators so the next run can make a conditional
        # request. Servers that send neither ETag nor Last-Modified raise
        # KeyError here (AttributeError guards odd feedparser results).
        try:
            cache = CacheHandler.CacheObject()
            cache.etag = data['etag']
            cache.date = data['modified']
            item.cache = cache
        except (KeyError, AttributeError):
            item.cache = None
        # Channel-level metadata, with fallbacks for sparse feeds.
        channel = data['channel']
        item.blogTitle = channel.get('title', '(Unknown)')
        item.blogURL = channel.get('link', self.feedURL)
        # Convert every feedparser entry into a BlogItem.
        for entry in data['items']:
            blogItem = BlogItem()
            blogItem.itemTitle = entry.get('title', '(Untitled)')
            blogItem.itemURL = entry.get('link', item.blogURL)
            if 'date_parsed' in entry:
                # date_parsed is a struct_time in local time; mktime gives
                # seconds since the epoch for sorting.
                blogItem.itemDate = time.mktime(entry['date_parsed'])
            else:
                blogItem.itemDate = 0
            blogItem.contents = entry.get('description',
                                          '(entry could not be retrieved)')
            item.items.append(blogItem)
        return item