X-Git-Url: https://git.ucc.asn.au/?p=planet-ucc.git;a=blobdiff_plain;f=XMLParse2.py;h=488a3443072c5aca543047bdd65f067b188bc2d7;hp=689fecb07b30bfa686d8902fde9a096de389a598;hb=2a563de3bcc327fa440671db368a261d70e9114b;hpb=081de18b6ab54348dab29dea74d374052a333947

diff --git a/XMLParse2.py b/XMLParse2.py
index 689fecb..488a344 100644
--- a/XMLParse2.py
+++ b/XMLParse2.py
@@ -13,8 +13,11 @@ import CacheHandler
 sys.path.insert(0, 'extra')
 import feedparser
 
+feedparser.USER_AGENT = "PlanetUCC/1.0b +http://planet.ucc.asn.au/ %s" % feedparser.USER_AGENT
+
 class Blog:
 	def __init__(self):
+		self.blogName = None
 		self.blogTitle = None
 		self.blogURL = None
 		self.feedURL = None
@@ -28,6 +31,7 @@ class BlogItem:
 		self.itemTitle = None
 		self.itemDate = None
 		self.itemURL = None
+		self.commentsURL = None
 		self.contents = None
 
 class XMLParse:
@@ -38,21 +42,21 @@ class XMLParse:
 	def parse(self):
 		"Return a single Blog object"
 		item = Blog()
-		if self.blogObject:
-			sys.stdout.write('Downloading feed %s...' % self.feedURL)
+		if self.blogObject and self.blogObject.cache:
+			sys.stdout.write('Downloading feed %s... ' % self.feedURL)
 			try:
 				data = feedparser.parse(self.feedURL, self.blogObject.cache.etag, self.blogObject.cache.date)
-				sys.stdout.write('done.\n')
+				# check to see what we got returned
+				if data['items'] == [] and data['channel'] == {}:
+					sys.stdout.write('cached.\n')
+					return self.blogObject
+				else:
+					sys.stdout.write('done.\n')
 			except:
 				sys.stdout.write('failed.\n')
-				raise
 				return None
-			# check to see what we got returned
-			if data['items'] == [] and data['channel'] == {}:
-				sys.stdout.write('Feed %s is upto date.\n' % self.feedURL)
-				return self.blogObject
 		else:
-			sys.stdout.write('Downloading feed from %s (no cache)...' % self.feedURL)
+			sys.stdout.write('Downloading feed (no cache) %s... ' % self.feedURL)
 			try:
 				data = feedparser.parse(self.feedURL)
 				sys.stdout.write('done.\n')
@@ -62,10 +66,17 @@ class XMLParse:
 		# create caching data
 		try:
 			cache = CacheHandler.CacheObject()
-			cache.etag = data['etag']
-			cache.date = data['modified']
+			try:
+				cache.etag = data['etag']
+			except:
+				cache.etag = None
+			try:
+				cache.date = data['modified']
+			except:
+				cache.date = None
 			item.cache = cache
 		except:
+			sys.stderr.write('DEBUG: XMLParse2: cache item generation failed\n')
 			item.cache = None
 		# parse the return of data into a blog
 		if data['channel'].has_key('title'):
@@ -76,6 +87,10 @@ class XMLParse:
 			item.blogURL = data['channel']['link']
 		else:
 			item.blogURL = self.feedURL
+		if data['feed'].has_key ('image') and data['feed']['image'].has_key ('url'):
+			item.imageURL = data['feed']['image']['url']
+		if data['feed'].has_key ('image') and data['feed']['image'].has_key ('link'):
+			item.imageLink = data['feed']['image']['link']
 		for entry in data['items']:
 			blogItem = BlogItem()
 			if entry.has_key('title'):
@@ -86,13 +101,20 @@ class XMLParse:
 				blogItem.itemURL = entry['link']
 			else:
 				blogItem.itemURL = item.blogURL
-			if entry.has_key('date_parsed'):
-				blogItem.itemDate = time.mktime(entry['date_parsed']) + 28800
+			if entry.has_key('modified_parsed'):
+				try: blogItem.itemDate = time.mktime(entry['modified_parsed']) + 28800
+				except: blogItem.itemDate = 0
 			else:
 				blogItem.itemDate = 0
-			if entry.has_key('description'):
+			if entry.has_key('content'):
+				# get the contents of the first item with a text/html type
+				# no feeds without a text/html type have been encountered in the wild, but who knows
+				blogItem.contents = [content['value'] for content in entry['content'] if content['type'] == 'text/html'][0]
+			elif entry.has_key('description'):
 				blogItem.contents = entry['description']
 			else:
 				blogItem.contents = '(entry could not be retrieved)'
+			if entry.has_key ('comments'):
+				blogItem.commentsURL = entry['comments']
 			item.items.append(blogItem)
 		return item
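
A few implementation notes on the patched parse() path. The cached-feed check relies on feedparser's conditional-GET behaviour: when the stored ETag and Last-Modified values are handed back and the server answers 304 Not Modified, the parsed result comes back with no items and no channel data. A minimal sketch of that pattern, assuming a hypothetical fetch_with_cache helper (the validator names mirror CacheHandler's etag/date fields; everything else here is illustrative, not part of the commit):

import feedparser

def fetch_with_cache(url, etag=None, modified=None):
	# Conditional fetch: hand the previous validators back to the server
	# so it can reply 304 Not Modified instead of resending the feed.
	data = feedparser.parse(url, etag=etag, modified=modified)
	# On a 304 the result carries no entries and no channel data, which
	# is exactly what parse() above tests with data['items'] == [] and
	# data['channel'] == {} before returning the cached blogObject.
	if data['items'] == [] and data['channel'] == {}:
		return None  # caller should keep using its cached copy
	# Servers are not obliged to send validators, hence the None
	# fallbacks in the patched cache-creation block.
	return data, data.get('etag'), data.get('modified')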
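The date handling converts feedparser's modified_parsed time tuple to a Unix timestamp; the +28800 constant is 8 * 60 * 60 seconds, a fixed UTC+8 shift (presumably Perth local time, though the patch does not say so). A guarded version of the same computation, with entry_timestamp being a hypothetical helper name:

import time

WST_OFFSET = 8 * 60 * 60  # 28800 s: fixed UTC+8 shift, as in the patch

def entry_timestamp(entry):
	# modified_parsed is a 9-field time tuple when present, but it can
	# be absent or malformed, so fall back to 0 exactly as parse() does.
	parsed = entry.get('modified_parsed')
	if not parsed:
		return 0
	try:
		return time.mktime(parsed) + WST_OFFSET
	except (TypeError, ValueError, OverflowError):
		return 0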
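Entry bodies are chosen in the order: first text/html part of content, then description, then a placeholder. One case the patch's list comprehension does not guard against is an entry whose content list contains no text/html part, where the trailing [0] would raise IndexError. A loop-based sketch of the same selection order that degrades gracefully instead (best_html_content is a hypothetical name, not in the commit):

def best_html_content(entry):
	# Prefer the first content element typed text/html, matching the
	# patch's list comprehension, then fall back the way parse() does.
	for content in entry.get('content', []):
		if content.get('type') == 'text/html':
			return content['value']
	if entry.has_key('description'):
		return entry['description']
	return '(entry could not be retrieved)'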