Change feed URL.
[planet-ucc.git] / XMLParse2.py
1 #
2 # XMLParse2.py
3 #
4 # Parse arbitrary XML news streams into an object type
5 # understandable by Planet UCC.
6 # Now uses feedparser to parse 9 different types of RSS _and_ Atom
7 #
8 # (c) 2004, Davyd Madeley <[email protected]>
9 #
10
11 import sys, time
12 import CacheHandler
13 sys.path.insert(0, 'extra')
14 import feedparser
15
# Prepend the Planet UCC identifier to feedparser's existing USER_AGENT
# string, so the original agent text is kept as a suffix.
feedparser.USER_AGENT = "PlanetUCC/1.0b +http://planet.ucc.asn.au/ %s" % feedparser.USER_AGENT
17
class Blog:
        """Plain record describing one blog and the entries read from its feed.

        Every field starts out as None (or an empty list for ``items``) and is
        meant to be filled in by whoever builds the object.
        """

        def __init__(self):
                # Descriptive fields: names/titles and the blog's own URLs.
                self.blogName = self.blogTitle = self.blogURL = None
                # Feed location and optional image (its URL and the link it points to).
                self.feedURL = self.imageURL = self.imageLink = None
                # Each instance gets its own fresh list of entries.
                self.items = []
                # Cache information attached to this blog; None when there is none.
                self.cache = None
28
class BlogItem:
        """Plain record describing a single entry of a blog.

        All fields are initialised to None; the object carries no behaviour.
        """

        # Attribute names, in the order they are created on each instance:
        # title, date, entry URL, comments URL and entry body.
        _FIELDS = ('itemTitle', 'itemDate', 'itemURL', 'commentsURL', 'contents')

        def __init__(self):
                for field in self._FIELDS:
                        setattr(self, field, None)
36
class XMLParse:
        """Download one feed with feedparser and convert the result into a Blog.

        URL        -- address of the feed to download.
        blogObject -- Blog from a previous run, or a false value.  When it
                      carries a cache object, that object's ``etag`` and
                      ``date`` are handed to feedparser for the download.
        """

        # Seconds added to time.mktime() of an entry's 'modified_parsed' value.
        # NOTE(review): 28800 s is 8 hours; this looks like a fixed timezone
        # correction -- confirm before changing it.
        _DATE_OFFSET = 28800

        # Content types accepted for an entry body, most preferred first.
        _HTML_TYPES = ('text/html', 'application/xhtml+xml')

        def __init__(self, URL, blogObject):
                self.feedURL    = URL
                self.blogObject = blogObject

        def parse(self):
                """Return a single Blog object.

                Returns the previously supplied blogObject unchanged when the
                conditional download comes back with no items and an empty
                channel, None when the download raises, and otherwise a newly
                built Blog.
                """
                previous = self.blogObject
                if previous and previous.cache:
                        sys.stdout.write('Downloading feed %s... ' % self.feedURL)
                        try:
                                data = feedparser.parse(self.feedURL, previous.cache.etag, previous.cache.date)
                                # an empty result means there is nothing new to use
                                unchanged = data['items'] == [] and data['channel'] == {}
                        except Exception:
                                sys.stdout.write('failed.\n')
                                return None
                        if unchanged:
                                sys.stdout.write('cached.\n')
                                return previous
                        sys.stdout.write('done.\n')
                else:
                        sys.stdout.write('Downloading feed (no cache) %s... ' % self.feedURL)
                        try:
                                data = feedparser.parse(self.feedURL)
                        except Exception:
                                sys.stdout.write('failed.\n')
                                return None
                        sys.stdout.write('done.\n')

                item            = Blog()
                item.cache      = self._makeCache(data)

                # parse the return of data into a blog
                channel         = data['channel']
                item.blogTitle  = channel.get('title', '(Unknown)')
                item.blogURL    = channel.get('link', self.feedURL)
                image           = data['feed'].get('image')
                if image is not None:
                        # both stay None when the image lacks the key
                        item.imageURL   = image.get('url')
                        item.imageLink  = image.get('link')
                for entry in data['items']:
                        item.items.append(self._makeItem(entry, item.blogURL))
                return item

        def _valueOrNone(self, mapping, key):
                "Return mapping[key], or None when the lookup fails."
                try:
                        return mapping[key]
                except Exception:
                        return None

        def _makeCache(self, data):
                "Build the cache object (etag and modified date) for a download, or None on failure."
                try:
                        cache           = CacheHandler.CacheObject()
                        cache.etag      = self._valueOrNone(data, 'etag')
                        cache.date      = self._valueOrNone(data, 'modified')
                        return cache
                except Exception:
                        sys.stderr.write('DEBUG: XMLParse2: cache item generation failed\n')
                        return None

        def _makeItem(self, entry, blogURL):
                "Convert one feed entry into a BlogItem; blogURL is the fallback link."
                blogItem                = BlogItem()
                blogItem.itemTitle      = entry.get('title', '(Untitled)')
                blogItem.itemURL        = entry.get('link', blogURL)
                blogItem.itemDate       = self._entryDate(entry)
                blogItem.contents       = self._entryContents(entry)
                blogItem.commentsURL    = entry.get('comments')
                return blogItem

        def _entryDate(self, entry):
                "Return the entry's modification time as a number, or 0 when it is missing or unusable."
                try:
                        return time.mktime(entry['modified_parsed']) + self._DATE_OFFSET
                except Exception:
                        return 0

        def _entryContents(self, entry):
                "Return the entry body: preferred content type first, then the description."
                # An entry may carry 'content' without any text/html element;
                # indexing the filtered list directly used to raise IndexError
                # and abort the whole feed, so search and fall through instead.
                candidates = entry.get('content') or []
                for wanted in self._HTML_TYPES:
                        for content in candidates:
                                if content.get('type') == wanted and content.get('value') is not None:
                                        return content['value']
                return entry.get('description', '(entry could not be retrieved)')

UCC git Repository :: git.ucc.asn.au