From: davyd
Date: Mon, 9 Feb 2004 05:55:45 +0000 (+0000)
Subject: See Changelog
X-Git-Url: https://git.ucc.asn.au/?a=commitdiff_plain;h=783caef083f5dcafaed71dfa3f3a8035109717e9;p=planet-ucc.git

See Changelog
---

diff --git a/CacheHandler.py b/CacheHandler.py
new file mode 100644
index 0000000..cc5c87b
--- /dev/null
+++ b/CacheHandler.py
@@ -0,0 +1,37 @@
+#
+# CacheHandler.py
+#
+# classes for dealing with object cache
+#
+# (c) 2004, Davyd Madeley
+#
+
+import dircache, cPickle, sys, os
+
+class CacheHandler:
+	def __create_name__(self, title, feed):
+		return "%s_%s.cache" % (title.replace(' ', '_'), feed.replace(' ', '_').replace('http://', '').replace('/', '_'))
+
+	def storeBlog(self, blog):
+		name = self.__create_name__(blog.blogTitle, blog.feedURL)
+		# write the blog to disk
+		try:
+			cPickle.dump(blog, open(os.path.join('cache', name), 'w'))
+		except:
+			sys.stderr.write('DEBUG: CacheHandler: Item could not be written to cache\n')
+
+	def getBlog(self, title, feed):
+		name = self.__create_name__(title, feed)
+		# attempt to read the blog from disk
+		try:
+			blog = cPickle.load(open(os.path.join('cache', name)))
+		except:
+			blog = None
+			sys.stderr.write('DEBUG: CacheHandler: Could not read item from cache\n')
+		return blog
+
+class CacheObject:
+	"Stores Blog objects"
+	def __init__(self):
+		self.etag = None
+		self.date = None
diff --git a/Changelog b/Changelog
new file mode 100644
index 0000000..e977dce
--- /dev/null
+++ b/Changelog
@@ -0,0 +1,76 @@
+2004-02-09
+==========
+	* Added XMLParse2.py
+	  XMLParse2 is a wrapper to feedparser by Mark Pilgrim.
+	  This required a slight change to the base API, as
+	  feedparser does the downloading for us. According
+	  to [TRS], using feedparser and a good caching
+	  system will drop our download requirement significantly.
+	* Added extra/feedparser.py
+	  Mark Pilgrim's feedparser version 3.0-beta-14 from
+	  http://diveintomark.org/projects/feed_parser/
+	* Added CacheHandler.py
+	  CacheHandler and CacheObject are objects for dealing with
+	  the caching of blog items. It will allow us to store
+	  preparsed blogs in cache/ for retrieval if a new download
+	  proves to be unnecessary.
+	* planet.css
+	  Added underline for date/entry links, because [TRS] did not
+	  even realise they were links. I'm sure not underlining links
+	  is a big usability faux pas.
+	  Added default link style for .item
+	* sidebar.html
+	  Changed the text slightly.
+	* XMLWriter.py
+	  Added a (feed) link to the "feeds" section of the sidebar.
+	  This links to the actual XML feed we are syndicating.
+	* update-planet
+	  Modifications to include caching, as well as changes to the
+	  API used by XMLParse2; this breaks compatibility with XMLParse
+	  (the original).
+	* Added Changelog
+	  This file. Mentions changed stuff.
+	* Added faq.html
+	  This should be self-explanatory, really.
+
+2004-02-08
+==========
+	* crontab
+	  Turned runtime down to 10 minutes.
+	* feedlist
+	  Added more feeds.
+	* XMLParse.py
+	  Added Atom support.
+	  Bug fixes.
+	* XMLWriter.py
+	  Bug fixes.
+	* sidebar.html
+	  Changed text.
+
+2004-02-07
+==========
+	* Added footer.html
+	  Footer on each page.
+	* Added header.html
+	  Header on each page.
+	* Added icon.png
+	  The icon that appears in the favicon spot.
+	* Added title.png
+	  The title "Planet UCC"
+	* Added ucc.png
+	  The UCC "Sun" logo (appears on the right hand side).
+	* Added crontab
+	  The crontab used on billy.
+	* Added feedlist
+	  The list of feeds we are downloading.
+	* Added planet.css
+	  The Planet UCC stylesheet.
+	* Added sidebar.html
+	  The text appearing in the sidebar.
+	* Added update-planet
+	  The wrapper that puts a whole planet together.
+	* Added XMLParse.py
+	  Generic abstract XML Parser (also contains Blog objects).
+	  Currently supports RSSv2 and RDF.
+	* Added XMLWriter.py
+	  Generic abstract XML Writer. Currently supports XHTML.
diff --git a/XMLParse2.py b/XMLParse2.py
new file mode 100644
index 0000000..1cec668
--- /dev/null
+++ b/XMLParse2.py
@@ -0,0 +1,98 @@
+#
+# XMLParse2.py
+#
+# Parse arbitrary XML news streams into an object type
+# understandable by Planet UCC.
+# Now uses feedparser to parse 9 different types of RSS _and_ Atom
+#
+# (c) 2004, Davyd Madeley
+#
+
+import sys, time
+import CacheHandler
+sys.path.insert(0, 'extra')
+import feedparser
+
+class Blog:
+	def __init__(self):
+		self.blogTitle = None
+		self.blogURL = None
+		self.feedURL = None
+		self.imageURL = None
+		self.imageLink = None
+		self.items = []
+		self.cache = None
+
+class BlogItem:
+	def __init__(self):
+		self.itemTitle = None
+		self.itemDate = None
+		self.itemURL = None
+		self.contents = None
+
+class XMLParse:
+	def __init__(self, URL, blogObject):
+		self.feedURL = URL
+		self.blogObject = blogObject
+
+	def parse(self):
+		"Return a single Blog object"
+		item = Blog()
+		if self.blogObject:
+			sys.stdout.write('Downloading feed %s...' % self.feedURL)
+			try:
+				data = feedparser.parse(self.feedURL, self.blogObject.cache.etag, self.blogObject.cache.date)
+				sys.stdout.write('done.\n')
+			except:
+				sys.stdout.write('failed.\n')
+				raise
+				return None
+			# check to see what we got returned
+			if data['items'] == [] and data['channel'] == {}:
+				sys.stdout.write('Feed %s is up to date.\n' % self.feedURL)
+				return self.blogObject
+		else:
+			sys.stdout.write('Downloading feed from %s (no cache)...' % self.feedURL)
+			try:
+				data = feedparser.parse(self.feedURL)
+				sys.stdout.write('done.\n')
+			except:
+				sys.stdout.write('failed.\n')
+				return None
+		# create caching data
+		try:
+			cache = CacheHandler.CacheObject()
+			cache.etag = data['etag']
+			cache.date = data['modified']
+			item.cache = cache
+		except:
+			item.cache = None
+		# parse the return of data into a blog
+		if data['channel'].has_key('title'):
+			item.blogTitle = data['channel']['title']
+		else:
+			item.blogTitle = '(Unknown)'
+		if data['channel'].has_key('link'):
+			item.blogURL = data['channel']['link']
+		else:
+			item.blogURL = self.feedURL
+		for entry in data['items']:
+			blogItem = BlogItem()
+			if entry.has_key('title'):
+				blogItem.itemTitle = entry['title']
+			else:
+				blogItem.itemTitle = '(Untitled)'
+			if entry.has_key('link'):
+				blogItem.itemURL = entry['link']
+			else:
+				blogItem.itemURL = item.blogURL
+			if entry.has_key('date_parsed'):
+				blogItem.itemDate = time.mktime(entry['date_parsed'])
+			else:
+				blogItem.itemDate = 0
+			if entry.has_key('description'):
+				blogItem.contents = entry['description']
+			else:
+				blogItem.contents = '(entry could not be retrieved)'
+			item.items.append(blogItem)
+		return item
diff --git a/XMLWriter.py b/XMLWriter.py
index 6f39119..4f13913 100644
--- a/XMLWriter.py
+++ b/XMLWriter.py
@@ -135,7 +135,7 @@ class XHTMLWriter:
 		output += '

Feeds

\n' output += '

\n' for blog in self.parent.blogs: - output += '%s
\n' % (blog.blogURL, blog.blogTitle) + output += '%s (feed)
\n' % (blog.blogURL, blog.blogTitle, blog.feedURL) output += '

\n' output += '\n' output += '
\n' diff --git a/extra/feedparser.py b/extra/feedparser.py new file mode 100644 index 0000000..024194e --- /dev/null +++ b/extra/feedparser.py @@ -0,0 +1,1770 @@ +#!/usr/bin/env python +"""Universal feed parser + +Visit http://diveintomark.org/projects/feed_parser/ for the latest version + +Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds + +Things it handles that choke other parsers: +- bastard combinations of RSS 0.9x and RSS 1.0 +- illegal 8-bit XML characters +- naked and/or invalid HTML in description +- content:encoded, xhtml:body, fullitem +- guid +- elements in non-standard namespaces or non-default namespaces +- multiple content items per entry (Atom) +- multiple links per entry (Atom) + +Other features: +- resolves relative URIs in some elements + - uses xml:base to define base URI + - uses URI of feed if no xml:base is given + - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri +- resolves relative URIs within embedded markup + - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris +- sanitizes embedded markup in some elements + - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements + - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes + - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup + - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = [] +- optionally tidies embedded markup + - fixes malformed HTML + - converts to XHTML + - converts character entities to numeric entities + - requires mxTidy + +Required: Python 2.1 or later +Recommended: Python 2.3 or later +""" + +__version__ = "3.0-beta-14" +__author__ = "Mark Pilgrim " +__copyright__ = "Copyright 2002-4, Mark Pilgrim" +__contributors__ = ["Jason Diamond ", + "John Beimler ", + "Fazal Majid "] +__license__ = "Python" +_debug = 0 + +# if you are embedding feedparser in a larger application, you should change this to your application name and URL +USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "") + +# ---------- required modules (should come with any Python distribution) ---------- +import sgmllib, re, sys, copy, urlparse, time, rfc822 +try: + from cStringIO import StringIO as _StringIO +except: + from StringIO import StringIO as _StringIO + +# ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- + +# gzip is included with most Python distributions, but may not be available if you compiled your own +try: + import gzip +except: + gzip = None + +# timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers. +# Python 2.3 now has this functionality available in the standard socket library, so under +# 2.3 you don't need to install anything. +import socket +if hasattr(socket, 'setdefaulttimeout'): + socket.setdefaulttimeout(10) +else: + try: + import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py + timeoutsocket.setDefaultSocketTimeout(10) + except ImportError: + pass +import urllib2 + +# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc. 
+# this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class +try: + from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html +except: + _mxtidy = None + +# If a real XML parser is available, feedparser will attempt to use it. feedparser works +# with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python +# distribution does not come with an XML parser (such as Mac OS X 10.2 and some versions of +# FreeBSD), feedparser will just fall back on regex-based parsing. If XML libraries are +# available but the feed turns out not to be well-formed XML, feedparser will fall back +# on regex-based parsing and set the "bozo" bit in the results to indicate that the feed +# author is a bozo who can't generate well-formed XML. The two advantages of using a real +# XML parser are (1) Unicode support, and (2) to get people to stop yelling at me for not +# using one. +try: + import xml.sax + from xml.sax.saxutils import escape as xmlescape + _XML_AVAILABLE = 1 +except: + _XML_AVAILABLE = 0 + def xmlescape(data): + data = data.replace("&", "&") + data = data.replace(">", ">") + data = data.replace("<", "<") + return data + +# base64 support for Atom feeds that contain embedded binary data +try: + import base64, binascii +except: + base64 = binascii = None + +# ---------- don't touch these ---------- +sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +sgmllib.special = re.compile('" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0) + + # match namespaces + try: + prefix, suffix = tag.split(':', 1) + except ValueError: + prefix, suffix = '', tag + prefix = self.namespacemap.get(prefix, prefix) + if prefix: + prefix = prefix + '_' + + # call special handler (if defined) or default handler + methodname = '_start_' + prefix + suffix + try: + method = getattr(self, methodname) + return method(attrsD) + except AttributeError: + return self.push(prefix + suffix, 1) + + def unknown_endtag(self, tag): + if _debug: sys.stderr.write('end %s\n' % tag) + # match namespaces + try: + prefix, suffix = tag.split(':', 1) + except ValueError: + prefix, suffix = '', tag + prefix = self.namespacemap.get(prefix, prefix) + if prefix: + prefix = prefix + '_' + + # call special handler (if defined) or default handler + methodname = '_end_' + prefix + suffix + try: + method = getattr(self, methodname) + method() + except AttributeError: + self.pop(prefix + suffix) + + # track inline content + if self.incontent and self.contentparams.get('mode') == 'escaped': + # element declared itself as escaped markup, but it isn't really + self.contentparams['mode'] = 'xml' + if self.incontent and self.contentparams.get('mode') == 'xml': + tag = tag.split(':')[-1] + self.handle_data("" % tag, escape=0) + + # track xml:base and xml:lang going out of scope + if self.basestack: + self.basestack.pop() + if self.basestack and self.basestack[-1]: + baseuri = self.basestack[-1] + if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri) + self.baseuri = baseuri + if self.langstack: + lang = self.langstack.pop() + if lang: + self.lang = lang + + def handle_charref(self, ref): + # called for each character reference, e.g. for " ", ref will be "160" + # Reconstruct the original character reference. + if not self.elementstack: return + text = "&#%s;" % ref + self.elementstack[-1][2].append(text) + + def handle_entityref(self, ref): + # called for each entity reference, e.g. for "©", ref will be "copy" + # Reconstruct the original entity reference. 
+ if not self.elementstack: return + text = "&%s;" % ref + self.elementstack[-1][2].append(text) + + def handle_data(self, text, escape=1): + # called for each block of plain text, i.e. outside of any tag and + # not containing any character or entity references + if not self.elementstack: return + if escape and self.contentparams.get('mode') == 'xml': + text = xmlescape(text) + self.elementstack[-1][2].append(text) + + def handle_comment(self, text): + # called for each comment, e.g. + pass + + def handle_pi(self, text): + # called for each processing instruction, e.g. + pass + + def handle_decl(self, text): + # called for the DOCTYPE, if present, e.g. + # + if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'): + self.version = 'rss091n' + + _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match + def _scan_name(self, i, declstartpos): + rawdata = self.rawdata + n = len(rawdata) + if i == n: + return None, -1 + m = self._new_declname_match(rawdata, i) + if m: + s = m.group() + name = s.strip() + if (i + len(s)) == n: + return None, -1 # end of buffer + return name.lower(), m.end() + else: + self.updatepos(declstartpos, i) + self.error("expected name token") + + def parse_declaration(self, i): + # override internal declaration handler to handle CDATA blocks + if _debug: sys.stderr.write("entering parse_declaration\n") + if re.search(r'^', self.rawdata[i:]): + if _debug: sys.stderr.write("found Netscape DOCTYPE\n") + self.version = 'rss091n' + if self.rawdata[i:i+9] == '', i) + if k == -1: k = len(self.rawdata) + self.handle_data(xmlescape(self.rawdata[i+9:k]), 0) + return k+3 + else: + k = self.rawdata.find('>', i) + return k+1 + + def trackNamespace(self, prefix, uri): + if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: + self.version = 'rss090' + if not prefix: return + if uri.find('backend.userland.com/rss') <> -1: + # match any backend.userland.com namespace + uri = 'http://backend.userland.com/rss' + if self.namespaces.has_key(uri): + self.namespacemap[prefix] = self.namespaces[uri] + + def resolveURI(self, uri): + return urlparse.urljoin(self.baseuri or '', uri) + + def decodeEntities(self, element, data): + if self.contentparams.get('mode') == 'escaped': + data = data.replace('<', '<') + data = data.replace('>', '>') + data = data.replace('&', '&') + data = data.replace('"', '"') + data = data.replace(''', "'") + return data + + def push(self, element, expectingText): +# print 'push', element, expectingText +# while self.elementstack and self.elementstack[-1][1]: +# self.pop(self.elementstack[-1][0]) + self.elementstack.append([element, expectingText, []]) + + def pop(self, element): +# print 'pop', element + if not self.elementstack: return +# while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0]) + if self.elementstack[-1][0] != element: return + + element, expectingText, pieces = self.elementstack.pop() + output = "".join(pieces) + output = output.strip() + if not expectingText: return output + + # decode base64 content + if self.contentparams.get('mode') == 'base64' and base64: + try: + output = base64.decodestring(output) + except binascii.Error: + pass + except binascii.Incomplete: + pass + + # resolve relative URIs + if (element in self.can_be_relative_uri) and output: + output = self.resolveURI(output) + + # decode entities within embedded markup + output = self.decodeEntities(element, output) + + # resolve relative URIs within embedded markup + if element in 
self.can_contain_relative_uris: + output = _resolveRelativeURIs(output, self.baseuri) + + # sanitize embedded markup + if element in self.can_contain_dangerous_markup: + output = _sanitizeHTML(output) + + # store output in appropriate place(s) + if self.initem: + if element == 'content': + self.items[-1].setdefault(element, []) + contentparams = copy.deepcopy(self.contentparams) + contentparams['value'] = output + self.items[-1][element].append(contentparams) + elif element == 'category': + self.items[-1][element] = output + domain = self.items[-1]['categories'][-1][0] + self.items[-1]['categories'][-1] = (domain, output) + elif element == 'source': + self.items[-1]['source']['value'] = output + elif element == 'link': + self.items[-1][element] = output + if output: + self.items[-1]['links'][-1]['href'] = output + else: + if self.incontent and element != 'description': + contentparams = copy.deepcopy(self.contentparams) + contentparams['value'] = output + self.items[-1][element + '_detail'] = contentparams + self.items[-1][element] = output + elif self.inchannel and (not self.intextinput) and (not self.inimage): + if element == 'category': + domain = self.channel['categories'][-1][0] + self.channel['categories'][-1] = (domain, output) + elif element == 'link': + self.channel['links'][-1]['href'] = output + else: + if self.incontent and element != 'description': + contentparams = copy.deepcopy(self.contentparams) + contentparams['value'] = output + self.channel[element + '_detail'] = contentparams + self.channel[element] = output + return output + + def _mapToStandardPrefix(self, name): + colonpos = name.find(':') + if colonpos <> -1: + prefix = name[:colonpos] + suffix = name[colonpos+1:] + prefix = self.namespacemap.get(prefix, prefix) + name = prefix + ':' + suffix + return name + + def _getAttribute(self, attrsD, name): + return attrsD.get(self._mapToStandardPrefix(name)) + + def _save(self, key, value): + if value: + if self.initem: + self.items[-1].setdefault(key, value) + elif self.channel: + self.channel.setdefault(key, value) + + def _start_rss(self, attrsD): + versionmap = {'0.91': 'rss091u', + '0.92': 'rss092', + '0.93': 'rss093', + '0.94': 'rss094'} + if not self.version: + attr_version = attrsD.get('version', '') + version = versionmap.get(attr_version) + if version: + self.version = version + elif attr_version.startswith('2.'): + self.version = 'rss20' + else: + self.version = 'rss' + + def _start_channel(self, attrsD): + self.inchannel = 1 + + def _start_feed(self, attrsD): + self.inchannel = 1 + versionmap = {'0.1': 'atom01', + '0.2': 'atom02', + '0.3': 'atom03'} + if not self.version: + attr_version = attrsD.get('version') + version = versionmap.get(attr_version) + if version: + self.version = version + else: + self.version = 'atom' + + def _end_channel(self): + self.inchannel = 0 + _end_feed = _end_channel + + def _start_image(self, attrsD): + self.inimage = 1 + + def _end_image(self): + self.inimage = 0 + + def _start_textinput(self, attrsD): + self.intextinput = 1 + _start_textInput = _start_textinput + + def _end_textinput(self): + self.intextinput = 0 + _end_textInput = _end_textinput + + def _start_author(self, attrsD): + self.inauthor = 1 + self.push('author', 1) + _start_managingeditor = _start_author + _start_dc_author = _start_author + _start_dc_creator = _start_author + + def _end_author(self): + self.pop('author') + self.inauthor = 0 + self._sync_author_detail() + _end_managingeditor = _end_author + _end_dc_author = _end_author + _end_dc_creator = _end_author + 
+ def _start_contributor(self, attrsD): + self.incontributor = 1 + context = self._getContext() + context.setdefault('contributors', []) + context['contributors'].append({}) + self.push('contributor', 0) + + def _end_contributor(self): + self.pop('contributor') + self.incontributor = 0 + + def _start_name(self, attrsD): + self.push('name', 0) + + def _end_name(self): + value = self.pop('name') + if self.inauthor: + self._save_author('name', value) + elif self.incontributor: + self._save_contributor('name', value) + pass + elif self.intextinput: + # TODO + pass + + def _start_url(self, attrsD): + self.push('url', 0) + _start_homepage = _start_url + _start_uri = _start_url + + def _end_url(self): + value = self.pop('url') + if self.inauthor: + self._save_author('url', value) + elif self.incontributor: + self._save_contributor('url', value) + elif self.inimage: + # TODO + pass + elif self.intextinput: + # TODO + pass + _end_homepage = _end_url + _end_uri = _end_url + + def _start_email(self, attrsD): + self.push('email', 0) + + def _end_email(self): + value = self.pop('email') + if self.inauthor: + self._save_author('email', value) + elif self.incontributor: + self._save_contributor('email', value) + pass + elif self.inimage: + # TODO + pass + elif self.intextinput: + # TODO + pass + + def _getContext(self): + if self.initem: + context = self.items[-1] + else: + context = self.channel + return context + + def _save_author(self, key, value): + context = self._getContext() + context.setdefault('author_detail', {}) + context['author_detail'][key] = value + self._sync_author_detail() + + def _save_contributor(self, key, value): + context = self._getContext() + context.setdefault('contributors', [{}]) + context['contributors'][-1][key] = value + + def _sync_author_detail(self): + context = self._getContext() + detail = context.get('author_detail') + if detail: + name = detail.get('name') + email = detail.get('email') + if name and email: + context['author'] = "%s (%s)" % (name, email) + elif name: + context['author'] = name + elif email: + context['author'] = email + else: + author = context.get('author') + if not author: return + emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author) + if not emailmatch: return + email = emailmatch.group(0) + author = author.replace(email, '') + author = author.replace('()', '') + author = author.strip() + context.setdefault('author_detail', {}) + context['author_detail']['name'] = author + context['author_detail']['email'] = email + + def _start_tagline(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('tagline', 1) + _start_subtitle = _start_tagline + + def _end_tagline(self): + value = self.pop('tagline') + self.incontent -= 1 + self.contentparams.clear() + if self.inchannel: + self.channel['description'] = value + _end_subtitle = _end_tagline + + def _start_copyright(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('copyright', 1) + _start_dc_rights = _start_copyright + + def _end_copyright(self): + self.pop('copyright') + self.incontent -= 1 + 
self.contentparams.clear() + _end_dc_rights = _end_copyright + + def _start_item(self, attrsD): + self.items.append({}) + self.push('item', 0) + self.initem = 1 + _start_entry = _start_item + + def _end_item(self): + self.pop('item') + self.initem = 0 + _end_entry = _end_item + + def _start_dc_language(self, attrsD): + self.push('language', 1) + _start_language = _start_dc_language + + def _end_dc_language(self): + self.lang = self.pop('language') + _end_language = _end_dc_language + + def _start_dc_publisher(self, attrsD): + self.push('publisher', 1) + _start_webmaster = _start_dc_publisher + + def _end_dc_publisher(self): + self.pop('publisher') + _end_webmaster = _end_dc_publisher + + def _start_dcterms_issued(self, attrsD): + self.push('issued', 1) + _start_issued = _start_dcterms_issued + + def _end_dcterms_issued(self): + value = self.pop('issued') + self._save('issued_parsed', _parse_date(value)) + _end_issued = _end_dcterms_issued + + def _start_dcterms_created(self, attrsD): + self.push('created', 1) + _start_created = _start_dcterms_created + + def _end_dcterms_created(self): + value = self.pop('created') + self._save('created_parsed', _parse_date(value)) + _end_created = _end_dcterms_created + + def _start_dcterms_modified(self, attrsD): + self.push('modified', 1) + _start_modified = _start_dcterms_modified + _start_dc_date = _start_dcterms_modified + _start_pubdate = _start_dcterms_modified + + def _end_dcterms_modified(self): + value = self.pop('modified') + parsed_value = _parse_date(value) + self._save('date', value) + self._save('date_parsed', parsed_value) + self._save('modified_parsed', parsed_value) + _end_modified = _end_dcterms_modified + _end_dc_date = _end_dcterms_modified + _end_pubdate = _end_dcterms_modified + + def _start_expirationdate(self, attrsD): + self.push('expired', 1) + + def _end_expirationdate(self): + self._save('expired_parsed', _parse_date(self.pop('expired'))) + + def _start_cc_license(self, attrsD): + self.push('license', 1) + value = self._getAttribute(attrsD, 'rdf:resource') + if value: + self.elementstack[-1][2].append(value) + self.pop('license') + + def _start_creativecommons_license(self, attrsD): + self.push('license', 1) + + def _end_creativecommons_license(self): + self.pop('license') + + def _start_category(self, attrsD): + self.push('category', 1) + domain = self._getAttribute(attrsD, 'domain') + cats = [] + if self.initem: + cats = self.items[-1].setdefault('categories', []) + elif self.inchannel: + cats = self.channel.setdefault('categories', []) + cats.append((domain, None)) + _start_dc_subject = _start_category + + def _end_category(self): + self.pop('category') + _end_dc_subject = _end_category + + def _start_cloud(self, attrsD): + self.channel['cloud'] = attrsD + + def _start_link(self, attrsD): + attrsD.setdefault('rel', 'alternate') + attrsD.setdefault('type', 'text/html') + if attrsD.has_key('href'): + attrsD['href'] = self.resolveURI(attrsD['href']) + expectingText = self.inchannel or self.initem + if self.initem: + self.items[-1].setdefault('links', []) + self.items[-1]['links'].append(attrsD) + elif self.inchannel: + self.channel.setdefault('links', []) + self.channel['links'].append(attrsD) + if attrsD.has_key('href'): + expectingText = 0 + if attrsD.get('type', '') in self.html_types: + if self.initem: + self.items[-1]['link'] = attrsD['href'] + elif self.inchannel: + self.channel['link'] = attrsD['href'] + else: + self.push('link', expectingText) + + def _start_guid(self, attrsD): + self.guidislink = 
(attrsD.get('ispermalink', 'true') == 'true') + self.push('guid', 1) + + def _end_guid(self): + value = self.pop('guid') + self._save('id', value) + if self.guidislink: + # guid acts as link, but only if "ispermalink" is not present or is "true", + # and only if the item doesn't already have a link element + self._save('link', value) + + def _start_id(self, attrsD): + self.push('id', 1) + + def _end_id(self): + value = self.pop('id') + self._save('guid', value) + + def _start_title(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('title', self.inchannel or self.initem) + _start_dc_title = _start_title + + def _end_title(self): + self.pop('title') + self.incontent -= 1 + self.contentparams.clear() + _end_dc_title = _end_title + + def _start_description(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/html'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('description', self.inchannel or self.initem) + + def _end_description(self): + value = self.pop('description') + if self.initem: + self.items[-1]['summary'] = value + elif self.inchannel: + self.channel['tagline'] = value + self.incontent -= 1 + self.contentparams.clear() + + def _start_info(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('info', 1) + + def _end_info(self): + self.pop('info') + self.incontent -= 1 + self.contentparams.clear() + + def _start_generator(self, attrsD): + if attrsD: + self.channel['generator_detail'] = attrsD + self.push('generator', 1) + + def _end_generator(self): + value = self.pop('generator') + if self.channel.has_key('generator_detail'): + self.channel['generator_detail']['name'] = value + + def _start_admin_generatoragent(self, attrsD): + self.push('generator', 1) + value = self._getAttribute(attrsD, 'rdf:resource') + if value: + self.elementstack[-1][2].append(value) + self.pop('generator') + + def _start_admin_errorreportsto(self, attrsD): + self.push('errorreportsto', 1) + value = self._getAttribute(attrsD, 'rdf:resource') + if value: + self.elementstack[-1][2].append(value) + self.pop('errorreportsto') + + def _start_summary(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'escaped'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('summary', 1) + + def _end_summary(self): + value = self.pop('summary') + if self.items: + self.items[-1]['description'] = value + self.incontent -= 1 + self.contentparams.clear() + + def _start_enclosure(self, attrsD): + if self.initem: + self.items[-1].setdefault('enclosures', []) + self.items[-1]['enclosures'].append(attrsD) + + def _start_source(self, attrsD): + if self.initem: + self.items[-1]['source'] = attrsD + self.push('source', 1) + + def _end_source(self): + self.pop('source') + + def _start_content(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': attrsD.get('mode', 'xml'), + 'type': attrsD.get('type', 'text/plain'), + 'language': attrsD.get('xml:lang', 
self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('content', 1) + + def _start_body(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': 'xml', + 'type': 'application/xhtml+xml', + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('content', 1) + _start_xhtml_body = _start_body + + def _start_content_encoded(self, attrsD): + self.incontent += 1 + self.contentparams = {'mode': 'escaped', + 'type': 'text/html', + 'language': attrsD.get('xml:lang', self.lang), + 'base': attrsD.get('xml:base', self.baseuri)} + self.push('content', 1) + _start_fullitem = _start_content_encoded + + def _end_content(self): + value = self.pop('content') + if self.contentparams.get('type') in (['text/plain'] + self.html_types): + self._save('description', value) + self.incontent -= 1 + self.contentparams.clear() + _end_body = _end_content + _end_xhtml_body = _end_content + _end_content_encoded = _end_content + _end_fullitem = _end_content + +if _XML_AVAILABLE: + class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler): + def __init__(self, baseuri): + if _debug: sys.stderr.write('trying StrictFeedParser\n') + xml.sax.handler.ContentHandler.__init__(self) + _FeedParserMixin.__init__(self, baseuri) + self.bozo = 0 + self.exc = None + + def startPrefixMapping(self, prefix, uri): + self.trackNamespace(prefix, uri) + + def startElementNS(self, name, qname, attrs): + namespace, localname = name + namespace = str(namespace) + prefix = self.namespaces.get(namespace, '') + if prefix: + localname = prefix + ':' + localname + localname = str(localname).lower() + + # qname implementation is horribly broken in Python 2.1 (it + # doesn't report any), and slightly broken in Python 2.2 (it + # doesn't report the xml: namespace). So we match up namespaces + # with a known list first, and then possibly override them with + # the qnames the SAX parser gives us (if indeed it gives us any + # at all). Thanks to MatejC for helping me test this and + # tirelessly telling me that it didn't work yet. 
+ attrsD = {} + for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): + prefix = self.namespaces.get(namespace, '') + if prefix: + attrlocalname = prefix + ":" + attrlocalname + attrsD[str(attrlocalname).lower()] = attrvalue + for qname in attrs.getQNames(): + attrsD[str(qname).lower()] = attrs.getValueByQName(qname) + self.unknown_starttag(localname, attrsD.items()) + + def resolveEntity(self, publicId, systemId): + return _StringIO() + + def characters(self, text): + self.handle_data(text) + + def endElementNS(self, name, qname): + namespace, localname = name + namespace = str(namespace) + prefix = self.namespaces.get(namespace, '') + if prefix: + localname = prefix + ':' + localname + localname = str(localname).lower() + self.unknown_endtag(localname) + + def fatalError(self, exc): + self.bozo = 1 + self.exc = exc + error = fatalError + +class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser): + def __init__(self, baseuri): + sgmllib.SGMLParser.__init__(self) + _FeedParserMixin.__init__(self, baseuri) + +class _BaseHTMLProcessor(sgmllib.SGMLParser): + elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', + 'img', 'input', 'isindex', 'link', 'meta', 'param'] + + def __init__(self): + sgmllib.SGMLParser.__init__(self) + + def reset(self): + # extend (called by sgmllib.SGMLParser.__init__) + self.pieces = [] + sgmllib.SGMLParser.reset(self) + + def normalize_attrs(self, attrs): + # utility method to be called by descendants + attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs] + attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + return attrs + + def unknown_starttag(self, tag, attrs): + # called for each start tag + # attrs is a list of (attr, value) tuples + # e.g. for
, tag="pre", attrs=[("class", "screen")]
+        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
+        if tag in self.elements_no_end_tag:
+            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
+        else:
+            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
+        
+    def unknown_endtag(self, tag):
+        # called for each end tag, e.g. for 
, tag will be "pre" + # Reconstruct the original end tag. + if tag not in self.elements_no_end_tag: + self.pieces.append("" % locals()) + + def handle_charref(self, ref): + # called for each character reference, e.g. for " ", ref will be "160" + # Reconstruct the original character reference. + self.pieces.append("&#%(ref)s;" % locals()) + + def handle_entityref(self, ref): + # called for each entity reference, e.g. for "©", ref will be "copy" + # Reconstruct the original entity reference. + self.pieces.append("&%(ref)s;" % locals()) + + def handle_data(self, text): + # called for each block of plain text, i.e. outside of any tag and + # not containing any character or entity references + # Store the original text verbatim. + self.pieces.append(text) + + def handle_comment(self, text): + # called for each HTML comment, e.g. + # Reconstruct the original comment. + self.pieces.append("" % locals()) + + def handle_pi(self, text): + # called for each processing instruction, e.g. + # Reconstruct original processing instruction. + self.pieces.append("" % locals()) + + def handle_decl(self, text): + # called for the DOCTYPE, if present, e.g. + # + # Reconstruct original DOCTYPE + self.pieces.append("" % locals()) + + def output(self): + """Return processed HTML as a single string""" + return "".join(self.pieces) + +class _RelativeURIResolver(_BaseHTMLProcessor): + relative_uris = [('a', 'href'), + ('applet', 'codebase'), + ('area', 'href'), + ('blockquote', 'cite'), + ('body', 'background'), + ('del', 'cite'), + ('form', 'action'), + ('frame', 'longdesc'), + ('frame', 'src'), + ('iframe', 'longdesc'), + ('iframe', 'src'), + ('head', 'profile'), + ('img', 'longdesc'), + ('img', 'src'), + ('img', 'usemap'), + ('input', 'src'), + ('input', 'usemap'), + ('ins', 'cite'), + ('link', 'href'), + ('object', 'classid'), + ('object', 'codebase'), + ('object', 'data'), + ('object', 'usemap'), + ('q', 'cite'), + ('script', 'src')] + + def __init__(self, baseuri): + _BaseHTMLProcessor.__init__(self) + self.baseuri = baseuri + + def resolveURI(self, uri): + return urlparse.urljoin(self.baseuri, uri) + + def unknown_starttag(self, tag, attrs): + attrs = self.normalize_attrs(attrs) + attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] + _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) + +def _resolveRelativeURIs(htmlSource, baseURI): + p = _RelativeURIResolver(baseURI) + p.feed(htmlSource) + return p.output() + +class _HTMLSanitizer(_BaseHTMLProcessor): + acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', + 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', + 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', + 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', + 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', + 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', + 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', + 'thead', 'tr', 'tt', 'u', 'ul', 'var'] + + acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', + 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', + 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', + 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', + 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', + 'id', 'ismap', 'label', 'lang', 
'longdesc', 'maxlength', 'media', 'method', + 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', + 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', + 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', + 'usemap', 'valign', 'value', 'vspace', 'width'] + + unacceptable_elements_with_end_tag = ['script', 'applet'] + + def reset(self): + _BaseHTMLProcessor.reset(self) + self.unacceptablestack = 0 + + def unknown_starttag(self, tag, attrs): + if not tag in self.acceptable_elements: + if tag in self.unacceptable_elements_with_end_tag: + self.unacceptablestack += 1 + return + attrs = self.normalize_attrs(attrs) + attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] + _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) + + def unknown_endtag(self, tag): + if not tag in self.acceptable_elements: + if tag in self.unacceptable_elements_with_end_tag: + self.unacceptablestack -= 1 + return + _BaseHTMLProcessor.unknown_endtag(self, tag) + + def handle_pi(self, text): + pass + + def handle_decl(self, text): + pass + + def handle_data(self, text): + if not self.unacceptablestack: + _BaseHTMLProcessor.handle_data(self, text) + +def _sanitizeHTML(htmlSource): + p = _HTMLSanitizer() + p.feed(htmlSource) + data = p.output() + if _mxtidy: + nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0) + if data.count(''): + data = data.split('>', 1)[1] + if data.count(' stream + + This function lets you define parsers that take any input source + (URL, pathname to local or network file, or actual data as a string) + and deal with it in a uniform manner. Returned object is guaranteed + to have all the basic stdio read methods (read, readline, readlines). + Just .close() the object when you're done with it. + + If the etag argument is supplied, it will be used as the value of an + If-None-Match request header. + + If the modified argument is supplied, it must be a tuple of 9 integers + as returned by gmtime() in the standard Python time module. This MUST + be in GMT (Greenwich Mean Time). The formatted date/time will be used + as the value of an If-Modified-Since request header. + + If the agent argument is supplied, it will be used as the value of a + User-Agent request header. + + If the referrer argument is supplied, it will be used as the value of a + Referer[sic] request header. + """ + + if hasattr(url_file_stream_or_string, "read"): + return url_file_stream_or_string + + if url_file_stream_or_string == "-": + return sys.stdin + + if not agent: + agent = USER_AGENT + + # try to open with urllib2 (to use optional headers) + request = urllib2.Request(url_file_stream_or_string) + if etag: + request.add_header("If-None-Match", etag) + if modified: + # format into an RFC 1123-compliant timestamp. We can't use + # time.strftime() since the %a and %b directives can be affected + # by the current locale, but RFC 2616 states that dates must be + # in English. 
+ short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] + request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + request.add_header("User-Agent", agent) + if referrer: + request.add_header("Referer", referrer) + if gzip: + request.add_header("Accept-encoding", "gzip") + opener = urllib2.build_opener(_FeedURLHandler()) + opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent + try: + try: + return opener.open(request) + except: + # url_file_stream_or_string is not a valid URL, but it might be a valid filename + pass + finally: + opener.close() # JohnD + + # try to open with native open function (if url_file_stream_or_string is a filename) + try: + return open(url_file_stream_or_string) + except: + pass + + # treat url_file_stream_or_string as string + return _StringIO(str(url_file_stream_or_string)) + +# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by +# Drake and licensed under the Python license. Removed all range checking +# for month, day, hour, minute, and second, since mktime will normalize +# these later +def _w3dtf_parse(s): + def __extract_date(m): + year = int(m.group("year")) + if year < 100: + year = 100 * int(time.gmtime()[0] / 100) + int(year) + if year < 1000: + return 0, 0, 0 + julian = m.group("julian") + if julian: + julian = int(julian) + month = julian / 30 + 1 + day = julian % 30 + 1 + jday = None + while jday != julian: + t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) + jday = time.gmtime(t)[-2] + diff = abs(jday - julian) + if jday > julian: + if diff < day: + day = day - diff + else: + month = month - 1 + day = 31 + elif jday < julian: + if day + diff < 28: + day = day + diff + else: + month = month + 1 + return year, month, day + month = m.group("month") + day = 1 + if month is None: + month = 1 + else: + month = int(month) + day = m.group("day") + if day: + day = int(day) + else: + day = 1 + return year, month, day + + def __extract_time(m): + if not m: + return 0, 0, 0 + hours = m.group("hours") + if not hours: + return 0, 0, 0 + hours = int(hours) + minutes = int(m.group("minutes")) + seconds = m.group("seconds") + if seconds: + seconds = int(seconds) + else: + seconds = 0 + return hours, minutes, seconds + + def __extract_tzd(m): + """Return the Time Zone Designator as an offset in seconds from UTC.""" + if not m: + return 0 + tzd = m.group("tzd") + if not tzd: + return 0 + if tzd == "Z": + return 0 + hours = int(m.group("tzdhours")) + minutes = m.group("tzdminutes") + if minutes: + minutes = int(minutes) + else: + minutes = 0 + offset = (hours*60 + minutes) * 60 + if tzd[0] == "+": + return -offset + return offset + + __date_re = ("(?P\d\d\d\d)" + "(?:(?P-|)" + "(?:(?P\d\d\d)" + "|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?") + __tzd_re = "(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)" + __tzd_rx = re.compile(__tzd_re) + __time_re = ("(?P\d\d)(?P:|)(?P\d\d)" + "(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?" + + __tzd_re) + __datetime_re = "%s(?:T%s)?" 
% (__date_re, __time_re) + __datetime_rx = re.compile(__datetime_re) + m = __datetime_rx.match(s) + if m is None or m.group() != s: + return None + gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) + if gmt[0] == 0: return + return time.mktime(gmt) + __extract_tzd(m) - time.timezone + +# Additional ISO-8601 date parsing routines written by Fazal Majid +# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 +# parser is beyond the scope of feedparser and would be a worthwhile addition +# to the Python library +# A single regular expression cannot parse ISO 8601 date formats into groups +# as the standard is highly irregular (for instance is 030104 2003-01-04 or +# 0301-04-01), so we use templates instead +# Please note the order in templates is significant because we need a +# greedy match +_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', + 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', + '-YY-?MM', '-OOO', '-YY', + '--MM-?DD', '--MM', + '---DD', + 'CC', ''] +_iso8601_re = [ + tmpl.replace( + 'YYYY', r'(?P\d{4})').replace( + 'YY', r'(?P\d\d)').replace( + 'MM', r'(?P[01]\d)').replace( + 'DD', r'(?P[0123]\d)').replace( + 'OOO', r'(?P[0123]\d\d)').replace( + 'CC', r'(?P\d\d$)') + + r'(T?(?P\d{2}):(?P\d{2})' + + r'(:(?P\d{2}))?' + + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' + for tmpl in _iso8601_tmpl] +del tmpl + +_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] +del regex + +# rfc822.py defines several time zones, but we define some extra ones. +# "ET" is equivalent to "EST", etc. +_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} +rfc822._timezones.update(_additional_timezones) + +def _parse_date(date): + """Parses a variety of date formats into a tuple of 9 integers""" + date = str(date) + try: + # try the standard rfc822 library, which handles + # RFC822, RFC1123, RFC2822, and asctime + tm = rfc822.parsedate_tz(date) + if tm: + return time.gmtime(rfc822.mktime_tz(tm)) + # not a RFC2822 date, try W3DTF profile of ISO-8601 + try: + tm = _w3dtf_parse(date) + except ValueError: + tm = None + if tm: + return time.gmtime(tm) + # try various non-W3DTF ISO-8601-compatible formats like 20040105 + m = None + for _iso8601_match in _iso8601_matches: + m = _iso8601_match(date) + if m: break + if not m: return + # catch truly malformed strings + if m.span() == (0, 0): return + params = m.groupdict() + ordinal = params.get("ordinal", 0) + if ordinal: + ordinal = int(ordinal) + else: + ordinal = 0 + year = params.get("year", "--") + if not year or year == "--": + year = time.gmtime()[0] + elif len(year) == 2: + # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 + year = 100 * int(time.gmtime()[0] / 100) + int(year) + else: + year = int(year) + month = params.get("month", "-") + if not month or month == "-": + # ordinals are NOT normalized by mktime, we simulate them + # by setting month=1, day=ordinal + if ordinal: + month = 1 + else: + month = time.gmtime()[1] + month = int(month) + day = params.get("day", 0) + if not day: + # see above + if ordinal: + day = ordinal + elif params.get("century", 0) or \ + params.get("year", 0) or params.get("month", 0): + day = 1 + else: + day = time.gmtime()[2] + else: + day = int(day) + # special case of the century - is the first year of the 21st century + # 2000 or 2001 ? The debate goes on... 
+ if "century" in params.keys(): + year = (int(params["century"]) - 1) * 100 + 1 + # in ISO 8601 most fields are optional + for field in ["hour", "minute", "second", "tzhour", "tzmin"]: + if not params.get(field, None): + params[field] = 0 + hour = int(params.get("hour", 0)) + minute = int(params.get("minute", 0)) + second = int(params.get("second", 0)) + # weekday is normalized by mktime(), we can ignore it + weekday = 0 + # daylight savings is complex, but not needed for feedparser's purposes + # as time zones, if specified, include mention of whether it is active + # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and + # and most implementations have DST bugs + daylight_savings_flag = 0 + tm = [year, month, day, hour, minute, second, weekday, + ordinal, daylight_savings_flag] + # ISO 8601 time zone adjustments + tz = params.get("tz") + if tz and tz != "Z": + if tz[0] == "-": + tm[3] += int(params.get("tzhour", 0)) + tm[4] += int(params.get("tzmin", 0)) + elif tz[0] == "+": + tm[3] -= int(params.get("tzhour", 0)) + tm[4] -= int(params.get("tzmin", 0)) + else: + return None + # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) + # which is guaranteed to normalize d/m/y/h/m/s + # many implementations have bugs, but we'll pretend they don't + return time.localtime(time.mktime(tm)) + except: + return None + +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None): + """Parse a feed from a URL, file, stream, or string""" + result = {} + f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer) + data = f.read() + if hasattr(f, "headers"): + if gzip and f.headers.get('content-encoding', '') == 'gzip': + try: + data = gzip.GzipFile(fileobj=_StringIO(data)).read() + except: + # some feeds claim to be gzipped but they're not, so we get garbage + data = '' + if hasattr(f, "info"): + info = f.info() + result["etag"] = info.getheader("ETag") + last_modified = info.getheader("Last-Modified") + if last_modified: + result["modified"] = _parse_date(last_modified) + if hasattr(f, "url"): + result["url"] = f.url + result["status"] = 200 # default, may be overridden later + if hasattr(f, "status"): + result["status"] = f.status + if hasattr(f, "headers"): + result["headers"] = f.headers.dict + # get the xml encoding + xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version + match = xmlheaderRe.match(data) + if match: + result["encoding"] = match.groups()[0].lower() + f.close() + result['channel'] = {} + result['items'] = {} + baseuri = result.get('headers', {}).get('content-location', result.get('url')) + # try true XML parser first + if _XML_AVAILABLE: + if _debug: sys.stderr.write('using xml library\n') + result['bozo'] = 0 + feedparser = _StrictFeedParser(baseuri) + if re.search(r'', data): + feedparser.version = 'rss091n' + source = xml.sax.xmlreader.InputSource() + source.setByteStream(_StringIO(data)) + saxparser = xml.sax.make_parser()#["drv_libxml2"]) + saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) + saxparser.setContentHandler(feedparser) + saxparser.setErrorHandler(feedparser) + try: + saxparser.setDTDHandler(feedparser) + saxparser.setEntityResolver(feedparser) + except xml.sax.SAXNotSupportedException: + if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n') + # libxml2 driver does not currently support DTDHandler or EntityResolver + pass + if hasattr(saxparser, 
'_ns_stack'):
+            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
+            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
+            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
+        try:
+            saxparser.parse(source)
+        except Exception, e:
+            # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
+            # which captures them.  For some reason, some Unicode-related errors go
+            # uncaught on some combination of platform, XML library, Python version,
+            # and phase of the moon.
+            feedparser.bozo = 1
+            feedparser.bozo_exception = e
+    if feedparser.bozo:
+        # feed is not well-formed XML, fall back on regex-based parser
+        if _debug: sys.stderr.write('xml parsing failed, using regexes. now you have two problems...\n')
+        result['bozo'] = 1
+        result['bozo_exception'] = feedparser.exc
+        # munge short tags, e.g. <tag/> becomes <tag></tag>
+        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
+        feedparser = _LooseFeedParser(baseuri)
+        feedparser.feed(data)
+    else:
+        if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
+        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
+        feedparser = _LooseFeedParser(baseuri)
+        feedparser.feed(data)
+    result['channel'] = feedparser.channel
+    result['items'] = feedparser.items
+    result['version'] = feedparser.version
+    return result
+
+_TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
+               'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
+               'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
+               'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
+               'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
+               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
+               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
+               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
+
+if __name__ == '__main__':
+    if sys.argv[1:]:
+        urls = sys.argv[1:]
+    else:
+        urls = _TEST_SUITE
+    from pprint import pprint
+    for url in urls:
+        print url
+        print
+        result = parse(url)
+        pprint(result)
+        print
+
+#TODO
+#- image
+#- textinput/textInput
+#- comments
+#
+#encoding notes:
+#- RFC 3023
+#- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
+#- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
+#- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
+#
+#REVISION HISTORY
+#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
+#  added Simon Fell's test suite
+#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
+#2.0 - 10/19/2002
+#  JD - use inchannel to watch out for image and textinput elements which can
+#  also contain title, link, and description elements
+#  JD - check for isPermaLink="false" attribute on guid elements
+#  JD - replaced openAnything with open_resource supporting ETag and
+#  If-Modified-Since request headers
+#  JD - parse now accepts etag, modified, agent, and referrer optional
+#  arguments
+#  JD - modified parse to return a dictionary instead of a tuple so that any
+#  etag or modified information can be returned and cached by the caller
+#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
+#  because of etag/modified, return the old etag/modified to the caller to
+#  indicate why nothing is being returned
+#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
+#  useless.  Fixes the problem JD was addressing by adding it.
+#2.1 - 11/14/2002 - MAP - added gzip support
+#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
+#  start_admingeneratoragent is an example of how to handle elements with
+#  only attributes, no content.
+#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
+#  also, make sure we send the User-Agent even if urllib2 isn't available.
+#  Match any variation of backend.userland.com/rss namespace.
+#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
+#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
+#  snapshot of July 1 ; changed
+#  project name
+#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+#  removed unnecessary urllib code -- urllib2 should always be available anyway;
+#  return actual url, status, and full HTTP headers (as result['url'],
+#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
+#  this should pass all the HTTP tests at ;
+#  added the latest namespace-of-the-week for RSS 2.0
+#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
+#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+#  inline and as used in some RSS 2.0 feeds
+#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
+#  textInput, and also to return the character encoding (if specified)
+#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
+#  nested divs within content (JohnD); fixed missing sys import (JohanS);
+#  fixed regular expression to capture XML character encoding (Andrei);
+#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
+#  added support for cloud (MartijnP); added support for multiple
+#  category/dc:subject (MartijnP); normalize content model: "description" gets
+#  description (which can come from description, summary, or full content if no
+#  description), "content" gets dict of base/language/type/value (which can come
+#  from content:encoded, xhtml:body, content, or fullitem);
+#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
+#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
+#  element is not in default namespace (like Pocketsoap feed);
+#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
+#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
+#  description, xhtml:body, content, content:encoded, title, subtitle,
+#  summary, info, tagline, and copyright; added support for pingback and
+#  trackback namespaces
+#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
+#  namespaces, as opposed to 2.6 when I said I did but didn't really;
+#  sanitize HTML markup within some elements; added mxTidy support (if
+#  installed) to tidy HTML markup within some elements; fixed indentation
+#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
+#  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
+#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
+#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
+#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
+#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
+#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
+#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
+#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
+#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
+#  fixed relative URI processing for guid (skadz); added ICBM support; added
+#  base64 support
+#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
+#  blogspot.com sites); added _debug variable
+#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
+#3.0 - MAP - parse entire feed with real XML parser (if available); added several
+#  new supported namespaces; fixed bug tracking naked markup in description;
+#  added support for enclosure; added support for source; re-added support for
+#  cloud which got dropped somehow; added support for expirationDate; fixed
+#  xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
+#  documents that don't define one explicitly and one for documents that define
+#  an outer and an inner xml:base that goes out of scope before the end of the
+#  document; fixed bug parsing multiple links at feed level; added feed type and
+#  version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
+#  or empty string if unrecognized; added support for creativeCommons:license and
+#  cc:license; added support for full Atom content model in title, tagline, info,
+#  copyright, summary; fixed bug with gzip encoding (not always telling server
+#  we support it when we do); support Atom-style author element in author_detail
+#  (dictionary of "name", "url", "email"); map author to author_detail if author
+#  contains name + email address; better handling of empty HTML tags (br, hr, img,
+#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
+#  fixed CDATA handling in non-wellformed feeds under Python 2.1
diff --git a/faq.html b/faq.html
new file mode 100644
index 0000000..887552b
--- /dev/null
+++ b/faq.html
@@ -0,0 +1,81 @@
+
+
+
+	Planet UCC - Frequently Asked Questions
+
+
+
+
+
+

+

+ Planet UCC + +
+ + + +
+

Frequently Asked Questions

+
+

What is a Planet?

+

+	A Planet is a collection of weblogs (commonly referred to as Blogs) for members
+	of a community. Since UCC is as technical as communities get (and its members
+	love to blog), Planet UCC was created as a way of collecting all of those blogs
+	in one place.

+
+
+

Who wrote Planet UCC?

+

+	Planet UCC was written by Davyd Madeley
+	<davyd@ucc.asn.au> with input and ideas
+	from a number of other people. Planet UCC was written in
+	Python, with art created in
+	the Gimp.

+
+
+

How do I get syndicated to Planet UCC?

+

+	Email the Planetmaster
+	<planet@ucc.asn.au>

+
+
+

Are there other Planets out there?

+

+ Of course there are! Some of them even have life. A few known Planets are: +

+ +
+ +
+
+
+
diff --git a/planet.css b/planet.css
index 5027762..69da088 100644
--- a/planet.css
+++ b/planet.css
@@ -103,21 +103,21 @@
 	color: #999999;
 }
-.item p.body a {
+.item a {
 	color: blue;
 	text-decoration: underline;
 }
-.item p.body a:visited {
+.item a:visited {
 	color: blue;
 	text-decoration: underline;
 }
 .item p.time a {
 	color: #999999;
-	text-decoration: none;
+	text-decoration: underline;
 }
 .item p.time a:visited {
 	color: #999999;
-	text-decoration: none;
+	text-decoration: underline;
 }
 .footer {
diff --git a/sidebar.html b/sidebar.html
index ae4b57a..1591684 100644
--- a/sidebar.html
+++ b/sidebar.html
@@ -7,12 +7,18 @@
 	with help from the Gimp and other tools.
-	It currently supports RSSv2, RDF and Atom
-	(used by Blogger) news syndication formats,
-	as well as XHTML as an output format. Other
-	formats can be added with ease.
+	Planet UCC now uses XMLParse2, a next-generation
+	parser that uses
+	feedparser.
+	This means that Planet UCC can now parse 9 different types of RSS and
+	Atom (from Blogger). Those interested in the source can check it out from UCC CVS.
 	Planet UCC can be considered BETA

+

Links

+

+ Frequently Asked Questions
+ UCC Homepage +
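[Editor's sketch, not part of the commit: the sidebar text above says XMLParse2 hands the real feed handling to feedparser and reuses the cached copy of a feed when nothing has changed. A minimal, hypothetical driver for that API; the blog name and feed URL below are made up, while the CacheHandler and XMLParse2 calls are the ones added in this commit.]

    import CacheHandler
    import XMLParse2 as XMLParse

    cache = CacheHandler.CacheHandler()
    url = 'http://example.org/feed.xml'           # placeholder feed URL

    cached = cache.getBlog('Example Blog', url)   # previously pickled Blog, or None
    blog = XMLParse.XMLParse(url, cached).parse() # feedparser is given the cached ETag/date
    if blog is not None and blog is not cached:
        # a new object means the feed actually changed and was re-downloaded
        print '%d entries fetched' % len(blog.items)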

diff --git a/update-planet b/update-planet
index c6f04dd..6122579 100755
--- a/update-planet
+++ b/update-planet
@@ -7,34 +7,37 @@
 # (c) 2004, Davyd Madeley
 #
 
-import sys, urllib2, codecs
-import XMLParse, XMLWriter
+import sys, codecs
+import XMLParse2 as XMLParse, XMLWriter, CacheHandler
 
-# step 1: read in the config and download the feeds
+# step 1: read in the config and check each object from cache
+cache = CacheHandler.CacheHandler()
 feeds = []
+
 for feed in open('feedlist').readlines():
 	if feed.strip()[0] != '#':
 		storage = feed.strip().split('\t')
 		name, feed = storage[0], storage[-1]
-		sys.stdout.write('Downloading feed "%s" from %s... ' % (name, feed))
 		try:
-			# XXX: might want to consider some good caching code in here
-			feeds.append((name, feed, urllib2.urlopen(feed).read()))
-			sys.stdout.write('done.\n')
+			feeds.append((name, feed, cache.getBlog(name, feed)))
+#			# XXX: might want to consider some good caching code in here
+#			feeds.append((name, feed, urllib2.urlopen(feed).read()))
 		except:
-			sys.stdout.write('failed.\n')
+			raise
 
 # step 2: process each feed
 blogs = []
 for feed in feeds:
-	xml = XMLParse.XMLParse(feed[2]).parse()
-	for blog in xml:
-		blog.blogTitle = feed[0]
-	blogs += xml
+	# XMLParse2 takes two parameters, a URL and a CacheObject
+	blog = XMLParse.XMLParse(feed[1], feed[2]).parse()
+	blog.blogTitle = feed[0]
+	blog.feedURL = feed[1]
+	blogs.append(blog)
+	# write the cache back down to disk
+	cache.storeBlog(blog)
 
 # step 3: write feed to disk
 try:
 	codecs.open('planet.html', 'wb', 'utf-8').write(XMLWriter.XMLWriter(XMLWriter.XHTMLWriter, blogs).write())
 except:
 	sys.stderr.write('DEBUG: update-planet: could not write planet.html, aborting\n')
-	raise
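[Editor's sketch, not part of the commit: a rough outline of the round trip the new update-planet performs for each entry in feedlist, using the CacheHandler, XMLParse2 and XMLWriter APIs introduced above. The run_once() wrapper and its argument defaults are invented for illustration; feedlist and planet.html are the filenames the script already uses.]

    import codecs
    import CacheHandler
    import XMLParse2 as XMLParse
    import XMLWriter

    def run_once(feedlist='feedlist', output='planet.html'):
        cache = CacheHandler.CacheHandler()
        blogs = []
        for line in open(feedlist).readlines():
            line = line.strip()
            if not line or line[0] == '#':
                continue
            storage = line.split('\t')
            name, url = storage[0], storage[-1]
            # 1. fetch the previously pickled Blog from cache/ (it carries the CacheObject
            #    with the last ETag and modification date)
            cached = cache.getBlog(name, url)
            # 2. re-parse; an unchanged feed costs only a conditional request and the
            #    cached Blog object is handed straight back
            blog = XMLParse.XMLParse(url, cached).parse()
            if blog is None:
                continue
            blog.blogTitle = name
            blog.feedURL = url
            blogs.append(blog)
            # 3. write the (possibly refreshed) Blog back into cache/
            cache.storeBlog(blog)
        # 4. render the combined planet page with the XHTML writer
        codecs.open(output, 'wb', 'utf-8').write(
            XMLWriter.XMLWriter(XMLWriter.XHTMLWriter, blogs).write())

    if __name__ == '__main__':
        run_once()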