From: davyd
Date: Mon, 3 May 2004 16:15:31 +0000 (+0000)
Subject: Fixes for dodgy blogs, new feedparser
X-Git-Url: https://git.ucc.asn.au/?a=commitdiff_plain;h=2b074c0d428d190e8ddc08b01597dbe397a5254c;p=planet-ucc.git

Fixes for dodgy blogs, new feedparser
---

diff --git a/Changelog b/Changelog
index bf8e029..ebcf42f 100644
--- a/Changelog
+++ b/Changelog
@@ -1,3 +1,12 @@
+2004-04-28
+==========
+ * update-planet
+   Cacheability hacks for Adrian Woodley's blog.
+ * XMLParse2.py
+   Logging tweaks.
+ * extras/feedparser.py
+   Upgraded to new version of feedparser.
+
 2004-03-22
 ==========
  * CacheHandler.py
diff --git a/XMLParse2.py b/XMLParse2.py
index 861b333..d1ca814 100644
--- a/XMLParse2.py
+++ b/XMLParse2.py
@@ -42,19 +42,20 @@ class XMLParse:
         "Return a single Blog object"
         item = Blog()
         if self.blogObject and self.blogObject.cache:
-            sys.stdout.write('Downloading feed %s...' % self.feedURL)
+            sys.stdout.write('Downloading feed %s... ' % self.feedURL)
             try:
                 data = feedparser.parse(self.feedURL, self.blogObject.cache.etag, self.blogObject.cache.date)
-                sys.stdout.write('done.\n')
+                # check to see what we got returned
+                if data['items'] == [] and data['channel'] == {}:
+                    sys.stdout.write('cached.\n')
+                    return self.blogObject
+                else:
+                    sys.stdout.write('done.\n')
             except:
                 sys.stdout.write('failed.\n')
                 return None
-            # check to see what we got returned
-            if data['items'] == [] and data['channel'] == {}:
-                sys.stdout.write('Feed %s is upto date.\n' % self.feedURL)
-                return self.blogObject
         else:
-            sys.stdout.write('Downloading feed from %s (no cache)...' % self.feedURL)
+            sys.stdout.write('Downloading feed (no cache) %s... ' % self.feedURL)
             try:
                 data = feedparser.parse(self.feedURL)
                 sys.stdout.write('done.\n')
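
The XMLParse2 change moves the staleness test inside the try block because the new feedparser signals an HTTP 304 by returning empty 'items' and 'channel' values instead of raising. A minimal sketch of the calling pattern, assuming a cache object carrying the etag and date captured on an earlier fetch:

    import feedparser

    def fetch(url, cache=None):
        # cache.etag/cache.date are assumed saved from a previous response
        if cache is not None:
            data = feedparser.parse(url, cache.etag, cache.date)
            if data['items'] == [] and data['channel'] == {}:
                return None  # HTTP 304: feed unchanged, keep the cached copy
            return data
        return feedparser.parse(url)
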
diff --git a/extra/feedparser.py b/extra/feedparser.py
index 024194e..4c4afd9 100644
--- a/extra/feedparser.py
+++ b/extra/feedparser.py
@@ -3,41 +3,14 @@
 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
 
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
-
-Things it handles that choke other parsers:
-- bastard combinations of RSS 0.9x and RSS 1.0
-- illegal 8-bit XML characters
-- naked and/or invalid HTML in description
-- content:encoded, xhtml:body, fullitem
-- guid
-- elements in non-standard namespaces or non-default namespaces
-- multiple content items per entry (Atom)
-- multiple links per entry (Atom)
-
-Other features:
-- resolves relative URIs in some elements
-  - uses xml:base to define base URI
-  - uses URI of feed if no xml:base is given
-  - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri
-- resolves relative URIs within embedded markup
-  - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris
-- sanitizes embedded markup in some elements
-  - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements
-  - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes
-  - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup
-  - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = []
-- optionally tidies embedded markup
-  - fixes malformed HTML
-  - converts to XHTML
-  - converts character entities to numeric entities
-  - requires mxTidy
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
 
 Required: Python 2.1 or later
 Recommended: Python 2.3 or later
+Recommended: libxml2
 """
 
-__version__ = "3.0-beta-14"
+__version__ = "3.0-beta-22"
 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
@@ -45,12 +18,20 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                     "John Beimler <http://john.beimler.org/>",
                     "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
 __license__ = "Python"
 _debug = 0
+_debug_never_use_libxml2 = 0
 
 # if you are embedding feedparser in a larger application, you should change this to your application name and URL
 USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
 
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1.
+# This is off by default because of reports of crashing on some platforms.  If it crashes
+# for you, please submit a bug report with your OS platform, Python version, and the URL
+# of the feed you were attempting to parse.
+# Requires mxTidy
+TIDY_MARKUP = 0
+
 # ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types
 try:
     from cStringIO import StringIO as _StringIO
 except:
@@ -66,24 +47,23 @@ except:
 
 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
 # Python 2.3 now has this functionality available in the standard socket library, so under
-# 2.3 you don't need to install anything.
-import socket
-if hasattr(socket, 'setdefaulttimeout'):
-    socket.setdefaulttimeout(10)
-else:
-    try:
-        import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
-        timeoutsocket.setDefaultSocketTimeout(10)
-    except ImportError:
-        pass
+# 2.3 you don't need to install anything.  But you probably should anyway, because the socket
+# module is buggy and timeoutsocket is better.
+try:
+    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
+    timeoutsocket.setDefaultSocketTimeout(10)
+except ImportError:
+    import socket
+    if hasattr(socket, 'setdefaulttimeout'):
+        socket.setdefaulttimeout(10)
 import urllib2
 
-# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
-# this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
-try:
-    from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
-except:
-    _mxtidy = None
+_mxtidy = None
+if TIDY_MARKUP:
+    try:
+        from mx.Tidy import Tidy as _mxtidy
+    except:
+        pass
 
 # If a real XML parser is available, feedparser will attempt to use it.  feedparser works
 # with both the built-in SAX parser and PyXML SAX parser.  On platforms where the Python
@@ -96,11 +76,12 @@
 # using one.
 try:
     import xml.sax
-    from xml.sax.saxutils import escape as xmlescape
+    from xml.sax.saxutils import escape as _xmlescape
+    class CharacterEncodingOverride(xml.sax.SAXException): pass
    _XML_AVAILABLE = 1
 except:
     _XML_AVAILABLE = 0
-    def xmlescape(data):
+    def _xmlescape(data):
         data = data.replace("&", "&amp;")
         data = data.replace(">", "&gt;")
         data = data.replace("<", "&lt;")
@@ -129,7 +110,9 @@ SUPPORTED_VERSIONS = {'': 'unknown',
                       'atom01': 'Atom 0.1',
                       'atom02': 'Atom 0.2',
                       'atom03': 'Atom 0.3',
-                      'atom': 'Atom (unknown version)'
+                      'atom': 'Atom (unknown version)',
+                      'cdf': 'CDF',
+                      'hotrss': 'Hot RSS'
                       }
 
 try:
@@ -142,10 +125,29 @@ except NameError:
             rc[k] = v
         return rc
 
+from UserDict import UserDict
+class FeedParserDict(UserDict):
+    def __getitem__(self, key):
+        if key == 'channel': key = 'feed'
+        if key == 'items': key = 'entries'
+        return UserDict.__getitem__(self, key)
+
+    def __getattr__(self, key):
+        try:
+            return self.__dict__[key]
+        except KeyError:
+            pass
+        try:
+            return self.__getitem__(key)
+        except:
+            raise AttributeError, "object has no attribute '%s'" % key
+
 class _FeedParserMixin:
-    namespaces = {"http://backend.userland.com/rss": "",
+    namespaces = {"": "",
+                  "http://backend.userland.com/rss": "",
                   "http://blogs.law.harvard.edu/tech/rss": "",
                   "http://purl.org/rss/1.0/": "",
+                  "http://my.netscape.com/rdf/simple/0.9/": "",
                   "http://example.com/newformat#": "",
                   "http://example.com/necho": "",
                   "http://purl.org/echo/": "",
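
FeedParserDict, introduced above, is the compatibility shim for the renamed result keys: 'channel' is aliased to 'feed' and 'items' to 'entries', and __getattr__ exposes every key as an attribute. A sketch of what that buys callers (URL hypothetical):

    import feedparser

    d = feedparser.parse('http://example.org/index.rss')
    d['items'] is d['entries']   # the legacy key is transparently remapped
    d.feed is d['channel']       # any key is also readable as an attribute
    d.feed.get('title', '')      # nested dictionaries are FeedParserDicts too
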
@@ -196,28 +198,29 @@ class _FeedParserMixin:
                   "http://www.w3.org/XML/1998/namespace": "xml"
                   }
 
-    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments']
     can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
     can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
     html_types = ['text/html', 'application/xhtml+xml']
 
-    def __init__(self, baseuri=None):
+    def __init__(self, baseuri=None, encoding='utf-8'):
         if _debug: sys.stderr.write("initializing FeedParser\n")
-        self.channel = {} # channel- or feed-level data
-        self.items = [] # list of item- or entry-level data
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
         self.version = '' # feed type/version, see SUPPORTED_VERSIONS
 
         # the following are used internally to track state;
         # some of this is kind of out of control and should
         # probably be refactored into a finite state machine
-        self.inchannel = 0
-        self.initem = 0
+        self.infeed = 0
+        self.inentry = 0
         self.incontent = 0
         self.intextinput = 0
         self.inimage = 0
         self.inauthor = 0
         self.incontributor = 0
-        self.contentparams = {}
+        self.contentparams = FeedParserDict()
         self.namespacemap = {}
         self.elementstack = []
         self.basestack = []
@@ -233,11 +236,11 @@
 
         # track xml:base and xml:lang
         attrsD = dict(attrs)
-        baseuri = attrsD.get('xml:base')
+        baseuri = attrsD.get('xml:base', attrsD.get('base'))
         if baseuri:
             if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
             self.baseuri = baseuri
-        lang = attrsD.get('xml:lang')
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
         if lang:
             self.lang = lang
         self.basestack.append(baseuri)
@@ -267,9 +270,9 @@
             return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
 
         # match namespaces
-        try:
+        if tag.find(':') <> -1:
             prefix, suffix = tag.split(':', 1)
-        except ValueError:
+        else:
             prefix, suffix = '', tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
@@ -286,9 +289,9 @@
     def unknown_endtag(self, tag):
         if _debug: sys.stderr.write('end %s\n' % tag)
         # match namespaces
-        try:
+        if tag.find(':') <> -1:
             prefix, suffix = tag.split(':', 1)
-        except ValueError:
+        else:
             prefix, suffix = '', tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
@@ -340,8 +343,9 @@
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         if not self.elementstack: return
+#        if _debug: sys.stderr.write(text)
         if escape and self.contentparams.get('mode') == 'xml':
-            text = xmlescape(text)
+            text = _xmlescape(text)
         self.elementstack[-1][2].append(text)
 
     def handle_comment(self, text):
@@ -353,39 +357,15 @@
         pass
 
     def handle_decl(self, text):
-        # called for the DOCTYPE, if present, e.g.
-        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-        if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'):
-            self.version = 'rss091n'
-
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
-    def _scan_name(self, i, declstartpos):
-        rawdata = self.rawdata
-        n = len(rawdata)
-        if i == n:
-            return None, -1
-        m = self._new_declname_match(rawdata, i)
-        if m:
-            s = m.group()
-            name = s.strip()
-            if (i + len(s)) == n:
-                return None, -1  # end of buffer
-            return name.lower(), m.end()
-        else:
-            self.updatepos(declstartpos, i)
-            self.error("expected name token")
+        pass
 
     def parse_declaration(self, i):
         # override internal declaration handler to handle CDATA blocks
         if _debug: sys.stderr.write("entering parse_declaration\n")
-        if re.search(r'^<!DOCTYPE\s+rss\s+PUBLIC\s+"-//Netscape Communications//DTD RSS 0.91//EN"', self.rawdata[i:]):
-            if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
-            self.version = 'rss091n'
         if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1: k = len(self.rawdata)
-            self.handle_data(xmlescape(self.rawdata[i+9:k]), 0)
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
             return k+3
         else:
             k = self.rawdata.find('>', i)
@@ -394,6 +374,8 @@
     def trackNamespace(self, prefix, uri):
         if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
             self.version = 'rss090'
+        if (prefix, uri) == (None, 'http://purl.org/rss/1.0/') and not self.version:
+            self.version = 'rss10'
         if not prefix: return
         if uri.find('backend.userland.com/rss') <> -1:
             # match any backend.userland.com namespace
@@ -414,13 +396,11 @@
         return data
 
     def push(self, element, expectingText):
-#        print 'push', element, expectingText
 #        while self.elementstack and self.elementstack[-1][1]:
 #            self.pop(self.elementstack[-1][0])
         self.elementstack.append([element, expectingText, []])
 
     def pop(self, element):
-#        print 'pop', element
         if not self.elementstack: return
 #        while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0])
         if self.elementstack[-1][0] != element: return
@@ -448,47 +428,56 @@
 
         # resolve relative URIs within embedded markup
         if element in self.can_contain_relative_uris:
-            output = _resolveRelativeURIs(output, self.baseuri)
+            output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
 
         # sanitize embedded markup
         if element in self.can_contain_dangerous_markup:
-            output = _sanitizeHTML(output)
+            output = _sanitizeHTML(output, self.encoding)
+
+        if type(output) == types.StringType:
+            try:
+                output = unicode(output, self.encoding)
+            except:
+                pass
 
         # store output in appropriate place(s)
         if self.inentry:
             if element == 'content':
-                self.items[-1].setdefault(element, [])
+                self.entries[-1].setdefault(element, [])
                 contentparams = copy.deepcopy(self.contentparams)
                 contentparams['value'] = output
-                self.items[-1][element].append(contentparams)
+                self.entries[-1][element].append(contentparams)
             elif element == 'category':
-                self.items[-1][element] = output
-                domain = self.items[-1]['categories'][-1][0]
-                self.items[-1]['categories'][-1] = (domain, output)
+                self.entries[-1][element] = output
+                domain = self.entries[-1]['categories'][-1][0]
+                self.entries[-1]['categories'][-1] = (domain, output)
             elif element == 'source':
-                self.items[-1]['source']['value'] = output
+                self.entries[-1]['source']['value'] = output
             elif element == 'link':
-                self.items[-1][element] = output
+                self.entries[-1][element] = output
                 if output:
-                    self.items[-1]['links'][-1]['href'] = output
+                    self.entries[-1]['links'][-1]['href'] = output
             else:
-                if self.incontent and element != 'description':
+                self.entries[-1][element] = output
+                if self.incontent:
+                    if element == 'description':
+                        element = 'summary'
                     contentparams = copy.deepcopy(self.contentparams)
                     contentparams['value'] = output
-                    self.items[-1][element + '_detail'] = contentparams
-                self.items[-1][element] = output
-        elif self.inchannel and (not self.intextinput) and (not self.inimage):
+                    self.entries[-1][element + '_detail'] = contentparams
+        elif self.infeed and (not self.intextinput) and (not self.inimage):
+            self.feeddata[element] = output
             if element == 'category':
-                domain = self.channel['categories'][-1][0]
-                self.channel['categories'][-1] = (domain, output)
+                domain = self.feeddata['categories'][-1][0]
+                self.feeddata['categories'][-1] = (domain, output)
             elif element == 'link':
-                self.channel['links'][-1]['href'] = output
-            else:
-                if self.incontent and element != 'description':
-                    contentparams = copy.deepcopy(self.contentparams)
-                    contentparams['value'] = output
-                    self.channel[element + '_detail'] = contentparams
-                self.channel[element] = output
+                self.feeddata['links'][-1]['href'] = output
+            elif self.incontent:
+                if element == 'description':
+                    element = 'tagline'
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.feeddata[element + '_detail'] = contentparams
         return output
 
     def _mapToStandardPrefix(self, name):
@@ -505,10 +494,10 @@
 
     def _save(self, key, value):
         if value:
-            if self.initem:
-                self.items[-1].setdefault(key, value)
-            elif self.channel:
-                self.channel.setdefault(key, value)
+            if self.inentry:
+                self.entries[-1].setdefault(key, value)
+            elif self.feeddata:
+                self.feeddata.setdefault(key, value)
 
     def _start_rss(self, attrsD):
         versionmap = {'0.91': 'rss091u',
@@ -524,12 +513,28 @@
             self.version = 'rss20'
         else:
             self.version = 'rss'
+
+    def _start_dlhottitles(self, attrsD):
+        self.version = 'hotrss'
 
     def _start_channel(self, attrsD):
-        self.inchannel = 1
-
+        self.infeed = 1
+        self._cdf_common(attrsD)
+    _start_feedinfo = _start_channel
+
+    def _cdf_common(self, attrsD):
+        if attrsD.has_key('lastmod'):
+            if _debug: sys.stderr.write(attrsD['lastmod'] + '\n')
+            self._start_modified({})
+            self.elementstack[-1][-1] = attrsD['lastmod']
+            self._end_modified()
+        if attrsD.has_key('href'):
+            self._start_link({})
+            self.elementstack[-1][-1] = attrsD['href']
+            self._end_link()
+
     def _start_feed(self, attrsD):
-        self.inchannel = 1
+        self.infeed = 1
         versionmap = {'0.1': 'atom01',
                       '0.2': 'atom02',
                       '0.3': 'atom03'}
@@ -542,7 +547,7 @@
             self.version = 'atom'
 
     def _end_channel(self):
-        self.inchannel = 0
+        self.infeed = 0
     _end_feed = _end_channel
 
     def _start_image(self, attrsD):
@@ -553,9 +558,13 @@
     def _start_textinput(self, attrsD):
         self.intextinput = 1
+        self.push('textinput', 0)
+        context = self._getContext()
+        context.setdefault('textinput', FeedParserDict())
     _start_textInput = _start_textinput
 
     def _end_textinput(self):
+        self.pop('textinput')
         self.intextinput = 0
     _end_textInput = _end_textinput
 
@@ -578,7 +587,7 @@
         self.incontributor = 1
         context = self._getContext()
         context.setdefault('contributors', [])
-        context['contributors'].append({})
+        context['contributors'].append(FeedParserDict())
         self.push('contributor', 0)
 
     def _end_contributor(self):
@@ -594,13 +603,12 @@
             self._save_author('name', value)
         elif self.incontributor:
             self._save_contributor('name', value)
-            pass
         elif self.intextinput:
-            # TODO
-            pass
+            context = self._getContext()
+            context['textinput']['name'] = value
 
     def _start_url(self, attrsD):
-        self.push('url', 0)
+        self.push('url', 1)
     _start_homepage = _start_url
     _start_uri = _start_url
@@ -614,7 +622,7 @@
             # TODO
             pass
         elif self.intextinput:
-            # TODO
+            # TODO (map to link)
             pass
     _end_homepage = _end_url
     _end_uri = _end_url
@@ -629,29 +637,23 @@
         elif self.incontributor:
             self._save_contributor('email', value)
             pass
-        elif self.inimage:
-            # TODO
-            pass
-        elif self.intextinput:
-            # TODO
-            pass
 
     def _getContext(self):
-        if self.initem:
-            context = self.items[-1]
+        if self.inentry:
+            context = self.entries[-1]
         else:
-            context = self.channel
+            context = self.feeddata
         return context
 
     def _save_author(self, key, value):
         context = self._getContext()
-        context.setdefault('author_detail', {})
+        context.setdefault('author_detail', FeedParserDict())
         context['author_detail'][key] = value
         self._sync_author_detail()
 
     def _save_contributor(self, key, value):
         context = self._getContext()
-        context.setdefault('contributors', [{}])
+        context.setdefault('contributors', [FeedParserDict()])
         context['contributors'][-1][key] = value
 
     def _sync_author_detail(self):
@@ -672,19 +674,25 @@
         emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
         if not emailmatch: return
         email = emailmatch.group(0)
+        # probably a better way to do the following, but it passes all the tests
         author = author.replace(email, '')
         author = author.replace('()', '')
         author = author.strip()
-        context.setdefault('author_detail', {})
+        if author and (author[0] == '('):
+            author = author[1:]
+        if author and (author[-1] == ')'):
+            author = author[:-1]
+        author = author.strip()
+        context.setdefault('author_detail', FeedParserDict())
         context['author_detail']['name'] = author
         context['author_detail']['email'] = email
 
     def _start_tagline(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('tagline', 1)
     _start_subtitle = _start_tagline
 
     def _end_tagline(self):
         value = self.pop('tagline')
         self.incontent -= 1
         self.contentparams.clear()
-        if self.inchannel:
-            self.channel['description'] = value
+        if self.infeed:
+            self.feeddata['description'] = value
     _end_subtitle = _end_tagline
 
     def _start_copyright(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('copyright', 1)
     _start_dc_rights = _start_copyright
@@ -712,14 +720,16 @@
     _end_dc_rights = _end_copyright
 
     def _start_item(self, attrsD):
-        self.items.append({})
+        self.entries.append(FeedParserDict())
         self.push('item', 0)
-        self.initem = 1
+        self.inentry = 1
+        self._cdf_common(attrsD)
     _start_entry = _start_item
+    _start_product = _start_item
 
     def _end_item(self):
         self.pop('item')
-        self.initem = 0
+        self.inentry = 0
     _end_entry = _end_item
 
     def _start_dc_language(self, attrsD):
@@ -764,6 +774,7 @@
     def _end_dcterms_modified(self):
         value = self.pop('modified')
+        if _debug: sys.stderr.write('_end_dcterms_modified, value=' + value + '\n')
         parsed_value = _parse_date(value)
         self._save('date', value)
         self._save('date_parsed', parsed_value)
@@ -795,41 +806,51 @@
         self.push('category', 1)
         domain = self._getAttribute(attrsD, 'domain')
         cats = []
-        if self.initem:
-            cats = self.items[-1].setdefault('categories', [])
-        elif self.inchannel:
-            cats = self.channel.setdefault('categories', [])
+        if self.inentry:
+            cats = self.entries[-1].setdefault('categories', [])
+        elif self.infeed:
+            cats = self.feeddata.setdefault('categories', [])
         cats.append((domain, None))
     _start_dc_subject = _start_category
+    _start_keywords = _start_category
 
     def _end_category(self):
         self.pop('category')
     _end_dc_subject = _end_category
+    _end_keywords = _end_category
 
     def _start_cloud(self, attrsD):
-        self.channel['cloud'] = attrsD
+        self.feeddata['cloud'] = attrsD
 
     def _start_link(self, attrsD):
         attrsD.setdefault('rel', 'alternate')
         attrsD.setdefault('type', 'text/html')
         if attrsD.has_key('href'):
             attrsD['href'] = self.resolveURI(attrsD['href'])
-        expectingText = self.inchannel or self.initem
-        if self.initem:
-            self.items[-1].setdefault('links', [])
-            self.items[-1]['links'].append(attrsD)
-        elif self.inchannel:
-            self.channel.setdefault('links', [])
-            self.channel['links'].append(attrsD)
+        expectingText = self.infeed or self.inentry
+        if self.inentry:
+            self.entries[-1].setdefault('links', [])
+            self.entries[-1]['links'].append(attrsD)
+        elif self.infeed:
+            self.feeddata.setdefault('links', [])
+            self.feeddata['links'].append(attrsD)
         if attrsD.has_key('href'):
             expectingText = 0
             if attrsD.get('type', '') in self.html_types:
-                if self.initem:
-                    self.items[-1]['link'] = attrsD['href']
-                elif self.inchannel:
-                    self.channel['link'] = attrsD['href']
+                if self.inentry:
+                    self.entries[-1]['link'] = attrsD['href']
+                elif self.infeed:
+                    self.feeddata['link'] = attrsD['href']
         else:
             self.push('link', expectingText)
+    _start_producturl = _start_link
+
+    def _end_link(self):
+        value = self.pop('link')
+        if self.intextinput:
+            context = self._getContext()
+            context['textinput']['link'] = value
+    _end_producturl = _end_link
 
     def _start_guid(self, attrsD):
         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
@@ -852,42 +873,52 @@
     def _start_title(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
-        self.push('title', self.inchannel or self.initem)
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('title', self.infeed or self.inentry)
     _start_dc_title = _start_title
 
     def _end_title(self):
-        self.pop('title')
+        value = self.pop('title')
         self.incontent -= 1
         self.contentparams.clear()
+        if self.intextinput:
+            context = self._getContext()
+            context['textinput']['title'] = value
     _end_dc_title = _end_title
 
-    def _start_description(self, attrsD):
+    def _start_description(self, attrsD, default_content_type='text/html'):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
-                              'type': attrsD.get('type', 'text/html'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
+                              'type': attrsD.get('type', default_content_type),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
-        self.push('description', self.inchannel or self.initem)
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('description', self.infeed or self.inentry)
+
+    def _start_abstract(self, attrsD):
+        return self._start_description(attrsD, 'text/plain')
 
     def _end_description(self):
         value = self.pop('description')
-        if self.initem:
-            self.items[-1]['summary'] = value
-        elif self.inchannel:
-            self.channel['tagline'] = value
         self.incontent -= 1
         self.contentparams.clear()
-
+        context = self._getContext()
+        if self.intextinput:
+            context['textinput']['description'] = value
+        elif self.inentry:
+            context['summary'] = value
+        elif self.infeed:
+            context['tagline'] = value
+    _end_abstract = _end_description
+
     def _start_info(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('info', 1)
 
     def _end_info(self):
@@ -897,13 +928,15 @@
     def _start_generator(self, attrsD):
         if attrsD:
-            self.channel['generator_detail'] = attrsD
+            if attrsD.has_key('url'):
+                attrsD['url'] = self.resolveURI(attrsD['url'])
+            self.feeddata['generator_detail'] = attrsD
         self.push('generator', 1)
 
     def _end_generator(self):
         value = self.pop('generator')
-        if self.channel.has_key('generator_detail'):
-            self.channel['generator_detail']['name'] = value
+        if self.feeddata.has_key('generator_detail'):
+            self.feeddata['generator_detail']['name'] = value
 
     def _start_admin_generatoragent(self, attrsD):
         self.push('generator', 1)
@@ -921,27 +954,27 @@
     def _start_summary(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('summary', 1)
 
     def _end_summary(self):
         value = self.pop('summary')
-        if self.items:
-            self.items[-1]['description'] = value
+        if self.entries:
+            self.entries[-1]['description'] = value
         self.incontent -= 1
         self.contentparams.clear()
 
     def _start_enclosure(self, attrsD):
-        if self.initem:
-            self.items[-1].setdefault('enclosures', [])
-            self.items[-1]['enclosures'].append(attrsD)
+        if self.inentry:
+            self.entries[-1].setdefault('enclosures', [])
+            self.entries[-1]['enclosures'].append(attrsD)
 
     def _start_source(self, attrsD):
-        if self.initem:
-            self.items[-1]['source'] = attrsD
+        if self.inentry:
+            self.entries[-1]['source'] = attrsD
         self.push('source', 1)
 
     def _end_source(self):
@@ -949,27 +982,35 @@
     def _start_content(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('content', 1)
+
+    def _start_prodlink(self, attrsD):
+        self.incontent += 1
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
+                              'type': attrsD.get('type', 'text/html'),
+                              'language': attrsD.get('xml:lang', self.lang),
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
 
     def _start_body(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': 'xml',
+        self.contentparams = FeedParserDict({'mode': 'xml',
                               'type': 'application/xhtml+xml',
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
     _start_xhtml_body = _start_body
 
     def _start_content_encoded(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': 'escaped',
+        self.contentparams = FeedParserDict({'mode': 'escaped',
                               'type': 'text/html',
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
     _start_fullitem = _start_content_encoded
 
@@ -983,13 +1024,14 @@
     _end_xhtml_body = _end_content
     _end_content_encoded = _end_content
     _end_fullitem = _end_content
+    _end_prodlink = _end_content
 
 if _XML_AVAILABLE:
-    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler):
-        def __init__(self, baseuri):
+    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler):
+        def __init__(self, baseuri, encoding):
             if _debug: sys.stderr.write('trying StrictFeedParser\n')
             xml.sax.handler.ContentHandler.__init__(self)
-            _FeedParserMixin.__init__(self, baseuri)
+            _FeedParserMixin.__init__(self, baseuri, encoding)
             self.bozo = 0
             self.exc = None
 
         def startElementNS(self, name, qname, attrs):
             namespace, localname = name
-            namespace = str(namespace)
-            prefix = self.namespaces.get(namespace, '')
+            namespace = str(namespace or '')
+            if namespace.find('backend.userland.com/rss') <> -1:
+                # match any backend.userland.com namespace
+                namespace = 'http://backend.userland.com/rss'
+            prefix = self.namespaces.get(namespace, 'unknown')
             if prefix:
                 localname = prefix + ':' + localname
             localname = str(localname).lower()
@@ -1036,28 +1081,35 @@
             localname = str(localname).lower()
             self.unknown_endtag(localname)
 
-        def fatalError(self, exc):
+        def error(self, exc):
             self.bozo = 1
             self.exc = exc
-        error = fatalError
-
-class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
-    def __init__(self, baseuri):
-        sgmllib.SGMLParser.__init__(self)
-        _FeedParserMixin.__init__(self, baseuri)
+
+        def fatalError(self, exc):
+            self.error(exc)
+            raise exc
 
 class _BaseHTMLProcessor(sgmllib.SGMLParser):
     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
       'img', 'input', 'isindex', 'link', 'meta', 'param']
 
-    def __init__(self):
+    def __init__(self, encoding):
+        self.encoding = encoding
         sgmllib.SGMLParser.__init__(self)
 
     def reset(self):
-        # extend (called by sgmllib.SGMLParser.__init__)
         self.pieces = []
         sgmllib.SGMLParser.reset(self)
 
+    def feed(self, data):
+        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
+        data = data.replace('&#39;', "'")
+        data = data.replace('&#34;', '"')
+        if type(data) == types.UnicodeType:
+            data = data.encode(self.encoding)
+        sgmllib.SGMLParser.feed(self, data)
+
     def normalize_attrs(self, attrs):
         # utility method to be called by descendants
         attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
@@ -1068,6 +1120,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each start tag
         # attrs is a list of (attr, value) tuples
         # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
         strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
         if tag in self.elements_no_end_tag:
             self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
@@ -1094,6 +1147,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
         self.pieces.append(text)
         
     def handle_comment(self, text):
@@ -1113,9 +1167,37 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # Reconstruct original DOCTYPE
        self.pieces.append("<!%(text)s>" % locals())
         
+    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+    def _scan_name(self, i, declstartpos):
+        rawdata = self.rawdata
+        if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata))
+        n = len(rawdata)
+        if i == n:
+            return None, -1
+        m = self._new_declname_match(rawdata, i)
+        if m:
+            s = m.group()
+            name = s.strip()
+            if (i + len(s)) == n:
+                return None, -1  # end of buffer
+            return name.lower(), m.end()
+        else:
+            self.handle_data(rawdata)
+#            self.updatepos(declstartpos, i)
+            return None, -1
+
     def output(self):
         """Return processed HTML as a single string"""
-        return "".join(self.pieces)
+        if _debug:
+            for p in self.pieces:
+                sys.stderr.write(p)
+            sys.stderr.write('\n')
+        return "".join([str(p) for p in self.pieces])
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+    def __init__(self, baseuri, encoding):
+        sgmllib.SGMLParser.__init__(self)
+        _FeedParserMixin.__init__(self, baseuri, encoding)
 
 class _RelativeURIResolver(_BaseHTMLProcessor):
     relative_uris = [('a', 'href'),
@@ -1144,8 +1226,8 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
                      ('q', 'cite'),
                      ('script', 'src')]
 
-    def __init__(self, baseuri):
-        _BaseHTMLProcessor.__init__(self)
+    def __init__(self, baseuri, encoding):
+        _BaseHTMLProcessor.__init__(self, encoding)
         self.baseuri = baseuri
 
     def resolveURI(self, uri):
@@ -1156,8 +1238,10 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
         
-def _resolveRelativeURIs(htmlSource, baseURI):
-    p = _RelativeURIResolver(baseURI)
+def _resolveRelativeURIs(htmlSource, baseURI, encoding):
+    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
+    p = _RelativeURIResolver(baseURI, encoding)
+    if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n')
     p.feed(htmlSource)
     return p.output()
 
@@ -1214,11 +1298,11 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
         if not self.unacceptablestack:
             _BaseHTMLProcessor.handle_data(self, text)
 
-def _sanitizeHTML(htmlSource):
-    p = _HTMLSanitizer()
+def _sanitizeHTML(htmlSource, encoding):
+    p = _HTMLSanitizer(encoding)
     p.feed(htmlSource)
     data = p.output()
-    if _mxtidy:
+    if _mxtidy and TIDY_MARKUP:
         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        if data.count('<body'):
            data = data.split('<body', 1)[1]
+def _getCharacterEncoding(http_headers, xml_data):
+    xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+    xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or ''
+    if (http_content_type == 'application/xml') or \
+       (http_content_type == 'application/xml-dtd') or \
+       (http_content_type == 'application/xml-external-parsed-entity') or \
+       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+        if http_encoding:
+            true_encoding = http_encoding
+        elif xml_encoding:
+            true_encoding = xml_encoding
+        else:
+            true_encoding = 'utf-8'
+    elif (http_content_type == 'text/xml') or \
+         (http_content_type == 'text/xml-external-parsed-entity') or \
+         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
+        if http_encoding:
+            true_encoding = http_encoding
+        else:
+            true_encoding = 'us-ascii'
+    else:
+        true_encoding = xml_encoding or 'utf-8'
+    return true_encoding, http_encoding, xml_encoding
+    
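
The branches above encode the RFC 3023 rules: application/*xml media types trust the HTTP charset first and the XML declaration second; text/*xml types trust only the HTTP charset and otherwise default to us-ascii, deliberately ignoring the XML declaration; everything else falls back to the declaration or UTF-8. Illustrative calls (hypothetical inputs, and assuming the header-parsing lines lost above populate http_encoding from the charset parameter):

    # text/xml without a charset: the XML declaration is ignored per RFC 3023
    _getCharacterEncoding({'content-type': 'text/xml'},
                          "<?xml version='1.0' encoding='utf-8'?><feed/>")
    # true_encoding -> 'us-ascii'

    # application/atom+xml without a charset: the XML declaration wins
    _getCharacterEncoding({'content-type': 'application/atom+xml'},
                          "<?xml version='1.0' encoding='iso-8859-1'?><feed/>")
    # true_encoding -> 'iso-8859-1'
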
+def _changeEncodingDeclaration(data, encoding):
+    """Changes an XML data stream on the fly to specify a new encoding
+
+    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
+    encoding is a string recognized by encodings.aliases
+    """
+    if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n')
+    if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding)
+    #import cjkcodecs.aliases
+    #import japanese
+    data = unicode(data, encoding)
+    declmatch = re.compile(u'^<\?xml[^>]*?>')
+    newdecl = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
+    if declmatch.search(data):
+        data = declmatch.sub(newdecl, data)
+    else:
+        data = newdecl + u'\n' + data
+    return data.encode(encoding)
+
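
Given the declaration template reconstructed above (the % encoding substitution survives in the original), _changeEncodingDeclaration either rewrites an existing declaration or prepends one; roughly:

    # existing declaration rewritten in place (hypothetical input)
    _changeEncodingDeclaration("<?xml version='1.0' encoding='us-ascii'?><rss/>", 'utf-8')
    # -> "<?xml version='1.0' encoding='utf-8'?><rss/>"

    # a document with no declaration gets one prepended
    _changeEncodingDeclaration('<rss/>', 'utf-8')
    # -> "<?xml version='1.0' encoding='utf-8'?>\n<rss/>"
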
+def _stripDoctype(data):
+    """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+
+    rss_version may be "rss091n" or None
+    stripped_data is the same XML document, minus the DOCTYPE
+    """
+    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+    doctype_results = doctype_pattern.findall(data)
+    doctype = doctype_results and doctype_results[0] or ''
+    if doctype.lower().count('netscape'):
+        version = 'rss091n'
+    else:
+        version = None
+    data = doctype_pattern.sub('', data)
+    return version, data
+    
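
With the DOCTYPE pattern restored above, version sniffing reduces to a substring test on the DOCTYPE body; for example:

    version, data = _stripDoctype(
        '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN">'
        '<rss/>')
    # version == 'rss091n'; data == '<rss/>'
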
 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
     """Parse a feed from a URL, file, stream, or string"""
-    result = {}
+    result = FeedParserDict()
     f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
     data = f.read()
     if hasattr(f, "headers"):
@@ -1579,35 +1784,71 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         result["status"] = f.status
     if hasattr(f, "headers"):
         result["headers"] = f.headers.dict
-    # get the xml encoding
-    xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
-    match = xmlheaderRe.match(data)
-    if match:
-        result["encoding"] = match.groups()[0].lower()
     f.close()
-    result['channel'] = {}
-    result['items'] = {}
+    if result.get("status", 0) == 304:
+        result['feed'] = FeedParserDict()
+        result['entries'] = []
+        result['debug_message'] = "The feed has not changed since you last checked, so the server sent no data.  This is a feature, not a bug!"
+        return result
+    result['encoding'], http_encoding, xml_encoding = _getCharacterEncoding(result.get("headers", {}), data)
+    result['version'], data = _stripDoctype(data)
     baseuri = result.get('headers', {}).get('content-location', result.get('url'))
     # try true XML parser first
-    if _XML_AVAILABLE:
+    if not _XML_AVAILABLE:
+        if _debug: sys.stderr.write('no xml libraries available\n')
+    use_strict_parser = _XML_AVAILABLE
+    if use_strict_parser:
         if _debug: sys.stderr.write('using xml library\n')
         result['bozo'] = 0
-        feedparser = _StrictFeedParser(baseuri)
-        if re.search(r'<!DOCTYPE\s+rss\s+PUBLIC\s+"-//Netscape Communications//DTD RSS 0.91//EN"', data):
-            feedparser.version = 'rss091n'
-        source = xml.sax.xmlreader.InputSource()
-        source.setByteStream(_StringIO(data))
-        saxparser = xml.sax.make_parser()#["drv_libxml2"])
+        feedparser = _StrictFeedParser(baseuri, result['encoding'])
+        if _debug and _debug_never_use_libxml2:
+            sys.stderr.write('not using libxml2 (even if available)\n')
+            additional_parsers = []
+        else:
+            additional_parsers = ["drv_libxml2"]
+        saxparser = xml.sax.make_parser(additional_parsers)
         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
         saxparser.setContentHandler(feedparser)
         saxparser.setErrorHandler(feedparser)
         try:
             saxparser.setDTDHandler(feedparser)
+        except xml.sax.SAXNotSupportedException:
+            # libxml2 driver does not support DTDHandler
+            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler (not a big deal)\n')
+        try:
             saxparser.setEntityResolver(feedparser)
         except xml.sax.SAXNotSupportedException:
-            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n')
-            # libxml2 driver does not currently support DTDHandler or EntityResolver
-            pass
+            # libxml2 driver does not support EntityResolver
+            if _debug: sys.stderr.write('using an xml library that does not support EntityResolver (not a big deal)\n')
+        encoding_set = (result['encoding'] == xml_encoding)
+        if not encoding_set:
+            bozo_exception = None
+            proposed_encodings = [result['encoding'], xml_encoding, 'utf-8', 'iso-8859-1', 'windows-1252']
+            tried_encodings = []
+            for proposed_encoding in proposed_encodings:
+                if proposed_encodings in tried_encodings: continue
+                tried_encodings.append(proposed_encoding)
+                try:
+                    data = _changeEncodingDeclaration(data, proposed_encoding)
+                except Exception, bozo_exception:
+                    if _debug: sys.stderr.write('character encoding is wrong\n')
+                else:
+                    if proposed_encoding != result['encoding']:
+                        try:
+                            raise CharacterEncodingOverride, "document declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding)
+                        except CharacterEncodingOverride, bozo_exception:
+                            result['bozo'] = 1
+                            result['bozo_exception'] = bozo_exception
+                    result['encoding'] = proposed_encoding
+                    encoding_set = 1
+                    break
+        if not encoding_set:
+            result['bozo'] = 1
+            result['bozo_exception'] = bozo_exception
+            use_strict_parser = 0
+    if use_strict_parser:
+        source = xml.sax.xmlreader.InputSource()
+        source.setByteStream(_StringIO(data))
         if hasattr(saxparser, '_ns_stack'):
             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
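
The retry loop above downgrades a wrong declared encoding from a hard failure to a flagged success: the document is re-encoded under a list of fallbacks, and if one parses, bozo is set with bozo_exception holding a CharacterEncodingOverride. Caller-side handling might look like:

    d = feedparser.parse('http://example.org/misdeclared.xml')  # hypothetical
    if d.get('bozo'):
        exc = d.get('bozo_exception')
        # a CharacterEncodingOverride means the feed parsed cleanly under a
        # fallback; d['encoding'] records the encoding actually used
        print exc
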
@@ -1615,45 +1856,29 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         try:
             saxparser.parse(source)
         except Exception, e:
-            # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
-            # which captures them.  For some reason, some Unicode-related errors go
-            # uncaught on some combination of platform, XML library, Python version,
-            # and phase of the moon.
+            if _debug: sys.stderr.write('xml parsing failed\n')
             feedparser.bozo = 1
-            feedparser.bozo_exception = e
+            feedparser.bozo_exception = feedparser.exc or e
         if feedparser.bozo:
             # feed is not well-formed XML, fall back on regex-based parser
-            if _debug: sys.stderr.write('xml parsing failed, using regexes.  now you have two problems...\n')
             result['bozo'] = 1
-            result['bozo_exception'] = feedparser.exc
-            # munge short tags, e.g. <link/> becomes <link></link>
-            data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
-            feedparser = _LooseFeedParser(baseuri)
-            feedparser.feed(data)
-    else:
-        if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
-        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
-        feedparser = _LooseFeedParser(baseuri)
+            result['bozo_exception'] = feedparser.bozo_exception
+            use_strict_parser = 0
+    if not use_strict_parser:
+        if _debug: sys.stderr.write('using regexes, now you have two problems\n')
+        feedparser = _LooseFeedParser(baseuri, result['encoding'])
         feedparser.feed(data)
-    result['channel'] = feedparser.channel
-    result['items'] = feedparser.items
-    result['version'] = feedparser.version
+    result['feed'] = feedparser.feeddata
+    result['entries'] = feedparser.entries
+    result['version'] = result['version'] or feedparser.version
     return result
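
Taken together, parse() now hands back a FeedParserDict keyed on 'feed' and 'entries' (with the legacy spellings still honoured), plus the RFC 3023 character encoding and a version string covering the newly supported formats; a quick orientation sketch (URL hypothetical):

    import feedparser

    d = feedparser.parse('http://example.org/index.rss')
    print d['version']      # one of SUPPORTED_VERSIONS, now incl. 'cdf'/'hotrss'
    print d['encoding']     # determined per RFC 3023, no longer regex-sniffed
    for entry in d.entries: # 'entries' supersedes 'items'
        print entry.get('title')
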
 
-_TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
-              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
-
 if __name__ == '__main__':
-    if sys.argv[1:]:
-        urls = sys.argv[1:]
+    if not sys.argv[1:]:
+        print __doc__
+        sys.exit(0)
     else:
-        urls = _TEST_SUITE
+        urls = sys.argv[1:]
     from pprint import pprint
     for url in urls:
         print url
@@ -1664,14 +1889,6 @@ if __name__ == '__main__':
 
 #TODO
 #- image
-#- textinput/textInput
-#- comments
-#
-#encoding notes:
-#- RFC 3023
-#- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
-#- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
-#- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
 #
 #REVISION HISTORY
 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
@@ -1751,20 +1968,62 @@ if __name__ == '__main__':
 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
 #  blogspot.com sites); added _debug variable
 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0 - MAP - parse entire feed with real XML parser (if available); added several
-#  new supported namespaces; fixed bug tracking naked markup in description;
-#  added support for enclosure; added support for source; re-added support for
-#  cloud which got dropped somehow; added support for expirationDate; fixed
-#  xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
-#  documents that don't define one explicitly and one for documents that define
-#  an outer and an inner xml:base that goes out of scope before the end of the
-#  document; fixed bug parsing multiple links at feed level; added feed type and
-#  version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
-#  or empty string if unrecognized; added support for creativeCommons:license and
-#  cc:license; added support for full Atom content model in title, tagline, info,
-#  copyright, summary; fixed bug with gzip encoding (not always telling server
-#  we support it when we do); support Atom-style author element in author_detail
+#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+#  added several new supported namespaces; fixed bug tracking naked markup in
+#  description; added support for enclosure; added support for source; re-added
+#  support for cloud which got dropped somehow; added support for expirationDate
+#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+#  xml:base URI, one for documents that don't define one explicitly and one for
+#  documents that define an outer and an inner xml:base that goes out of scope
+#  before the end of the document
+#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
+#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
+#  added support for creativeCommons:license and cc:license; added support for
+#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
+#  with gzip encoding (not always telling server we support it when we do)
+#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
 #  (dictionary of "name", "url", "email"); map author to author_detail if author
-#  contains name + email address; better handling of empty HTML tags (br, hr, img,
-#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
-#  fixed CDATA handling in non-wellformed feeds under Python 2.1
+#  contains name + email address
+#3.0b8 - 1/28/2004 - MAP - added support for contributor
+#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+#  support for summary
+#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+#  xml.util.iso8601
+#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+#  dangerous markup; fiddled with decodeEntities (not right); liberalized
+#  date parsing even further
+#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+#  added support to Atom 0.2 subtitle; added support for Atom content model
+#  in copyright; better sanitizing of dangerous HTML elements with end tags
+#  (script, frameset)
+#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
+#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+#  Python 2.1
+#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+#  fixed bug capturing author and contributor URL; fixed bug resolving relative
+#  links in author and contributor URL; fixed bug resolvin relative links in
+#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
+#  namespace tests, and included them permanently in the test suite with his
+#  permission; fixed namespace handling under Python 2.1
+#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+#  use libxml2 (if available)
+#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
+#  workaround crash in PyXML/expat when encountering invalid entities
+#  (MarkMoraes); support for textinput/textInput
+#3.0b20 - 4/7/2004 - MAP - added CDF support
+#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
+#  results dict; changed results dict to allow getting values with results.key
+#  as well as results[key]; work around embedded illformed HTML with half
+#  a DOCTYPE; work around malformed Content-Type header; if character encoding
+#  is wrong, try several common ones before falling back to regexes (if this
+#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
+#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
+#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
+#  convert each value in results to Unicode (if possible), even if using
+#  regex-based parsing
diff --git a/feedlist b/feedlist
index edd69a5..a7e3373 100644
--- a/feedlist
+++ b/feedlist
@@ -4,10 +4,8 @@
 # name	url
 #
 Davyd Madeley	http://www.livejournal.com/users/davyd/data/rss
-# This feed is broken for caching, also slow
 Ian McKellar	http://www.livejournal.com/users/loic/data/rss
-# Ian McKellar	http://ian.mckellar.org/wp-rss2.php
-# Grahame Bowland	http://www.livejournal.com/users/grahame/data/rss
+Grahame Bowland	http://www.advogato.org/person/gbowland/rss.xml
 Adam Wright	http://www.livejournal.com/users/hipikat/data/rss
 Adrian Chadd	http://blog.cacheboy.net/blogs/cacheboy/index.rdf
 Trent Lloyd	http://www.livejournal.com/users/lathiat/data/rss
@@ -25,3 +23,19 @@ Aaron Alderman	http://www.livejournal.com/users/palaceboy/data/rss
 Brad Wake	http://www.livejournal.com/users/thebmw/data/rss
 Paul Marinceu	http://www.advogato.org/person/elixxir/rss.xml
 David Thackaberry	http://www.livejournal.com/users/tryce/data/rss
+Rhys Bevilaqua	http://www.livejournal.com/users/norp/data/rss
+Colm Kiely	http://www.livejournal.com/users/col_ki/data/rss
+Ben Murrihy	http://www.livejournal.com/users/benmurrihy/data/rss
+Davis Griffin	http://www.livejournal.com/users/c_avdas/data/rss
+Ewan MacLeod	http://www.livejournal.com/users/drayke_/data/rss
+Rob Slaughter	http://www.livejournal.com/users/robthesilent/data/rss
+Alex Dawson	http://www.livejournal.com/users/theducks/data/rss
+Tracey Brown	http://www.livejournal.com/users/tazaria/data/rss
+Lionel Pryce	http://www.livejournal.com/users/jetblackvalias/data/rss
+Carlo Andreacchio	http://www.livejournal.com/users/the_icon_of_sin/data/rss
+Rohan Joyce	http://www.livejournal.com/users/booto/data/rss
+Greg Cresp	http://www.livejournal.com/users/the_riviera_kid/data/rss
+Adrian Woodley	http://www.diskworld.com.au/blog/adrian/index.rss
+Chris Harris	http://www.diskworld.com.au/blog/chris/index.rss
+Chris Grubb	http://www.livejournal.com/users/maelstrm/data/rss
+Michael Grubb	http://www.livejournal.com/users/grubbmr/data/rss
diff --git a/update-planet b/update-planet
index 4e23e5b..0de1c28 100755
--- a/update-planet
+++ b/update-planet
@@ -38,8 +38,10 @@ for feed in feeds:
         blog.feedURL = feed[1]
         blogs.append(blog)
         # check the old copy of the cache, vs the new copy
-        if not feed[2] or not feed[2].cache or not blog or not blog.cache or feed[2].cache != blog.cache:
+        if not feed[2] or not feed[2].cache or not blog.cache or feed[2].cache != blog.cache:
             tainted = True
+        elif len(blog.items) > 0 and len(feed[2].items) > 0 and (blog.items[0].itemTitle != feed[2].items[0].itemTitle or blog.items[0].contents != feed[2].items[0].contents):
+            tainted = True
         # write the cache back down to disk
         cache.storeBlog(blog)
     else:
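
The update-planet hunk widens the taint test: a feed is also treated as changed when its newest item differs from the cached newest item, which is what catches blogs whose cache headers are unreliable (the "cacheability hacks" in the Changelog). The same check in isolation, a sketch assuming Blog objects whose items carry itemTitle and contents as in XMLParse2:

    def is_tainted(old, new):
        # old is the previously cached Blog, new the freshly parsed one
        if not old or not old.cache or not new.cache or old.cache != new.cache:
            return True
        if new.items and old.items and (
                new.items[0].itemTitle != old.items[0].itemTitle or
                new.items[0].contents != old.items[0].contents):
            return True
        return False
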