X-Git-Url: https://git.ucc.asn.au/?a=blobdiff_plain;f=extra%2Ffeedparser.py;h=d101dd9fc1567770b794c28b7c2328bdaa706b58;hb=c907f1dfe8becd4106a1544366d58b027ac89d2a;hp=4c4afd9f984dc70bfed8eae1a22c3e2ca6d18788;hpb=1053c3e285585dfa0116c185525448ae1e07d0c0;p=planet-ucc.git diff --git a/extra/feedparser.py b/extra/feedparser.py index 4c4afd9..d101dd9 100644 --- a/extra/feedparser.py +++ b/extra/feedparser.py @@ -1,37 +1,50 @@ #!/usr/bin/env python """Universal feed parser -Visit http://diveintomark.org/projects/feed_parser/ for the latest version - Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds +Visit http://feedparser.org/ for the latest version +Visit http://feedparser.org/docs/ for the latest documentation + Required: Python 2.1 or later Recommended: Python 2.3 or later -Recommended: libxml2 +Recommended: CJKCodecs and iconv_codec """ -__version__ = "3.0-beta-22" -__author__ = "Mark Pilgrim " +#__version__ = "pre-3.3-" + "$Revision: 1.3 $"[11:15] + "-cvs" +__version__ = "3.3" +__license__ = "Python" __copyright__ = "Copyright 2002-4, Mark Pilgrim" +__author__ = "Mark Pilgrim " __contributors__ = ["Jason Diamond ", "John Beimler ", - "Fazal Majid "] -__license__ = "Python" + "Fazal Majid ", + "Aaron Swartz "] _debug = 0 -_debug_never_use_libxml2 = 0 -# if you are embedding feedparser in a larger application, you should change this to your application name and URL -USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "") +# HTTP "User-Agent" header to send to servers when downloading feeds. +# If you are embedding feedparser in a larger application, you should +# change this to your application name and URL. +USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ + +# HTTP "Accept" header to send to servers when downloading feeds. If you don't +# want to send an Accept header, set this to None. +ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" -# If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1. -# This is off by default because of reports of crashing on some platforms. If it crashes -# for you, please submit a bug report with your OS platform, Python version, and the URL -# of the feed you were attempting to parse. +# List of preferred XML parsers, by SAX driver name. These will be tried first, +# but if they're not installed, Python will keep searching through its own list +# of pre-installed parsers until it finds one that supports everything we need. +PREFERRED_XML_PARSERS = ["drv_libxml2"] + +# If you want feedparser to automatically run HTML markup through HTML Tidy, set +# this to 1. This is off by default because of reports of crashing on some +# platforms. If it crashes for you, please submit a bug report with your OS +# platform, Python version, and the URL of the feed you were attempting to parse. # Requires mxTidy TIDY_MARKUP = 0 # ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, rfc822, types +import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi try: from cStringIO import StringIO as _StringIO except: @@ -44,6 +57,10 @@ try: import gzip except: gzip = None +try: + import zlib +except: + zlib = None # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers. 
# Python 2.3 now has this functionality available in the standard socket library, so under @@ -51,12 +68,12 @@ except: # module is buggy and timeoutsocket is better. try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py - timeoutsocket.setDefaultSocketTimeout(10) + timeoutsocket.setDefaultSocketTimeout(20) except ImportError: import socket if hasattr(socket, 'setdefaulttimeout'): - socket.setdefaulttimeout(10) -import urllib2 + socket.setdefaulttimeout(20) +import urllib, urllib2 _mxtidy = None if TIDY_MARKUP: @@ -65,19 +82,14 @@ if TIDY_MARKUP: except: pass -# If a real XML parser is available, feedparser will attempt to use it. feedparser works -# with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python -# distribution does not come with an XML parser (such as Mac OS X 10.2 and some versions of -# FreeBSD), feedparser will just fall back on regex-based parsing. If XML libraries are -# available but the feed turns out not to be well-formed XML, feedparser will fall back -# on regex-based parsing and set the "bozo" bit in the results to indicate that the feed -# author is a bozo who can't generate well-formed XML. The two advantages of using a real -# XML parser are (1) Unicode support, and (2) to get people to stop yelling at me for not -# using one. +# If a real XML parser is available, feedparser will attempt to use it. feedparser has +# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the +# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some +# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. try: import xml.sax + xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers from xml.sax.saxutils import escape as _xmlescape - class CharacterEncodingOverride(xml.sax.SAXException): pass _XML_AVAILABLE = 1 except: _XML_AVAILABLE = 0 @@ -92,10 +104,26 @@ try: import base64, binascii except: base64 = binascii = None - + +# cjkcodecs and iconv_codec provide support for more character encodings. 
+# Both are available from http://cjkpython.i18n.org/ +try: + import cjkcodecs.aliases +except: + pass +try: + import iconv_codec +except: + pass + # ---------- don't touch these ---------- +class CharacterEncodingOverride(Exception): pass +class CharacterEncodingUnknown(Exception): pass +class NonXMLContentType(Exception): pass + sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') sgmllib.special = re.compile(' -1: @@ -387,22 +499,13 @@ class _FeedParserMixin: return urlparse.urljoin(self.baseuri or '', uri) def decodeEntities(self, element, data): - if self.contentparams.get('mode') == 'escaped': - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace(''', "'") return data - + def push(self, element, expectingText): -# while self.elementstack and self.elementstack[-1][1]: -# self.pop(self.elementstack[-1][0]) self.elementstack.append([element, expectingText, []]) def pop(self, element): if not self.elementstack: return -# while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0]) if self.elementstack[-1][0] != element: return element, expectingText, pieces = self.elementstack.pop() @@ -427,14 +530,16 @@ class _FeedParserMixin: output = self.decodeEntities(element, output) # resolve relative URIs within embedded markup - if element in self.can_contain_relative_uris: - output = _resolveRelativeURIs(output, self.baseuri, self.encoding) + if self.contentparams.get('type', 'text/html') in self.html_types: + if element in self.can_contain_relative_uris: + output = _resolveRelativeURIs(output, self.baseuri, self.encoding) # sanitize embedded markup - if element in self.can_contain_dangerous_markup: - output = _sanitizeHTML(output, self.encoding) + if self.contentparams.get('type', 'text/html') in self.html_types: + if element in self.can_contain_dangerous_markup: + output = _sanitizeHTML(output, self.encoding) - if type(output) == types.StringType: + if self.encoding and (type(output) == types.StringType): try: output = unicode(output, self.encoding) except: @@ -458,14 +563,16 @@ class _FeedParserMixin: if output: self.entries[-1]['links'][-1]['href'] = output else: + if element == 'description': + element = 'summary' self.entries[-1][element] = output if self.incontent: - if element == 'description': - element = 'summary' contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.entries[-1][element + '_detail'] = contentparams elif self.infeed and (not self.intextinput) and (not self.inimage): + if element == 'description': + element = 'tagline' self.feeddata[element] = output if element == 'category': domain = self.feeddata['categories'][-1][0] @@ -473,8 +580,6 @@ class _FeedParserMixin: elif element == 'link': self.feeddata['links'][-1]['href'] = output elif self.incontent: - if element == 'description': - element = 'tagline' contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.feeddata[element + '_detail'] = contentparams @@ -493,11 +598,10 @@ class _FeedParserMixin: return attrsD.get(self._mapToStandardPrefix(name)) def _save(self, key, value): - if value: - if self.inentry: - self.entries[-1].setdefault(key, value) - elif self.feeddata: - self.feeddata.setdefault(key, value) + if self.inentry: + self.entries[-1].setdefault(key, value) + elif self.feeddata: + self.feeddata.setdefault(key, value) def _start_rss(self, attrsD): versionmap = {'0.91': 'rss091u', @@ -524,7 +628,6 @@ class _FeedParserMixin: def 
_cdf_common(self, attrsD): if attrsD.has_key('lastmod'): - if _debug: sys.stderr.write(attrsD['lastmod'] + '\n') self._start_modified({}) self.elementstack[-1][-1] = attrsD['lastmod'] self._end_modified() @@ -552,8 +655,12 @@ class _FeedParserMixin: def _start_image(self, attrsD): self.inimage = 1 + self.push('image', 0) + context = self._getContext() + context.setdefault('image', FeedParserDict()) def _end_image(self): + self.pop('image') self.inimage = 0 def _start_textinput(self, attrsD): @@ -607,6 +714,32 @@ class _FeedParserMixin: context = self._getContext() context['textinput']['name'] = value + def _start_width(self, attrsD): + self.push('width', 0) + + def _end_width(self): + value = self.pop('width') + try: + value = int(value) + except: + value = 0 + if self.inimage: + context = self._getContext() + context['image']['width'] = value + + def _start_height(self, attrsD): + self.push('height', 0) + + def _end_height(self): + value = self.pop('height') + try: + value = int(value) + except: + value = 0 + if self.inimage: + context = self._getContext() + context['image']['height'] = value + def _start_url(self, attrsD): self.push('url', 1) _start_homepage = _start_url @@ -619,11 +752,11 @@ class _FeedParserMixin: elif self.incontributor: self._save_contributor('url', value) elif self.inimage: - # TODO - pass + context = self._getContext() + context['image']['url'] = value elif self.intextinput: - # TODO (map to link) - pass + context = self._getContext() + context['textinput']['link'] = value _end_homepage = _end_url _end_uri = _end_url @@ -656,20 +789,20 @@ class _FeedParserMixin: context.setdefault('contributors', [FeedParserDict()]) context['contributors'][-1][key] = value - def _sync_author_detail(self): + def _sync_author_detail(self, key='author'): context = self._getContext() - detail = context.get('author_detail') + detail = context.get('%s_detail' % key) if detail: name = detail.get('name') email = detail.get('email') if name and email: - context['author'] = "%s (%s)" % (name, email) + context[key] = "%s (%s)" % (name, email) elif name: - context['author'] = name + context[key] = name elif email: - context['author'] = email + context[key] = email else: - author = context.get('author') + author = context.get(key) if not author: return emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author) if not emailmatch: return @@ -683,16 +816,16 @@ class _FeedParserMixin: if author and (author[-1] == ')'): author = author[:-1] author = author.strip() - context.setdefault('author_detail', FeedParserDict()) - context['author_detail']['name'] = author - context['author_detail']['email'] = email + context.setdefault('%s_detail' % key, FeedParserDict()) + context['%s_detail' % key]['name'] = author + context['%s_detail' % key]['email'] = email def _start_tagline(self, attrsD): self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('tagline', 1) _start_subtitle = _start_tagline @@ -708,8 +841,8 @@ class _FeedParserMixin: self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 
'language': self.lang, + 'base': self.baseuri}) self.push('copyright', 1) _start_dc_rights = _start_copyright @@ -723,6 +856,11 @@ class _FeedParserMixin: self.entries.append(FeedParserDict()) self.push('item', 0) self.inentry = 1 + self.guidislink = 0 + id = self._getAttribute(attrsD, 'rdf:about') + if id: + context = self._getContext() + context['id'] = id self._cdf_common(attrsD) _start_entry = _start_item _start_product = _start_item @@ -746,6 +884,7 @@ class _FeedParserMixin: def _end_dc_publisher(self): self.pop('publisher') + self._sync_author_detail('publisher') _end_webmaster = _end_dc_publisher def _start_dcterms_issued(self, attrsD): @@ -774,10 +913,7 @@ class _FeedParserMixin: def _end_dcterms_modified(self): value = self.pop('modified') - if _debug: sys.stderr.write('_end_dcterms_modified, value=' + value + '\n') parsed_value = _parse_date(value) - self._save('date', value) - self._save('date_parsed', parsed_value) self._save('modified_parsed', parsed_value) _end_modified = _end_dcterms_modified _end_dc_date = _end_dcterms_modified @@ -820,7 +956,7 @@ class _FeedParserMixin: _end_keywords = _end_category def _start_cloud(self, attrsD): - self.feeddata['cloud'] = attrsD + self.feeddata['cloud'] = FeedParserDict(attrsD) def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') @@ -830,10 +966,10 @@ class _FeedParserMixin: expectingText = self.infeed or self.inentry if self.inentry: self.entries[-1].setdefault('links', []) - self.entries[-1]['links'].append(attrsD) + self.entries[-1]['links'].append(FeedParserDict(attrsD)) elif self.infeed: self.feeddata.setdefault('links', []) - self.feeddata['links'].append(attrsD) + self.feeddata['links'].append(FeedParserDict(attrsD)) if attrsD.has_key('href'): expectingText = 0 if attrsD.get('type', '') in self.html_types: @@ -850,15 +986,18 @@ class _FeedParserMixin: if self.intextinput: context = self._getContext() context['textinput']['link'] = value + if self.inimage: + context = self._getContext() + context['image']['link'] = value _end_producturl = _end_link def _start_guid(self, attrsD): self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') - self.push('guid', 1) + self.push('id', 1) def _end_guid(self): - value = self.pop('guid') - self._save('id', value) + value = self.pop('id') + self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) if self.guidislink: # guid acts as link, but only if "ispermalink" is not present or is "true", # and only if the item doesn't already have a link element @@ -869,14 +1008,15 @@ class _FeedParserMixin: def _end_id(self): value = self.pop('id') - self._save('guid', value) def _start_title(self, attrsD): self.incontent += 1 + if _debug: sys.stderr.write('attrsD.xml:lang = %s\n' % attrsD.get('xml:lang')) + if _debug: sys.stderr.write('self.lang = %s\n' % self.lang) self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('title', self.infeed or self.inentry) _start_dc_title = _start_title @@ -887,14 +1027,17 @@ class _FeedParserMixin: if self.intextinput: context = self._getContext() context['textinput']['title'] = value + elif self.inimage: + context = self._getContext() + context['image']['title'] = value _end_dc_title = _end_title def _start_description(self, attrsD, default_content_type='text/html'): self.incontent += 1 
self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', default_content_type), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('description', self.infeed or self.inentry) def _start_abstract(self, attrsD): @@ -907,18 +1050,20 @@ class _FeedParserMixin: context = self._getContext() if self.intextinput: context['textinput']['description'] = value - elif self.inentry: - context['summary'] = value - elif self.infeed: - context['tagline'] = value + elif self.inimage: + context['image']['description'] = value +# elif self.inentry: +# context['summary'] = value +# elif self.infeed: +# context['tagline'] = value _end_abstract = _end_description def _start_info(self, attrsD): self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('info', 1) def _end_info(self): @@ -930,7 +1075,7 @@ class _FeedParserMixin: if attrsD: if attrsD.has_key('url'): attrsD['url'] = self.resolveURI(attrsD['url']) - self.feeddata['generator_detail'] = attrsD + self.feeddata['generator_detail'] = FeedParserDict(attrsD) self.push('generator', 1) def _end_generator(self): @@ -944,6 +1089,7 @@ class _FeedParserMixin: if value: self.elementstack[-1][2].append(value) self.pop('generator') + self.feeddata['generator_detail'] = FeedParserDict({"url": value}) def _start_admin_errorreportsto(self, attrsD): self.push('errorreportsto', 1) @@ -956,8 +1102,8 @@ class _FeedParserMixin: self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('summary', 1) def _end_summary(self): @@ -970,11 +1116,11 @@ class _FeedParserMixin: def _start_enclosure(self, attrsD): if self.inentry: self.entries[-1].setdefault('enclosures', []) - self.entries[-1]['enclosures'].append(attrsD) + self.entries[-1]['enclosures'].append(FeedParserDict(attrsD)) def _start_source(self, attrsD): if self.inentry: - self.entries[-1]['source'] = attrsD + self.entries[-1]['source'] = FeedParserDict(attrsD) self.push('source', 1) def _end_source(self): @@ -984,24 +1130,24 @@ class _FeedParserMixin: self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'), 'type': attrsD.get('type', 'text/plain'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('content', 1) def _start_prodlink(self, attrsD): self.incontent += 1 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'), 'type': attrsD.get('type', 'text/html'), - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('content', 1) def _start_body(self, attrsD): self.incontent += 1 self.contentparams = FeedParserDict({'mode': 'xml', 'type': 'application/xhtml+xml', - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('content', 1) _start_xhtml_body 
= _start_body @@ -1009,8 +1155,8 @@ class _FeedParserMixin: self.incontent += 1 self.contentparams = FeedParserDict({'mode': 'escaped', 'type': 'text/html', - 'language': attrsD.get('xml:lang', self.lang), - 'base': attrsD.get('xml:base', self.baseuri)}) + 'language': self.lang, + 'base': self.baseuri}) self.push('content', 1) _start_fullitem = _start_content_encoded @@ -1027,11 +1173,11 @@ class _FeedParserMixin: _end_prodlink = _end_content if _XML_AVAILABLE: - class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler): - def __init__(self, baseuri, encoding): + class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): + def __init__(self, baseuri, baselang, encoding): if _debug: sys.stderr.write('trying StrictFeedParser\n') xml.sax.handler.ContentHandler.__init__(self) - _FeedParserMixin.__init__(self, baseuri, encoding) + _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None @@ -1066,8 +1212,8 @@ if _XML_AVAILABLE: attrsD[str(qname).lower()] = attrs.getValueByQName(qname) self.unknown_starttag(localname, attrsD.items()) - def resolveEntity(self, publicId, systemId): - return _StringIO() +# def resolveEntity(self, publicId, systemId): +# return _StringIO() def characters(self, text): self.handle_data(text) @@ -1095,6 +1241,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): def __init__(self, encoding): self.encoding = encoding + if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) def reset(self): @@ -1106,13 +1253,16 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): data = re.sub(r'<(\S+)/>', r'<\1>', data) data = data.replace(''', "'") data = data.replace('"', '"') - if type(data) == types.UnicodeType: + if self.encoding and (type(data) == types.UnicodeType): data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) def normalize_attrs(self, attrs): # utility method to be called by descendants - attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs] + attrs = [(k.lower(), v) for k, v in attrs] +# if self.encoding: +# if _debug: sys.stderr.write('normalize_attrs, encoding=%s\n' % self.encoding) +# attrs = [(k, v.encode(self.encoding)) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] return attrs @@ -1170,7 +1320,6 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata - if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata)) n = len(rawdata) if i == n: return None, -1 @@ -1188,17 +1337,32 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): def output(self): """Return processed HTML as a single string""" - if _debug: - for p in self.pieces: - sys.stderr.write(p) - sys.stderr.write('\n') return "".join([str(p) for p in self.pieces]) class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): - def __init__(self, baseuri, encoding): + def __init__(self, baseuri, baselang, encoding): sgmllib.SGMLParser.__init__(self) - _FeedParserMixin.__init__(self, baseuri, encoding) + _FeedParserMixin.__init__(self, baseuri, baselang, encoding) + def decodeEntities(self, element, data): + data = data.replace('<', '<') + data = data.replace('<', '<') + data = data.replace('>', '>') + data = data.replace('>', '>') + data = 
data.replace('&', '&') + data = data.replace('&', '&') + data = data.replace('"', '"') + data = data.replace('"', '"') + data = data.replace(''', ''') + data = data.replace(''', ''') + if self.contentparams.get('mode') == 'escaped': + data = data.replace('<', '<') + data = data.replace('>', '>') + data = data.replace('&', '&') + data = data.replace('"', '"') + data = data.replace(''', "'") + return data + class _RelativeURIResolver(_BaseHTMLProcessor): relative_uris = [('a', 'href'), ('applet', 'codebase'), @@ -1241,7 +1405,6 @@ class _RelativeURIResolver(_BaseHTMLProcessor): def _resolveRelativeURIs(htmlSource, baseURI, encoding): if _debug: sys.stderr.write("entering _resolveRelativeURIs\n") p = _RelativeURIResolver(baseURI, encoding) - if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n') p.feed(htmlSource) return p.output() @@ -1317,25 +1480,33 @@ class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandl def http_error_default(self, req, fp, code, msg, headers): if ((code / 100) == 3) and (code != 304): return self.http_error_302(req, fp, code, msg, headers) - from urllib import addinfourl - infourl = addinfourl(fp, headers, req.get_full_url()) + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) infourl.status = code return infourl def http_error_302(self, req, fp, code, msg, headers): - infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) - infourl.status = code + if headers.dict.has_key('location'): + infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) + else: + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) + if not hasattr(infourl, 'status'): + infourl.status = code return infourl def http_error_301(self, req, fp, code, msg, headers): - infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) - infourl.status = code + if headers.dict.has_key('location'): + infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) + else: + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) + if not hasattr(infourl, 'status'): + infourl.status = code return infourl http_error_300 = http_error_302 + http_error_303 = http_error_302 http_error_307 = http_error_302 -def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None): +def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -1357,6 +1528,9 @@ def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=No If the referrer argument is supplied, it will be used as the value of a Referer[sic] request header. + + If handlers is supplied, it is a list of handlers used to build a + urllib2 opener. 
""" if hasattr(url_file_stream_or_string, "read"): @@ -1368,7 +1542,16 @@ def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=No if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): if not agent: agent = USER_AGENT - + # test for inline user:password for basic auth + auth = None + if base64: + urltype, rest = urllib.splittype(url_file_stream_or_string) + realhost, rest = urllib.splithost(rest) + if realhost: + user_passwd, realhost = urllib.splituser(realhost) + if user_passwd: + url_file_stream_or_string = "%s://%s%s" % (urltype, realhost, rest) + auth = base64.encodestring(user_passwd).strip() # try to open with urllib2 (to use optional headers) request = urllib2.Request(url_file_stream_or_string) request.add_header("User-Agent", agent) @@ -1384,22 +1567,22 @@ def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=No request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) if referrer: request.add_header("Referer", referrer) - if gzip: + if gzip and zlib: + request.add_header("Accept-encoding", "gzip, deflate") + elif gzip: request.add_header("Accept-encoding", "gzip") - opener = urllib2.build_opener(_FeedURLHandler()) + elif zlib: + request.add_header("Accept-encoding", "deflate") + else: + request.add_header("Accept-encoding", "") + if auth: + request.add_header("Authorization", "Basic %s" % auth) + if ACCEPT_HEADER: + request.add_header("Accept", ACCEPT_HEADER) + opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: - try: - return opener.open(request) -# except ValueError: -# # not a valid URL, but might be a valid filename -# pass -# except AssertionError: -# # under Python 2.1, non-URLs will fail with an AssertionError; -# # still might be a valid filename, so fall through -# pass - except: - return _StringIO('') + return opener.open(request) finally: opener.close() # JohnD @@ -1412,11 +1595,281 @@ def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=No # treat url_file_stream_or_string as string return _StringIO(str(url_file_stream_or_string)) +_date_handlers = [] +def registerDateHandler(func): + """Register a date handler function (takes string, returns 9-tuple date in GMT)""" + _date_handlers.insert(0, func) + +# ISO-8601 date parsing routines written by Fazal Majid. +# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 +# parser is beyond the scope of feedparser and would be a worthwhile addition +# to the Python library. +# A single regular expression cannot parse ISO 8601 date formats into groups +# as the standard is highly irregular (for instance is 030104 2003-01-04 or +# 0301-04-01), so we use templates instead. +# Please note the order in templates is significant because we need a +# greedy match. +_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', + 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', + '-YY-?MM', '-OOO', '-YY', + '--MM-?DD', '--MM', + '---DD', + 'CC', ''] +_iso8601_re = [ + tmpl.replace( + 'YYYY', r'(?P\d{4})').replace( + 'YY', r'(?P\d\d)').replace( + 'MM', r'(?P[01]\d)').replace( + 'DD', r'(?P[0123]\d)').replace( + 'OOO', r'(?P[0123]\d\d)').replace( + 'CC', r'(?P\d\d$)') + + r'(T?(?P\d{2}):(?P\d{2})' + + r'(:(?P\d{2}))?' + + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' 
+ for tmpl in _iso8601_tmpl] +del tmpl +_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] +del regex +def _parse_date_iso8601(dateString): + """Parse a variety of ISO-8601-compatible formats like 20040105""" + m = None + for _iso8601_match in _iso8601_matches: + m = _iso8601_match(dateString) + if m: break + if not m: return + if m.span() == (0, 0): return + params = m.groupdict() + ordinal = params.get("ordinal", 0) + if ordinal: + ordinal = int(ordinal) + else: + ordinal = 0 + year = params.get("year", "--") + if not year or year == "--": + year = time.gmtime()[0] + elif len(year) == 2: + # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 + year = 100 * int(time.gmtime()[0] / 100) + int(year) + else: + year = int(year) + month = params.get("month", "-") + if not month or month == "-": + # ordinals are NOT normalized by mktime, we simulate them + # by setting month=1, day=ordinal + if ordinal: + month = 1 + else: + month = time.gmtime()[1] + month = int(month) + day = params.get("day", 0) + if not day: + # see above + if ordinal: + day = ordinal + elif params.get("century", 0) or \ + params.get("year", 0) or params.get("month", 0): + day = 1 + else: + day = time.gmtime()[2] + else: + day = int(day) + # special case of the century - is the first year of the 21st century + # 2000 or 2001 ? The debate goes on... + if "century" in params.keys(): + year = (int(params["century"]) - 1) * 100 + 1 + # in ISO 8601 most fields are optional + for field in ["hour", "minute", "second", "tzhour", "tzmin"]: + if not params.get(field, None): + params[field] = 0 + hour = int(params.get("hour", 0)) + minute = int(params.get("minute", 0)) + second = int(params.get("second", 0)) + # weekday is normalized by mktime(), we can ignore it + weekday = 0 + # daylight savings is complex, but not needed for feedparser's purposes + # as time zones, if specified, include mention of whether it is active + # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and + # and most implementations have DST bugs + daylight_savings_flag = 0 + tm = [year, month, day, hour, minute, second, weekday, + ordinal, daylight_savings_flag] + # ISO 8601 time zone adjustments + tz = params.get("tz") + if tz and tz != "Z": + if tz[0] == "-": + tm[3] += int(params.get("tzhour", 0)) + tm[4] += int(params.get("tzmin", 0)) + elif tz[0] == "+": + tm[3] -= int(params.get("tzhour", 0)) + tm[4] -= int(params.get("tzmin", 0)) + else: + return None + # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) + # which is guaranteed to normalize d/m/y/h/m/s. + # Many implementations have bugs, but we'll pretend they don't. + return time.localtime(time.mktime(tm)) +registerDateHandler(_parse_date_iso8601) + +# 8-bit date handling routines written by ytrewq1. 
+_korean_year = u'\ub144' # b3e2 in euc-kr +_korean_month = u'\uc6d4' # bff9 in euc-kr +_korean_day = u'\uc77c' # c0cf in euc-kr +_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr +_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr + +_korean_onblog_date_re = \ + re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ + (_korean_year, _korean_month, _korean_day)) +_korean_nate_date_re = \ + re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ + (_korean_am, _korean_pm)) +def _parse_date_onblog(dateString): + """Parse a string according to the OnBlog 8-bit date format""" + m = _korean_onblog_date_re.match(dateString) + if not m: return + w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \ + {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ + 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ + 'zonediff': '+09:00'} + if _debug: sys.stderr.write("OnBlog date parsed as: %s\n" % w3dtfdate) + return _parse_date_w3dtf(w3dtfdate) +registerDateHandler(_parse_date_onblog) + +def _parse_date_nate(dateString): + """Parse a string according to the Nate 8-bit date format""" + m = _korean_nate_date_re.match(dateString) + if not m: return + hour = int(m.group(5)) + ampm = m.group(4) + if (ampm == _korean_pm): + hour += 12 + hour = str(hour) + if len(hour) == 1: + hour = '0' + hour + w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \ + {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ + 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ + 'zonediff': '+09:00'} + if _debug: sys.stderr.write("Nate date parsed as: %s\n" % w3dtfdate) + return _parse_date_w3dtf(w3dtfdate) +registerDateHandler(_parse_date_nate) + +_mssql_date_re = \ + re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})\.\d+') +def _parse_date_mssql(dateString): + """Parse a string according to the MS SQL date format""" + m = _mssql_date_re.match(dateString) + if not m: return + w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \ + {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ + 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ + 'zonediff': '+09:00'} + if _debug: sys.stderr.write("MS SQL date parsed as: %s\n" % w3dtfdate) + return _parse_date_w3dtf(w3dtfdate) +registerDateHandler(_parse_date_mssql) + +# Unicode strings for Greek date strings +_greek_months = \ + { \ + u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 + u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 + u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 + u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 + u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 + u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 + u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 + u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 + u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 + u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 + u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 + u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 + u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 + u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 + u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 + u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 + u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 + u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 + u'\u0394\u03b5\u03ba': u'Dec', 
# c4e5ea in iso-8859-7 + } + +_greek_wdays = \ + { \ + u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 + u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 + u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 + u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 + u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 + u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 + u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 + } + +_greek_date_format_re = \ + re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') + +def _parse_date_greek(dateString): + """Parse a string according to a Greek 8-bit date format.""" + m = _greek_date_format_re.match(dateString) + if not m: return + try: + wday = _greek_wdays[m.group(1)] + month = _greek_months[m.group(3)] + except: + return + rfc822date = "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s" % \ + {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ + 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ + 'zonediff': m.group(8)} + if _debug: sys.stderr.write("Greek date parsed as: %s\n" % rfc822date) + return _parse_date_rfc822(rfc822date) +registerDateHandler(_parse_date_greek) + +# Unicode strings for Hungarian date strings +_hungarian_months = \ + { \ + u'janu\u00e1r': u'01', # e1 in iso-8859-2 + u'febru\u00e1ri': u'02', # e1 in iso-8859-2 + u'm\u00e1rcius': u'03', # e1 in iso-8859-2 + u'\u00e1prilis': u'04', # e1 in iso-8859-2 + u'm\u00e1ujus': u'05', # e1 in iso-8859-2 + u'j\u00fanius': u'06', # fa in iso-8859-2 + u'j\u00falius': u'07', # fa in iso-8859-2 + u'augusztus': u'08', + u'szeptember': u'09', + u'okt\u00f3ber': u'10', # f3 in iso-8859-2 + u'november': u'11', + u'december': u'12', + } + +_hungarian_date_format_re = \ + re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') + +def _parse_date_hungarian(dateString): + """Parse a string according to a Hungarian 8-bit date format.""" + m = _hungarian_date_format_re.match(dateString) + if not m: return + try: + month = _hungarian_months[m.group(2)] + day = m.group(3) + if len(day) == 1: + day = '0' + day + hour = m.group(4) + if len(hour) == 1: + hour = '0' + hour + except: + return + w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s" % \ + {'year': m.group(1), 'month': month, 'day': day,\ + 'hour': hour, 'minute': m.group(5),\ + 'zonediff': m.group(6)} + if _debug: sys.stderr.write("Hungarian date parsed as: %s\n" % w3dtfdate) + return _parse_date_w3dtf(w3dtfdate) +registerDateHandler(_parse_date_hungarian) + # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by # Drake and licensed under the Python license. Removed all range checking # for month, day, hour, minute, and second, since mktime will normalize # these later -def _w3dtf_parse(s): +def _parse_date_w3dtf(dateString): def __extract_date(m): year = int(m.group("year")) if year < 100: @@ -1504,146 +1957,39 @@ def _w3dtf_parse(s): + __tzd_re) __datetime_re = "%s(?:T%s)?" 
% (__date_re, __time_re) __datetime_rx = re.compile(__datetime_re) - m = __datetime_rx.match(s) - if m is None or m.group() != s: - return None + m = __datetime_rx.match(dateString) + if (m is None) or (m.group() != dateString): return gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) if gmt[0] == 0: return - return time.mktime(gmt) + __extract_tzd(m) - time.timezone - -# Additional ISO-8601 date parsing routines written by Fazal Majid -# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 -# parser is beyond the scope of feedparser and would be a worthwhile addition -# to the Python library -# A single regular expression cannot parse ISO 8601 date formats into groups -# as the standard is highly irregular (for instance is 030104 2003-01-04 or -# 0301-04-01), so we use templates instead -# Please note the order in templates is significant because we need a -# greedy match -_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', - 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', - '-YY-?MM', '-OOO', '-YY', - '--MM-?DD', '--MM', - '---DD', - 'CC', ''] -_iso8601_re = [ - tmpl.replace( - 'YYYY', r'(?P\d{4})').replace( - 'YY', r'(?P\d\d)').replace( - 'MM', r'(?P[01]\d)').replace( - 'DD', r'(?P[0123]\d)').replace( - 'OOO', r'(?P[0123]\d\d)').replace( - 'CC', r'(?P\d\d$)') - + r'(T?(?P\d{2}):(?P\d{2})' - + r'(:(?P\d{2}))?' - + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' - for tmpl in _iso8601_tmpl] -del tmpl - -_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -del regex - + return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) +registerDateHandler(_parse_date_w3dtf) + +def _parse_date_rfc822(dateString): + """Parse an RFC822, RFC1123, RFC2822, or asctime-style date""" + tm = rfc822.parsedate_tz(dateString) + if tm: + return time.gmtime(rfc822.mktime_tz(tm)) # rfc822.py defines several time zones, but we define some extra ones. # "ET" is equivalent to "EST", etc. _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} rfc822._timezones.update(_additional_timezones) +registerDateHandler(_parse_date_rfc822) -def _parse_date(date): - """Parses a variety of date formats into a tuple of 9 integers""" - date = str(date) - try: - # try the standard rfc822 library, which handles - # RFC822, RFC1123, RFC2822, and asctime - tm = rfc822.parsedate_tz(date) - if tm: - return time.gmtime(rfc822.mktime_tz(tm)) - # not a RFC2822 date, try W3DTF profile of ISO-8601 +def _parse_date(dateString): + """Parses a variety of date formats into a 9-tuple in GMT""" + for handler in _date_handlers: try: - tm = _w3dtf_parse(date) - except ValueError: - tm = None - if tm: - return time.gmtime(tm) - # try various non-W3DTF ISO-8601-compatible formats like 20040105 - m = None - for _iso8601_match in _iso8601_matches: - m = _iso8601_match(date) - if m: break - if not m: return - # catch truly malformed strings - if m.span() == (0, 0): return - params = m.groupdict() - ordinal = params.get("ordinal", 0) - if ordinal: - ordinal = int(ordinal) - else: - ordinal = 0 - year = params.get("year", "--") - if not year or year == "--": - year = time.gmtime()[0] - elif len(year) == 2: - # ISO 8601 assumes current century, i.e. 
93 -> 2093, NOT 1993 - year = 100 * int(time.gmtime()[0] / 100) + int(year) - else: - year = int(year) - month = params.get("month", "-") - if not month or month == "-": - # ordinals are NOT normalized by mktime, we simulate them - # by setting month=1, day=ordinal - if ordinal: - month = 1 - else: - month = time.gmtime()[1] - month = int(month) - day = params.get("day", 0) - if not day: - # see above - if ordinal: - day = ordinal - elif params.get("century", 0) or \ - params.get("year", 0) or params.get("month", 0): - day = 1 - else: - day = time.gmtime()[2] - else: - day = int(day) - # special case of the century - is the first year of the 21st century - # 2000 or 2001 ? The debate goes on... - if "century" in params.keys(): - year = (int(params["century"]) - 1) * 100 + 1 - # in ISO 8601 most fields are optional - for field in ["hour", "minute", "second", "tzhour", "tzmin"]: - if not params.get(field, None): - params[field] = 0 - hour = int(params.get("hour", 0)) - minute = int(params.get("minute", 0)) - second = int(params.get("second", 0)) - # weekday is normalized by mktime(), we can ignore it - weekday = 0 - # daylight savings is complex, but not needed for feedparser's purposes - # as time zones, if specified, include mention of whether it is active - # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and - # and most implementations have DST bugs - daylight_savings_flag = 0 - tm = [year, month, day, hour, minute, second, weekday, - ordinal, daylight_savings_flag] - # ISO 8601 time zone adjustments - tz = params.get("tz") - if tz and tz != "Z": - if tz[0] == "-": - tm[3] += int(params.get("tzhour", 0)) - tm[4] += int(params.get("tzmin", 0)) - elif tz[0] == "+": - tm[3] -= int(params.get("tzhour", 0)) - tm[4] -= int(params.get("tzmin", 0)) - else: - return None - # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) - # which is guaranteed to normalize d/m/y/h/m/s - # many implementations have bugs, but we'll pretend they don't - return time.localtime(time.mktime(tm)) - except: - return None + date9tuple = handler(dateString) + if not date9tuple: continue + if len(date9tuple) != 9: + if _debug: sys.stderr.write("date handler function must return 9-tuple\n") + raise ValueError + map(int, date9tuple) + return date9tuple + except Exception, e: + if _debug: sys.stderr.write("%s raised %s\n" % (handler.__name__, repr(e))) + pass + return None def _getCharacterEncoding(http_headers, xml_data): """Get the character encoding of the XML document @@ -1651,9 +1997,9 @@ def _getCharacterEncoding(http_headers, xml_data): http_headers is a dictionary xml_data is a raw string (not Unicode) - This is so much trickier than it sounds, - it's not even funny. According to RFC 3023 ("XML Media Types"), if - the HTTP Content-Type is application/xml, application/*+xml, + This is so much trickier than it sounds, it's not even funny. + According to RFC 3023 ("XML Media Types"), if the HTTP Content-Type + is application/xml, application/*+xml, application/xml-external-parsed-entity, or application/xml-dtd, the encoding given in the charset parameter of the HTTP Content-Type takes precedence over the encoding given in the XML prefix within the @@ -1662,11 +2008,35 @@ def _getCharacterEncoding(http_headers, xml_data): text/xml-external-parsed-entity, the encoding given in the XML prefix within the document is ALWAYS IGNORED and only the encoding given in the charset parameter of the HTTP Content-Type header should be - respected, and it defaults to "us-ascii" if not specified. 
If - Content-Type is unspecified (input was local file or non-HTTP source) + respected, and it defaults to "us-ascii" if not specified. + + Furthermore, discussion on the atom-syntax mailing list with the + author of RFC 3023 leads me to the conclusion that any document + served with a Content-Type of text/* and no charset parameter + must be treated as us-ascii. (We now do this.) And also that it + must always be flagged as non-well-formed. (We now do this too.) + + If Content-Type is unspecified (input was local file or non-HTTP source) or unrecognized (server just got it totally wrong), then go by the encoding given in the XML prefix of the document and default to - "utf-8" as per the XML specification. + "iso-8859-1" as per the HTTP specification (RFC 2616). + + Then, assuming we didn't find a character encoding in the HTTP headers + (and the HTTP Content-type allowed us to look in the body), we need + to sniff the first few bytes of the XML data and try to determine + whether the encoding is ASCII-compatible. Section F of the XML + specification shows the way here: + http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info + + If the sniffed encoding is not ASCII-compatible, we need to make it + ASCII compatible so that we can sniff further into the XML declaration + to find the encoding attribute, which will tell us the true encoding. + + Of course, none of this guarantees that we will be able to parse the + feed in the declared character encoding (assuming it was declared + correctly, which many are not). CJKCodecs and iconv_codec help a lot; + you should definitely install them if you can. + http://cjkpython.i18n.org/ """ def _parseHTTPContentType(content_type): @@ -1676,72 +2046,139 @@ def _getCharacterEncoding(http_headers, xml_data): If no content type is specified, returns ('', '') Both return parameters are guaranteed to be lowercase strings """ - if not content_type: - return '', '' - content_type = content_type.strip() - paramstr = content_type.split(';')[1:] - if not paramstr: - return content_type, '' - content_type = content_type.split(';', 1)[0].strip().lower() - if not paramstr[0]: - # declaration like "text/xml;" (note ending semicolon) - # dunno if this is malformed but it sure was hard to track down - return content_type, '' - import string - params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr]) - charset = params.get('charset') - if not charset: - return content_type, '' - if charset[0] in ('"', "'"): - charset = charset[1:] - if charset and charset[-1] in ('"', "'"): - charset = charset[:-1] - charset = charset.strip() - return content_type, charset - - true_encoding = None + content_type = content_type or '' + content_type, params = cgi.parse_header(content_type) + return content_type, params.get('charset', '').replace("'", "") + + sniffed_xml_encoding = '' + xml_encoding = '' + true_encoding = '' http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type")) - xml_encoding_match = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) - xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or '' - if (http_content_type == 'application/xml') or \ - (http_content_type == 'application/xml-dtd') or \ - (http_content_type == 'application/xml-external-parsed-entity') or \ - (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): - if http_encoding: - true_encoding = http_encoding - elif xml_encoding: - true_encoding = xml_encoding - else: 
- true_encoding = 'utf-8' - elif (http_content_type == 'text/xml') or \ - (http_content_type == 'text/xml-external-parsed-entity') or \ - (http_content_type.startswith('text/') and http_content_type.endswith('+xml')): - if http_encoding: - true_encoding = http_encoding + # Must sniff for non-ASCII-compatible character encodings before + # searching for XML declaration. This heuristic is defined in + # section F of the XML specification: + # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info + try: + if xml_data[:4] == '\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = _ebcdic_to_ascii(xml_data) + elif xml_data[:4] == '\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == '\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: - true_encoding = 'us-ascii' + # ASCII-compatible + pass + xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) + except: + xml_encoding_match = None + if xml_encoding_match: + xml_encoding = xml_encoding_match.groups()[0].lower() + if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + acceptable_content_type = 0 + application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') + text_content_types = ('text/xml', 'text/xml-external-parsed-entity') + if (http_content_type in application_content_types) or \ + (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): + acceptable_content_type = 1 + true_encoding = http_encoding or xml_encoding or 'utf-8' + elif (http_content_type in text_content_types) or \ + (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): + acceptable_content_type = 1 + true_encoding = http_encoding or 'us-ascii' + elif http_content_type.startswith('text/'): + true_encoding = http_encoding or 'us-ascii' + elif http_headers and (not http_headers.has_key('content-type')): + true_encoding = xml_encoding or 'iso-8859-1' else: true_encoding = xml_encoding or 'utf-8' - return true_encoding, 
http_encoding, xml_encoding + return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type -def _changeEncodingDeclaration(data, encoding): +def _toUTF8(data, encoding): """Changes an XML data stream on the fly to specify a new encoding data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already encoding is a string recognized by encodings.aliases """ - if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n') - if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding) - #import cjkcodecs.aliases - #import japanese - data = unicode(data, encoding) - declmatch = re.compile(u'^<\?xml[^>]*?>') - newdecl = unicode("""""" % encoding, encoding) - if declmatch.search(data): - data = declmatch.sub(newdecl, data) + if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-16be': + sys.stderr.write('trying utf-16be instead\n') + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-16le': + sys.stderr.write('trying utf-16le instead\n') + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-8': + sys.stderr.write('trying utf-8 instead\n') + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-32be': + sys.stderr.write('trying utf-32be instead\n') + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-32le': + sys.stderr.write('trying utf-32le instead\n') + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding) + if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) + declmatch = re.compile('^<\?xml[^>]*?>') + newdecl = """""" + if declmatch.search(newdata): + newdata = declmatch.sub(newdecl, newdata) else: - data = newdecl + u'\n' + data - return data.encode(encoding) + newdata = newdecl + u'\n' + newdata + return newdata.encode("utf-8") def _stripDoctype(data): """Strips DOCTYPE from XML document, returns (rss_version, stripped_data) @@ -1749,6 +2186,8 @@ def _stripDoctype(data): rss_version may be "rss091n" or None stripped_data is the same XML document, minus the DOCTYPE """ + entity_pattern = re.compile(r']*?)>', re.MULTILINE) + data = entity_pattern.sub('', data) doctype_pattern = re.compile(r']*?)>', re.MULTILINE) doctype_results = doctype_pattern.findall(data) doctype = doctype_results and doctype_results[0] or '' @@ -1759,18 +2198,46 @@ def _stripDoctype(data): data = doctype_pattern.sub('', data) return version, data -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None): +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): """Parse a feed from a URL, file, stream, or string""" result = FeedParserDict() - f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer) - data = f.read() - if hasattr(f, "headers"): + result['feed'] = FeedParserDict() + result['entries'] = [] + if 
_XML_AVAILABLE: + result['bozo'] = 0 + if type(handlers) == types.InstanceType: + handlers = [handlers] + try: + f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) + data = f.read() + except Exception, e: + result['bozo'] = 1 + result['bozo_exception'] = e + data = '' + f = None + + # if feed is gzip-compressed, decompress it + if f and data and hasattr(f, "headers"): if gzip and f.headers.get('content-encoding', '') == 'gzip': try: data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except: - # some feeds claim to be gzipped but they're not, so we get garbage + except Exception, e: + # Some feeds claim to be gzipped but they're not, so + # we get garbage. Ideally, we should re-request the + # feed without the "Accept-encoding: gzip" header, + # but we don't. + result['bozo'] = 1 + result['bozo_exception'] = e data = '' + elif zlib and f.headers.get('content-encoding', '') == 'deflate': + try: + data = zlib.decompress(data, -zlib.MAX_WBITS) + except Exception, e: + result['bozo'] = 1 + result['bozo_exception'] = e + data = '' + + # save HTTP headers if hasattr(f, "info"): info = f.info() result["etag"] = info.getheader("ETag") @@ -1779,74 +2246,84 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result["modified"] = _parse_date(last_modified) if hasattr(f, "url"): result["url"] = f.url - result["status"] = 200 # default, may be overridden later + result["status"] = 200 if hasattr(f, "status"): result["status"] = f.status if hasattr(f, "headers"): result["headers"] = f.headers.dict - f.close() + if hasattr(f, "close"): + f.close() + + # there are four encodings to keep track of: + # - http_encoding is the encoding declared in the Content-Type HTTP header + # - xml_encoding is the encoding declared in the
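# --------------------------------------------------------------------------
# A minimal usage sketch of the 3.3-era API exercised by this patch:
# parse() with its new optional arguments, registerDateHandler(), and the
# bozo/etag/modified result keys.  This assumes extra/feedparser.py is
# importable as "feedparser"; the feed URL, user-agent string, and the
# Unix-timestamp date handler below are hypothetical illustrations, not
# part of the patch itself.

import time
import feedparser

def _parse_date_unix(dateString):
    # Hypothetical extra handler: accept a bare Unix timestamp and return a
    # 9-tuple in GMT, or None so the remaining registered handlers run.
    try:
        return time.gmtime(int(dateString))
    except ValueError:
        return None

feedparser.registerDateHandler(_parse_date_unix)

result = feedparser.parse('http://example.org/atom.xml',
                          agent='MyAggregator/1.0 +http://example.org/')
if result.get('bozo'):
    print 'feed was not well-formed:', result.get('bozo_exception')
print result['feed'].get('title')
for entry in result['entries']:
    print entry.get('title'), entry.get('modified_parsed')

# A later run can reuse the saved etag/modified values for a conditional GET.
result2 = feedparser.parse('http://example.org/atom.xml',
                           etag=result.get('etag'),
                           modified=result.get('modified'))
print 'HTTP status:', result2.get('status')   # 304 if the feed is unchanged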