2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
14 #__version__ = "pre-3.3-" + "$Revision: 1.3 $"[11:15] + "-cvs"
16 __license__ = "Python"
17 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
18 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
19 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
20 "John Beimler <http://john.beimler.org/>",
21 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
22 "Aaron Swartz <http://aaronsw.com>"]
25 # HTTP "User-Agent" header to send to servers when downloading feeds.
26 # If you are embedding feedparser in a larger application, you should
27 # change this to your application name and URL.
28 USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
30 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
31 # want to send an Accept header, set this to None.
32 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
34 # List of preferred XML parsers, by SAX driver name. These will be tried first,
35 # but if they're not installed, Python will keep searching through its own list
36 # of pre-installed parsers until it finds one that supports everything we need.
37 PREFERRED_XML_PARSERS = ["drv_libxml2"]
39 # If you want feedparser to automatically run HTML markup through HTML Tidy, set
40 # this to 1. This is off by default because of reports of crashing on some
41 # platforms. If it crashes for you, please submit a bug report with your OS
42 # platform, Python version, and the URL of the feed you were attempting to parse.
43 # Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
46 # ---------- required modules (should come with any Python distribution) ----------
47 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi
49 from cStringIO import StringIO as _StringIO
51 from StringIO import StringIO as _StringIO
53 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
55 # gzip is included with most Python distributions, but may not be available if you compiled your own
65 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
66 # Python 2.3 now has this functionality available in the standard socket library, so under
67 # 2.3 you don't need to install anything. But you probably should anyway, because the socket
68 # module is buggy and timeoutsocket is better.
70 import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
71 timeoutsocket.setDefaultSocketTimeout(20)
74 if hasattr(socket, 'setdefaulttimeout'):
75 socket.setdefaulttimeout(20)
76 import urllib, urllib2
81 from mx.Tidy import Tidy as _mxtidy
85 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
86 # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
87 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
88 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
91 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
92 from xml.sax.saxutils import escape as _xmlescape
97 data = data.replace("&", "&")
98 data = data.replace(">", ">")
99 data = data.replace("<", "<")
102 # base64 support for Atom feeds that contain embedded binary data
104 import base64, binascii
106 base64 = binascii = None
108 # cjkcodecs and iconv_codec provide support for more character encodings.
109 # Both are available from http://cjkpython.i18n.org/
111 import cjkcodecs.aliases
119 # ---------- don't touch these ----------
class CharacterEncodingOverride(Exception):
    """Exception used by the parser's character-encoding handling.

    NOTE(review): raising/catching sites are not visible in this chunk;
    the name suggests the declared encoding was overridden -- confirm.
    """
    pass
class CharacterEncodingUnknown(Exception):
    """Exception used by the parser's character-encoding handling.

    NOTE(review): raising/catching sites are not visible in this chunk;
    the name suggests the encoding could not be determined -- confirm.
    """
    pass
class NonXMLContentType(Exception):
    """Exception used when a served Content-Type is not an XML type.

    NOTE(review): raising/catching sites are not visible in this chunk.
    """
    pass
# Loosen sgmllib's lexer regexes so it copes with real-world feeds:
# - tagfind: allow '-', '_', '.' and ':' inside element names (namespaced tags)
# - special: treat anything starting with '<!' as a declaration
#   (CDATA blocks are then handled by parse_declaration below)
# - charref: also accept hexadecimal character references (&#x..;)
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
128 SUPPORTED_VERSIONS = {'': 'unknown',
129 'rss090': 'RSS 0.90',
130 'rss091n': 'RSS 0.91 (Netscape)',
131 'rss091u': 'RSS 0.91 (Userland)',
132 'rss092': 'RSS 0.92',
133 'rss093': 'RSS 0.93',
134 'rss094': 'RSS 0.94',
137 'rss': 'RSS (unknown version)',
138 'atom01': 'Atom 0.1',
139 'atom02': 'Atom 0.2',
140 'atom03': 'Atom 0.3',
141 'atom': 'Atom (unknown version)',
149 # Python 2.1 does not have dict
150 from UserDict import UserDict
157 class FeedParserDict(UserDict):
158 def __getitem__(self, key):
159 keymap = {'channel': 'feed',
163 'date_parsed': 'modified_parsed',
164 'description': ['tagline', 'summary']}
165 realkey = keymap.get(key, key)
166 if type(realkey) == types.ListType:
168 if UserDict.has_key(self, k):
169 return UserDict.__getitem__(self, k)
170 return UserDict.__getitem__(self, key)
171 return UserDict.__getitem__(self, realkey)
173 def has_key(self, key):
174 return hasattr(self, key) or UserDict.has_key(self, key)
176 def __getattr__(self, key):
178 return self.__dict__[key]
182 return self.__getitem__(key)
184 raise AttributeError, "object has no attribute '%s'" % key
186 def __contains__(self, key):
187 return self.has_key(key)
189 def zopeCompatibilityHack():
190 global FeedParserDict
192 def FeedParserDict(aDict=None):
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate an EBCDIC byte string to its ASCII equivalent.

    The 256-entry translation table is built lazily on first use and
    cached in the module-global _ebcdic_to_ascii_map.
    NOTE(review): relies on the `string` module being imported at file
    level (the import is not visible in this chunk) -- confirm.
    """
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        _ebcdic_to_ascii_map = string.maketrans(
            "".join(map(chr, range(256))), "".join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)
225 class _FeedParserMixin:
226 namespaces = {"": "",
227 "http://backend.userland.com/rss": "",
228 "http://blogs.law.harvard.edu/tech/rss": "",
229 "http://purl.org/rss/1.0/": "",
230 "http://my.netscape.com/rdf/simple/0.9/": "",
231 "http://example.com/newformat#": "",
232 "http://example.com/necho": "",
233 "http://purl.org/echo/": "",
234 "uri/of/echo/namespace#": "",
235 "http://purl.org/pie/": "",
236 "http://purl.org/atom/ns#": "",
237 "http://purl.org/rss/1.0/modules/rss091#": "",
239 "http://webns.net/mvcb/": "admin",
240 "http://purl.org/rss/1.0/modules/aggregation/": "ag",
241 "http://purl.org/rss/1.0/modules/annotate/": "annotate",
242 "http://media.tangent.org/rss/1.0/": "audio",
243 "http://backend.userland.com/blogChannelModule": "blogChannel",
244 "http://web.resource.org/cc/": "cc",
245 "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
246 "http://purl.org/rss/1.0/modules/company": "co",
247 "http://purl.org/rss/1.0/modules/content/": "content",
248 "http://my.theinfo.org/changed/1.0/rss/": "cp",
249 "http://purl.org/dc/elements/1.1/": "dc",
250 "http://purl.org/dc/terms/": "dcterms",
251 "http://purl.org/rss/1.0/modules/email/": "email",
252 "http://purl.org/rss/1.0/modules/event/": "ev",
253 "http://postneo.com/icbm/": "icbm",
254 "http://purl.org/rss/1.0/modules/image/": "image",
255 "http://xmlns.com/foaf/0.1/": "foaf",
256 "http://freshmeat.net/rss/fm/": "fm",
257 "http://purl.org/rss/1.0/modules/link/": "l",
258 "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
259 "http://prismstandard.org/namespaces/1.2/basic/": "prism",
260 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
261 "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
262 "http://purl.org/rss/1.0/modules/reference/": "ref",
263 "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
264 "http://purl.org/rss/1.0/modules/search/": "search",
265 "http://purl.org/rss/1.0/modules/slash/": "slash",
266 "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
267 "http://hacks.benhammersley.com/rss/streaming/": "str",
268 "http://purl.org/rss/1.0/modules/subscription/": "sub",
269 "http://purl.org/rss/1.0/modules/syndication/": "sy",
270 "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
271 "http://purl.org/rss/1.0/modules/threading/": "thr",
272 "http://purl.org/rss/1.0/modules/textinput/": "ti",
273 "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
274 "http://wellformedweb.org/CommentAPI/": "wfw",
275 "http://purl.org/rss/1.0/modules/wiki/": "wiki",
276 "http://schemas.xmlsoap.org/soap/envelope/": "soap",
277 "http://www.w3.org/1999/xhtml": "xhtml",
278 "http://www.w3.org/XML/1998/namespace": "xml"
281 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments', 'license']
282 can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
283 can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
284 html_types = ['text/html', 'application/xhtml+xml']
286 def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
287 if _debug: sys.stderr.write("initializing FeedParser\n")
288 self.feeddata = FeedParserDict() # feed-level data
289 self.encoding = encoding # character encoding
290 self.entries = [] # list of entry-level data
291 self.version = '' # feed type/version, see SUPPORTED_VERSIONS
293 # the following are used internally to track state;
294 # some of this is kind of out of control and should
295 # probably be refactored into a finite state machine
302 self.incontributor = 0
303 self.contentparams = FeedParserDict()
304 self.namespacemap = {}
305 self.elementstack = []
308 self.baseuri = baseuri or ''
309 self.lang = baselang or None
311 self.feeddata['language'] = baselang
313 def unknown_starttag(self, tag, attrs):
314 if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
316 attrs = [(k.lower(), v) for k, v in attrs]
317 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
319 # track xml:base and xml:lang
321 baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
322 self.baseuri = baseuri
323 lang = attrsD.get('xml:lang', attrsD.get('lang'))
325 # xml:lang could be explicitly set to '', we need to capture that
328 # if no xml:lang is specified, use parent lang
331 if tag in ('feed', 'rss', 'rdf:RDF'):
332 self.feeddata['language'] = lang
334 self.basestack.append(baseuri)
335 self.langstack.append(lang)
338 for prefix, uri in attrs:
339 if prefix.startswith('xmlns:'):
340 self.trackNamespace(prefix[6:], uri)
341 elif prefix == 'xmlns':
342 self.trackNamespace(None, uri)
344 # track inline content
345 if self.incontent and self.contentparams.get('mode') == 'escaped':
346 # element declared itself as escaped markup, but it isn't really
347 self.contentparams['mode'] = 'xml'
348 if self.incontent and self.contentparams.get('mode') == 'xml':
349 # Note: probably shouldn't simply recreate localname here, but
350 # our namespace handling isn't actually 100% correct in cases where
351 # the feed redefines the default namespace (which is actually
352 # the usual case for inline content, thanks Sam), so here we
353 # cheat and just reconstruct the element based on localname
354 # because that compensates for the bugs in our namespace handling.
355 # This will horribly munge inline content with non-empty qnames,
356 # but nobody actually does that, so I'm not fixing it.
357 tag = tag.split(':')[-1]
358 return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
361 if tag.find(':') <> -1:
362 prefix, suffix = tag.split(':', 1)
364 prefix, suffix = '', tag
365 prefix = self.namespacemap.get(prefix, prefix)
367 prefix = prefix + '_'
369 # special hack for better tracking of empty textinput/image elements in illformed feeds
370 if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
372 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'width', 'height'):
375 # call special handler (if defined) or default handler
376 methodname = '_start_' + prefix + suffix
378 method = getattr(self, methodname)
379 return method(attrsD)
380 except AttributeError:
381 return self.push(prefix + suffix, 1)
383 def unknown_endtag(self, tag):
384 if _debug: sys.stderr.write('end %s\n' % tag)
386 if tag.find(':') <> -1:
387 prefix, suffix = tag.split(':', 1)
389 prefix, suffix = '', tag
390 prefix = self.namespacemap.get(prefix, prefix)
392 prefix = prefix + '_'
394 # call special handler (if defined) or default handler
395 methodname = '_end_' + prefix + suffix
397 method = getattr(self, methodname)
399 except AttributeError:
400 self.pop(prefix + suffix)
402 # track inline content
403 if self.incontent and self.contentparams.get('mode') == 'escaped':
404 # element declared itself as escaped markup, but it isn't really
405 self.contentparams['mode'] = 'xml'
406 if self.incontent and self.contentparams.get('mode') == 'xml':
407 tag = tag.split(':')[-1]
408 self.handle_data("</%s>" % tag, escape=0)
410 # track xml:base and xml:lang going out of scope
413 if self.basestack and self.basestack[-1]:
414 self.baseuri = self.basestack[-1]
417 if self.langstack: # and (self.langstack[-1] is not None):
418 self.lang = self.langstack[-1]
420 def handle_charref(self, ref):
421 # called for each character reference, e.g. for " ", ref will be "160"
422 if not self.elementstack: return
424 if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
431 text = unichr(c).encode('utf-8')
432 self.elementstack[-1][2].append(text)
434 def handle_entityref(self, ref):
435 # called for each entity reference, e.g. for "©", ref will be "copy"
436 if not self.elementstack: return
437 if _debug: sys.stderr.write("entering handle_entityref with %s\n" % ref)
438 if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
441 # entity resolution graciously donated by Aaron Swartz
443 import htmlentitydefs
444 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
445 return htmlentitydefs.name2codepoint[k]
446 k = htmlentitydefs.entitydefs[k]
447 if k.startswith("&#") and k.endswith(";"):
448 return int(k[2:-1]) # not in latin-1
451 except KeyError: text = "&%s;" % ref
452 else: text = unichr(name2cp(ref)).encode('utf-8')
453 self.elementstack[-1][2].append(text)
455 def handle_data(self, text, escape=1):
456 # called for each block of plain text, i.e. outside of any tag and
457 # not containing any character or entity references
458 if not self.elementstack: return
459 if escape and self.contentparams.get('mode') == 'xml':
460 text = _xmlescape(text)
461 self.elementstack[-1][2].append(text)
463 def handle_comment(self, text):
464 # called for each comment, e.g. <!-- insert message here -->
467 def handle_pi(self, text):
468 # called for each processing instruction, e.g. <?instruction>
471 def handle_decl(self, text):
474 def parse_declaration(self, i):
475 # override internal declaration handler to handle CDATA blocks
476 if _debug: sys.stderr.write("entering parse_declaration\n")
477 if self.rawdata[i:i+9] == '<![CDATA[':
478 k = self.rawdata.find(']]>', i)
479 if k == -1: k = len(self.rawdata)
480 self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
483 k = self.rawdata.find('>', i)
486 def trackNamespace(self, prefix, uri):
487 if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
488 self.version = 'rss090'
489 if uri == 'http://purl.org/rss/1.0/' and not self.version:
490 self.version = 'rss10'
491 if not prefix: return
492 if uri.find('backend.userland.com/rss') <> -1:
493 # match any backend.userland.com namespace
494 uri = 'http://backend.userland.com/rss'
495 if self.namespaces.has_key(uri):
496 self.namespacemap[prefix] = self.namespaces[uri]
498 def resolveURI(self, uri):
499 return urlparse.urljoin(self.baseuri or '', uri)
501 def decodeEntities(self, element, data):
504 def push(self, element, expectingText):
505 self.elementstack.append([element, expectingText, []])
507 def pop(self, element):
508 if not self.elementstack: return
509 if self.elementstack[-1][0] != element: return
511 element, expectingText, pieces = self.elementstack.pop()
512 output = "".join(pieces)
513 output = output.strip()
514 if not expectingText: return output
516 # decode base64 content
517 if self.contentparams.get('mode') == 'base64' and base64:
519 output = base64.decodestring(output)
520 except binascii.Error:
522 except binascii.Incomplete:
525 # resolve relative URIs
526 if (element in self.can_be_relative_uri) and output:
527 output = self.resolveURI(output)
529 # decode entities within embedded markup
530 output = self.decodeEntities(element, output)
532 # resolve relative URIs within embedded markup
533 if self.contentparams.get('type', 'text/html') in self.html_types:
534 if element in self.can_contain_relative_uris:
535 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
537 # sanitize embedded markup
538 if self.contentparams.get('type', 'text/html') in self.html_types:
539 if element in self.can_contain_dangerous_markup:
540 output = _sanitizeHTML(output, self.encoding)
542 if self.encoding and (type(output) == types.StringType):
544 output = unicode(output, self.encoding)
548 # store output in appropriate place(s)
550 if element == 'content':
551 self.entries[-1].setdefault(element, [])
552 contentparams = copy.deepcopy(self.contentparams)
553 contentparams['value'] = output
554 self.entries[-1][element].append(contentparams)
555 elif element == 'category':
556 self.entries[-1][element] = output
557 domain = self.entries[-1]['categories'][-1][0]
558 self.entries[-1]['categories'][-1] = (domain, output)
559 elif element == 'source':
560 self.entries[-1]['source']['value'] = output
561 elif element == 'link':
562 self.entries[-1][element] = output
564 self.entries[-1]['links'][-1]['href'] = output
566 if element == 'description':
568 self.entries[-1][element] = output
570 contentparams = copy.deepcopy(self.contentparams)
571 contentparams['value'] = output
572 self.entries[-1][element + '_detail'] = contentparams
573 elif self.infeed and (not self.intextinput) and (not self.inimage):
574 if element == 'description':
576 self.feeddata[element] = output
577 if element == 'category':
578 domain = self.feeddata['categories'][-1][0]
579 self.feeddata['categories'][-1] = (domain, output)
580 elif element == 'link':
581 self.feeddata['links'][-1]['href'] = output
583 contentparams = copy.deepcopy(self.contentparams)
584 contentparams['value'] = output
585 self.feeddata[element + '_detail'] = contentparams
588 def _mapToStandardPrefix(self, name):
589 colonpos = name.find(':')
591 prefix = name[:colonpos]
592 suffix = name[colonpos+1:]
593 prefix = self.namespacemap.get(prefix, prefix)
594 name = prefix + ':' + suffix
597 def _getAttribute(self, attrsD, name):
598 return attrsD.get(self._mapToStandardPrefix(name))
600 def _save(self, key, value):
602 self.entries[-1].setdefault(key, value)
604 self.feeddata.setdefault(key, value)
606 def _start_rss(self, attrsD):
607 versionmap = {'0.91': 'rss091u',
612 attr_version = attrsD.get('version', '')
613 version = versionmap.get(attr_version)
615 self.version = version
616 elif attr_version.startswith('2.'):
617 self.version = 'rss20'
621 def _start_dlhottitles(self, attrsD):
622 self.version = 'hotrss'
624 def _start_channel(self, attrsD):
626 self._cdf_common(attrsD)
627 _start_feedinfo = _start_channel
629 def _cdf_common(self, attrsD):
630 if attrsD.has_key('lastmod'):
631 self._start_modified({})
632 self.elementstack[-1][-1] = attrsD['lastmod']
634 if attrsD.has_key('href'):
636 self.elementstack[-1][-1] = attrsD['href']
639 def _start_feed(self, attrsD):
641 versionmap = {'0.1': 'atom01',
645 attr_version = attrsD.get('version')
646 version = versionmap.get(attr_version)
648 self.version = version
650 self.version = 'atom'
652 def _end_channel(self):
654 _end_feed = _end_channel
656 def _start_image(self, attrsD):
658 self.push('image', 0)
659 context = self._getContext()
660 context.setdefault('image', FeedParserDict())
662 def _end_image(self):
666 def _start_textinput(self, attrsD):
668 self.push('textinput', 0)
669 context = self._getContext()
670 context.setdefault('textinput', FeedParserDict())
671 _start_textInput = _start_textinput
673 def _end_textinput(self):
674 self.pop('textinput')
676 _end_textInput = _end_textinput
678 def _start_author(self, attrsD):
680 self.push('author', 1)
681 _start_managingeditor = _start_author
682 _start_dc_author = _start_author
683 _start_dc_creator = _start_author
685 def _end_author(self):
688 self._sync_author_detail()
689 _end_managingeditor = _end_author
690 _end_dc_author = _end_author
691 _end_dc_creator = _end_author
693 def _start_contributor(self, attrsD):
694 self.incontributor = 1
695 context = self._getContext()
696 context.setdefault('contributors', [])
697 context['contributors'].append(FeedParserDict())
698 self.push('contributor', 0)
700 def _end_contributor(self):
701 self.pop('contributor')
702 self.incontributor = 0
704 def _start_name(self, attrsD):
708 value = self.pop('name')
710 self._save_author('name', value)
711 elif self.incontributor:
712 self._save_contributor('name', value)
713 elif self.intextinput:
714 context = self._getContext()
715 context['textinput']['name'] = value
717 def _start_width(self, attrsD):
718 self.push('width', 0)
720 def _end_width(self):
721 value = self.pop('width')
727 context = self._getContext()
728 context['image']['width'] = value
730 def _start_height(self, attrsD):
731 self.push('height', 0)
733 def _end_height(self):
734 value = self.pop('height')
740 context = self._getContext()
741 context['image']['height'] = value
743 def _start_url(self, attrsD):
745 _start_homepage = _start_url
746 _start_uri = _start_url
749 value = self.pop('url')
751 self._save_author('url', value)
752 elif self.incontributor:
753 self._save_contributor('url', value)
755 context = self._getContext()
756 context['image']['url'] = value
757 elif self.intextinput:
758 context = self._getContext()
759 context['textinput']['link'] = value
760 _end_homepage = _end_url
763 def _start_email(self, attrsD):
764 self.push('email', 0)
766 def _end_email(self):
767 value = self.pop('email')
769 self._save_author('email', value)
770 elif self.incontributor:
771 self._save_contributor('email', value)
774 def _getContext(self):
776 context = self.entries[-1]
778 context = self.feeddata
781 def _save_author(self, key, value):
782 context = self._getContext()
783 context.setdefault('author_detail', FeedParserDict())
784 context['author_detail'][key] = value
785 self._sync_author_detail()
787 def _save_contributor(self, key, value):
788 context = self._getContext()
789 context.setdefault('contributors', [FeedParserDict()])
790 context['contributors'][-1][key] = value
792 def _sync_author_detail(self, key='author'):
793 context = self._getContext()
794 detail = context.get('%s_detail' % key)
796 name = detail.get('name')
797 email = detail.get('email')
799 context[key] = "%s (%s)" % (name, email)
805 author = context.get(key)
806 if not author: return
807 emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
808 if not emailmatch: return
809 email = emailmatch.group(0)
810 # probably a better way to do the following, but it passes all the tests
811 author = author.replace(email, '')
812 author = author.replace('()', '')
813 author = author.strip()
814 if author and (author[0] == '('):
816 if author and (author[-1] == ')'):
818 author = author.strip()
819 context.setdefault('%s_detail' % key, FeedParserDict())
820 context['%s_detail' % key]['name'] = author
821 context['%s_detail' % key]['email'] = email
823 def _start_tagline(self, attrsD):
825 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
826 'type': attrsD.get('type', 'text/plain'),
827 'language': self.lang,
828 'base': self.baseuri})
829 self.push('tagline', 1)
830 _start_subtitle = _start_tagline
832 def _end_tagline(self):
833 value = self.pop('tagline')
835 self.contentparams.clear()
837 self.feeddata['description'] = value
838 _end_subtitle = _end_tagline
840 def _start_copyright(self, attrsD):
842 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
843 'type': attrsD.get('type', 'text/plain'),
844 'language': self.lang,
845 'base': self.baseuri})
846 self.push('copyright', 1)
847 _start_dc_rights = _start_copyright
849 def _end_copyright(self):
850 self.pop('copyright')
852 self.contentparams.clear()
853 _end_dc_rights = _end_copyright
855 def _start_item(self, attrsD):
856 self.entries.append(FeedParserDict())
860 id = self._getAttribute(attrsD, 'rdf:about')
862 context = self._getContext()
864 self._cdf_common(attrsD)
865 _start_entry = _start_item
866 _start_product = _start_item
871 _end_entry = _end_item
873 def _start_dc_language(self, attrsD):
874 self.push('language', 1)
875 _start_language = _start_dc_language
877 def _end_dc_language(self):
878 self.lang = self.pop('language')
879 _end_language = _end_dc_language
881 def _start_dc_publisher(self, attrsD):
882 self.push('publisher', 1)
883 _start_webmaster = _start_dc_publisher
885 def _end_dc_publisher(self):
886 self.pop('publisher')
887 self._sync_author_detail('publisher')
888 _end_webmaster = _end_dc_publisher
890 def _start_dcterms_issued(self, attrsD):
891 self.push('issued', 1)
892 _start_issued = _start_dcterms_issued
894 def _end_dcterms_issued(self):
895 value = self.pop('issued')
896 self._save('issued_parsed', _parse_date(value))
897 _end_issued = _end_dcterms_issued
899 def _start_dcterms_created(self, attrsD):
900 self.push('created', 1)
901 _start_created = _start_dcterms_created
903 def _end_dcterms_created(self):
904 value = self.pop('created')
905 self._save('created_parsed', _parse_date(value))
906 _end_created = _end_dcterms_created
908 def _start_dcterms_modified(self, attrsD):
909 self.push('modified', 1)
910 _start_modified = _start_dcterms_modified
911 _start_dc_date = _start_dcterms_modified
912 _start_pubdate = _start_dcterms_modified
914 def _end_dcterms_modified(self):
915 value = self.pop('modified')
916 parsed_value = _parse_date(value)
917 self._save('modified_parsed', parsed_value)
918 _end_modified = _end_dcterms_modified
919 _end_dc_date = _end_dcterms_modified
920 _end_pubdate = _end_dcterms_modified
922 def _start_expirationdate(self, attrsD):
923 self.push('expired', 1)
925 def _end_expirationdate(self):
926 self._save('expired_parsed', _parse_date(self.pop('expired')))
928 def _start_cc_license(self, attrsD):
929 self.push('license', 1)
930 value = self._getAttribute(attrsD, 'rdf:resource')
932 self.elementstack[-1][2].append(value)
935 def _start_creativecommons_license(self, attrsD):
936 self.push('license', 1)
938 def _end_creativecommons_license(self):
941 def _start_category(self, attrsD):
942 self.push('category', 1)
943 domain = self._getAttribute(attrsD, 'domain')
946 cats = self.entries[-1].setdefault('categories', [])
948 cats = self.feeddata.setdefault('categories', [])
949 cats.append((domain, None))
950 _start_dc_subject = _start_category
951 _start_keywords = _start_category
953 def _end_category(self):
955 _end_dc_subject = _end_category
956 _end_keywords = _end_category
958 def _start_cloud(self, attrsD):
959 self.feeddata['cloud'] = FeedParserDict(attrsD)
961 def _start_link(self, attrsD):
962 attrsD.setdefault('rel', 'alternate')
963 attrsD.setdefault('type', 'text/html')
964 if attrsD.has_key('href'):
965 attrsD['href'] = self.resolveURI(attrsD['href'])
966 expectingText = self.infeed or self.inentry
968 self.entries[-1].setdefault('links', [])
969 self.entries[-1]['links'].append(FeedParserDict(attrsD))
971 self.feeddata.setdefault('links', [])
972 self.feeddata['links'].append(FeedParserDict(attrsD))
973 if attrsD.has_key('href'):
975 if attrsD.get('type', '') in self.html_types:
977 self.entries[-1]['link'] = attrsD['href']
979 self.feeddata['link'] = attrsD['href']
981 self.push('link', expectingText)
982 _start_producturl = _start_link
985 value = self.pop('link')
987 context = self._getContext()
988 context['textinput']['link'] = value
990 context = self._getContext()
991 context['image']['link'] = value
992 _end_producturl = _end_link
994 def _start_guid(self, attrsD):
995 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
999 value = self.pop('id')
1000 self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1002 # guid acts as link, but only if "ispermalink" is not present or is "true",
1003 # and only if the item doesn't already have a link element
1004 self._save('link', value)
1006 def _start_id(self, attrsD):
1010 value = self.pop('id')
1012 def _start_title(self, attrsD):
1014 if _debug: sys.stderr.write('attrsD.xml:lang = %s\n' % attrsD.get('xml:lang'))
1015 if _debug: sys.stderr.write('self.lang = %s\n' % self.lang)
1016 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1017 'type': attrsD.get('type', 'text/plain'),
1018 'language': self.lang,
1019 'base': self.baseuri})
1020 self.push('title', self.infeed or self.inentry)
1021 _start_dc_title = _start_title
1023 def _end_title(self):
1024 value = self.pop('title')
1026 self.contentparams.clear()
1027 if self.intextinput:
1028 context = self._getContext()
1029 context['textinput']['title'] = value
1031 context = self._getContext()
1032 context['image']['title'] = value
1033 _end_dc_title = _end_title
1035 def _start_description(self, attrsD, default_content_type='text/html'):
1037 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1038 'type': attrsD.get('type', default_content_type),
1039 'language': self.lang,
1040 'base': self.baseuri})
1041 self.push('description', self.infeed or self.inentry)
1043 def _start_abstract(self, attrsD):
1044 return self._start_description(attrsD, 'text/plain')
1046 def _end_description(self):
1047 value = self.pop('description')
1049 self.contentparams.clear()
1050 context = self._getContext()
1051 if self.intextinput:
1052 context['textinput']['description'] = value
1054 context['image']['description'] = value
1055 # elif self.inentry:
1056 # context['summary'] = value
1058 # context['tagline'] = value
1059 _end_abstract = _end_description
1061 def _start_info(self, attrsD):
1063 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1064 'type': attrsD.get('type', 'text/plain'),
1065 'language': self.lang,
1066 'base': self.baseuri})
1067 self.push('info', 1)
1069 def _end_info(self):
1072 self.contentparams.clear()
1074 def _start_generator(self, attrsD):
1076 if attrsD.has_key('url'):
1077 attrsD['url'] = self.resolveURI(attrsD['url'])
1078 self.feeddata['generator_detail'] = FeedParserDict(attrsD)
1079 self.push('generator', 1)
1081 def _end_generator(self):
1082 value = self.pop('generator')
1083 if self.feeddata.has_key('generator_detail'):
1084 self.feeddata['generator_detail']['name'] = value
1086 def _start_admin_generatoragent(self, attrsD):
1087 self.push('generator', 1)
1088 value = self._getAttribute(attrsD, 'rdf:resource')
1090 self.elementstack[-1][2].append(value)
1091 self.pop('generator')
1092 self.feeddata['generator_detail'] = FeedParserDict({"url": value})
1094 def _start_admin_errorreportsto(self, attrsD):
1095 self.push('errorreportsto', 1)
1096 value = self._getAttribute(attrsD, 'rdf:resource')
1098 self.elementstack[-1][2].append(value)
1099 self.pop('errorreportsto')
1101 def _start_summary(self, attrsD):
1103 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1104 'type': attrsD.get('type', 'text/plain'),
1105 'language': self.lang,
1106 'base': self.baseuri})
1107 self.push('summary', 1)
1109 def _end_summary(self):
1110 value = self.pop('summary')
1112 self.entries[-1]['description'] = value
1114 self.contentparams.clear()
1116 def _start_enclosure(self, attrsD):
1118 self.entries[-1].setdefault('enclosures', [])
1119 self.entries[-1]['enclosures'].append(FeedParserDict(attrsD))
1121 def _start_source(self, attrsD):
1123 self.entries[-1]['source'] = FeedParserDict(attrsD)
1124 self.push('source', 1)
1126 def _end_source(self):
1129 def _start_content(self, attrsD):
1131 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1132 'type': attrsD.get('type', 'text/plain'),
1133 'language': self.lang,
1134 'base': self.baseuri})
1135 self.push('content', 1)
1137 def _start_prodlink(self, attrsD):
1139 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1140 'type': attrsD.get('type', 'text/html'),
1141 'language': self.lang,
1142 'base': self.baseuri})
1143 self.push('content', 1)
1145 def _start_body(self, attrsD):
1147 self.contentparams = FeedParserDict({'mode': 'xml',
1148 'type': 'application/xhtml+xml',
1149 'language': self.lang,
1150 'base': self.baseuri})
1151 self.push('content', 1)
1152 _start_xhtml_body = _start_body
1154 def _start_content_encoded(self, attrsD):
1156 self.contentparams = FeedParserDict({'mode': 'escaped',
1157 'type': 'text/html',
1158 'language': self.lang,
1159 'base': self.baseuri})
1160 self.push('content', 1)
1161 _start_fullitem = _start_content_encoded
1163 def _end_content(self):
1164 value = self.pop('content')
1165 if self.contentparams.get('type') in (['text/plain'] + self.html_types):
1166 self._save('description', value)
1168 self.contentparams.clear()
1169 _end_body = _end_content
1170 _end_xhtml_body = _end_content
1171 _end_content_encoded = _end_content
1172 _end_fullitem = _end_content
1173 _end_prodlink = _end_content
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    """SAX-based parser for well-formed feeds; delegates to _FeedParserMixin."""
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        # restored: error-state flags elided in the garbled paste
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        namespace = str(namespace or '')
        if namespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
        prefix = self.namespaces.get(namespace, 'unknown')
        if prefix:  # restored guard elided in the garbled paste
            localname = prefix + ':' + localname
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            prefix = self.namespaces.get(namespace, '')
            if prefix:
                attrlocalname = prefix + ":" + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

#    def resolveEntity(self, publicId, systemId):
#        return _StringIO()

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        namespace = str(namespace)
        prefix = self.namespaces.get(namespace, '')
        if prefix:  # restored guard elided in the garbled paste
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # non-fatal parse error: flag the feed as not well-formed
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML parser that regenerates the HTML it parses, piece by piece."""
    # tags that are self-closing in HTML and take no end tag
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # restored: output fragments accumulate here; output() joins them
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def feed(self, data):
        # escape '<!' that does not open a DOCTYPE/comment/marked section,
        # and expand XML-style empty tags into start/end pairs
        # (entity references below restored — they were decoded in the paste)
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and (type(data) == types.UnicodeType):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
#        if _debug: sys.stderr.write('normalize_attrs, encoding=%s\n' % self.encoding)
#        attrs = [(k, v.encode(self.encoding)) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
        else:
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre"
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append("<!%(text)s>" % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # override of sgmllib's declaration-name scanner with a laxer regex
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        """Return processed HTML as a single string"""
        return "".join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """SGML-based fallback parser for feeds that are not well-formed XML."""
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        # entity reference strings restored — they had been HTML-decoded in
        # the paste, turning every replace() below into a no-op.
        # First normalize numeric references to named ones...
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        # ...then fully decode if the content was declared as escaped markup
        if self.contentparams.get('mode') == 'escaped':
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
class _RelativeURIResolver(_BaseHTMLProcessor):
    """Rewrites relative URIs in sanitized HTML against a base URI."""
    # (tag, attribute) pairs whose values are URIs; entries elided in the
    # garbled paste restored from the HTML 4 attribute list
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return urlparse.urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    """Return htmlSource with all relative URIs resolved against baseURI."""
    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
    p = _RelativeURIResolver(baseURI, encoding)
    # restored: feed the source and return the regenerated HTML
    p.feed(htmlSource)
    return p.output()
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Strips unsafe tags and attributes from embedded HTML."""
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # elements whose entire content must be suppressed, not just the tags
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        # restored def line; tracks nesting depth inside suppressed elements
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return  # restored: drop disallowed tags entirely
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return  # restored: drop disallowed end tags entirely
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are dropped
        pass

    def handle_decl(self, text):
        # declarations are dropped
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def _sanitizeHTML(htmlSource, encoding):
    """Sanitize embedded HTML, optionally tidying it with mxTidy if enabled."""
    p = _HTMLSanitizer(encoding)
    # restored: run the source through the sanitizer
    p.feed(htmlSource)
    data = p.output()
    if _mxtidy and TIDY_MARKUP:
        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        if data.count('<body'):
            data = data.split('<body', 1)[1]
            if data.count('>'):  # restored guard elided in the garbled paste
                data = data.split('>', 1)[1]
        if data.count('</body'):
            data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that follows redirects and records the HTTP status."""
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            # any 3xx except Not Modified is treated as a redirect
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """
    if hasattr(url_file_stream_or_string, "read"):
        return url_file_stream_or_string

    if url_file_stream_or_string == "-":
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = "%s://%s%s" % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header("User-Agent", agent)
        if etag:
            request.add_header("If-None-Match", etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
            months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
            request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header("Referer", referrer)
        # advertise whatever decompression support is actually importable
        if gzip and zlib:
            request.add_header("Accept-encoding", "gzip, deflate")
        elif gzip:
            request.add_header("Accept-encoding", "gzip")
        elif zlib:
            request.add_header("Accept-encoding", "deflate")
        else:
            request.add_header("Accept-encoding", "")
        if auth:
            request.add_header("Authorization", "Basic %s" % auth)
        if ACCEPT_HEADER:
            request.add_header("Accept", ACCEPT_HEADER)
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
# restored: registry of date-parsing functions, tried most-recent first
_date_handlers = []
def registerDateHandler(func):
    """Register a date handler function (takes string, returns 9-tuple date in GMT)"""
    # newest handlers are tried first
    _date_handlers.insert(0, func)
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
# NOTE(review): trailing template entries were elided in the garbled paste
# and restored here — verify against an upstream feedparser copy.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                 '-YY-?MM', '-OOO', '-YY',
                 '--MM-?DD', '--MM',
                 '---DD',
                 'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
def _parse_date_iso8601(dateString):
    """Parse a variety of ISO-8601-compatible formats like 20040105"""
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get("ordinal", 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get("year", "--")
    if not year or year == "--":
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get("month", "-")
    if not month or month == "-":
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get("day", 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get("century", 0) or \
                 params.get("year", 0) or params.get("month", 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if "century" in params.keys():
        year = (int(params["century"]) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get("hour", 0))
    minute = int(params.get("minute", 0))
    second = int(params.get("second", 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # and most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get("tz")
    if tz and tz != "Z":
        if tz[0] == "-":
            tm[3] += int(params.get("tzhour", 0))
            tm[4] += int(params.get("tzmin", 0))
        elif tz[0] == "+":
            tm[3] -= int(params.get("tzhour", 0))
            tm[4] -= int(params.get("tzmin", 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    """Parse a string according to the OnBlog 8-bit date format"""
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # OnBlog timestamps are KST (UTC+9)
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("OnBlog date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
    """Parse a string according to the Nate 8-bit date format"""
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Nate timestamps are KST (UTC+9)
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("Nate date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})\.\d+')
def _parse_date_mssql(dateString):
    """Parse a string according to the MS SQL date format"""
    m = _mssql_date_re.match(dateString)
    if not m: return
    # NOTE(review): timestamps are assumed to be KST (UTC+9), matching the
    # Korean handlers above
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("MS SQL date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    """Parse a string according to a Greek 8-bit date format."""
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        # unknown weekday/month name: not a Greek date after all
        return
    rfc822date = "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s" % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write("Greek date parsed as: %s\n" % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    """Parse a string according to a Hungarian 8-bit date format."""
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        # unknown month name: not a Hungarian date after all
        return
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s" % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write("Hungarian date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these anyway.
def _parse_date_w3dtf(dateString):
    """Parse a W3C-DTF (subset-of-ISO-8601) date; return a 9-tuple in GMT."""
    def __extract_date(m):
        year = int(m.group("year"))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group("julian")
        if julian:
            # ordinal date: approximate month/day, then iterate until the
            # round-tripped day-of-year matches
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group("month")
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group("day")
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group("hours")
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group("minutes"))
        seconds = m.group("seconds")
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        """Return the Time Zone Designator as an offset in seconds from UTC."""
        if not m:
            return 0
        tzd = m.group("tzd")
        if not tzd:
            return 0
        if tzd == "Z":
            return 0
        hours = int(m.group("tzdhours"))
        minutes = m.group("tzdminutes")
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == "+":
            return -offset
        return offset

    __date_re = ("(?P<year>\d\d\d\d)"
                 "(?:(?P<dsep>-|)"
                 "(?:(?P<julian>\d\d\d)"
                 "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
    __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
                 "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
                 + __tzd_re)
    __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
def _parse_date_rfc822(dateString):
    """Parse an RFC822, RFC1123, RFC2822, or asctime-style date"""
    tm = rfc822.parsedate_tz(dateString)
    if tm:  # restored guard elided in the garbled paste
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# "ET" is equivalent to "EST", etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
1978 def _parse_date(dateString):
1979 """Parses a variety of date formats into a 9-tuple in GMT"""
1980 for handler in _date_handlers:
1982 date9tuple = handler(dateString)
1983 if not date9tuple: continue
1984 if len(date9tuple) != 9:
1985 if _debug: sys.stderr.write("date handler function must return 9-tuple\n")
1987 map(int, date9tuple)
1989 except Exception, e:
1990 if _debug: sys.stderr.write("%s raised %s\n" % (handler.__name__, repr(e)))
def _getCharacterEncoding(http_headers, xml_data):
    """Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ("XML Media Types"), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to "utf-8" if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to "us-ascii" if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    "iso-8859-1" as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/

    Returns a 5-tuple: (true_encoding, http_encoding, xml_encoding,
    sniffed_xml_encoding, acceptable_content_type)
    """

    def _parseHTTPContentType(content_type):
        """takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        """
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        # some servers wrap the charset value in quotes; strip them
        return content_type, params.get('charset', '').replace("'", "")

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # data is already ASCII-compatible; nothing to transcode
            pass
        xml_encoding_match = re.compile(r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        # broad except is deliberate: a truncated or lying byte stream must
        # never abort encoding detection, we just fall back to the headers
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # a declared 2/4-byte Unicode encoding is less specific than what we
        # actually sniffed (which includes endianness), so prefer the sniff
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        # RFC 3023: application/* -- charset param wins, then XML declaration
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        # RFC 3023: text/*+xml -- XML declaration is ALWAYS ignored
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        # any other text/* type: us-ascii unless the header says otherwise
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        # HTTP response with no Content-Type at all: RFC 2616 default
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        # no HTTP headers (local file) or unrecognized type
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
def _toUTF8(data, encoding):
    """Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases

    Returns the same document re-encoded as UTF-8 with a UTF-8 XML
    declaration (one is prepended if the document had none).  Raises
    whatever the codec raises if the data is not valid in the chosen
    encoding; parse() catches that and tries the next candidate.
    """
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present); a BOM also overrides a caller-
    # supplied encoding that contradicts it
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    # decode with the (possibly BOM-corrected) encoding; an exception here
    # is the signal to the caller that this encoding guess was wrong
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # replace the existing XML declaration with a UTF-8 one, or prepend
    # a declaration if the document had none
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = """<?xml version='1.0' encoding='utf-8'?>"""
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode("utf-8")
def _stripDoctype(data):
    """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be "rss091n" or None
    stripped_data is the same XML document, minus the DOCTYPE
    """
    # remove inline <!ENTITY ...> declarations first, so an internal DTD
    # subset cannot confuse the DOCTYPE match below (and so crafted entity
    # declarations never reach the real XML parser)
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    # and/or idiom instead of a conditional expression: this file targets
    # Python 2.1, which predates "x if c else y"
    doctype = doctype_results and doctype_results[0] or ''
    # Netscape's RSS 0.91 DTD is the only DOCTYPE we recognize; .count()
    # is used because "substring in string" requires Python 2.3+
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
# parse(): the public entry point of the Universal Feed Parser.
# NOTE(review): this listing is a numbered extract with gaps (e.g. 2206-2207,
# 2210, 2212-2218, 2283-2288, 2296-2302 are absent), so the try/except
# scaffolding, the 'bozo' flag assignments, early returns and some else
# branches are not visible.  The commentary below describes only what the
# visible lines show; do not rewrite from this fragment without the full file.
2201 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2202 """Parse a feed from a URL, file, stream, or string"""
# result skeleton: a FeedParserDict holding feed-level metadata and a list
# of entries; both are filled in by the SAX/loose parser at the bottom
2203 result = FeedParserDict()
2204 result['feed'] = FeedParserDict()
2205 result['entries'] = []
# a single urllib2 handler instance is promoted to a one-element list so it
# can be passed uniformly to _open_resource
2208 if type(handlers) == types.InstanceType:
2209 handlers = [handlers]
# open the feed source (URL, file, stream or string); download errors are
# caught (except clause at 2213) and surfaced through bozo_exception
2211 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2213 except Exception, e:
2215 result['bozo_exception'] = e
# --- transparent decompression of the HTTP payload ---
2219 # if feed is gzip-compressed, decompress it
2220 if f and data and hasattr(f, "headers"):
2221 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2223 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2224 except Exception, e:
2225 # Some feeds claim to be gzipped but they're not, so
2226 # we get garbage. Ideally, we should re-request the
2227 # feed without the "Accept-encoding: gzip" header,
2230 result['bozo_exception'] = e
# "deflate" here is a raw zlib stream: negative wbits suppresses the header
2232 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2234 data = zlib.decompress(data, -zlib.MAX_WBITS)
2235 except Exception, e:
2237 result['bozo_exception'] = e
# --- capture HTTP response metadata for the caller's cache ---
2241 if hasattr(f, "info"):
2243 result["etag"] = info.getheader("ETag")
2244 last_modified = info.getheader("Last-Modified")
2246 result["modified"] = _parse_date(last_modified)
# presence of .url implies a successful fetch: default the status to 200
# and let an explicit .status attribute (redirect-aware openers) override it
2247 if hasattr(f, "url"):
2248 result["url"] = f.url
2249 result["status"] = 200
2250 if hasattr(f, "status"):
2251 result["status"] = f.status
2252 if hasattr(f, "headers"):
2253 result["headers"] = f.headers.dict
2254 if hasattr(f, "close"):
2257 # there are four encodings to keep track of:
2258 # - http_encoding is the encoding declared in the Content-Type HTTP header
2259 # - xml_encoding is the encoding declared in the <?xml declaration
2260 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2261 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2262 http_headers = result.get("headers", {})
2263 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2264 _getCharacterEncoding(http_headers, data)
# a non-XML media type is flagged as bozo per RFC 3023, but parsing proceeds
2265 if http_headers and (not acceptable_content_type):
2266 if http_headers.has_key('content-type'):
2267 bozo_message = '%s is not an XML media type' % http_headers['content-type']
2269 bozo_message = 'no Content-type specified'
2271 result['bozo_exception'] = NonXMLContentType(bozo_message)
2273 result['version'], data = _stripDoctype(data)
# Content-Location/Content-Language seed relative-URI and language tracking
2275 baseuri = http_headers.get('content-location', result.get('url'))
2276 baselang = http_headers.get('content-language', None)
2278 # if server sent 304, we're done
2279 if result.get("status", 0) == 304:
2280 result['version'] = ''
2281 result['debug_message'] = "The feed has not changed since you last checked, " + \
2282 "so the server sent no data. This is a feature, not a bug!"
2285 # if there was a problem downloading, we're done
2289 # determine character encoding
2290 use_strict_parser = 0
# each candidate encoding is tried at most once; _toUTF8 raising means the
# guess was wrong and the next candidate is attempted
2292 tried_encodings = []
2293 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding, 'utf-8', 'windows-1252'):
2294 if proposed_encoding in tried_encodings: continue
2295 if not proposed_encoding: continue
2297 data = _toUTF8(data, proposed_encoding)
2299 use_strict_parser = 1
2303 tried_encodings.append(proposed_encoding)
2304 if not known_encoding:
2306 result['bozo_exception'] = CharacterEncodingUnknown( \
2307 "document encoding unknown, I tried " + \
2308 "%s, %s, utf-8, and windows-1252 but nothing worked" % \
2309 (result['encoding'], xml_encoding))
2310 result['encoding'] = ''
# NOTE(review): "documented declared as" in the message below looks like a
# typo for "document declared as" -- it is a runtime string, left untouched
2311 elif proposed_encoding != result['encoding']:
2313 result['bozo_exception'] = CharacterEncodingOverride( \
2314 "documented declared as %s, but parsed as %s" % \
2315 (result['encoding'], proposed_encoding))
2316 result['encoding'] = proposed_encoding
# --- strict (SAX) parse first, falling back to the loose sgmllib parser ---
2318 if not _XML_AVAILABLE:
2319 use_strict_parser = 0
2320 if use_strict_parser:
2321 # initialize the SAX parser
2322 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2323 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2324 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2325 saxparser.setContentHandler(feedparser)
2326 saxparser.setErrorHandler(feedparser)
2327 source = xml.sax.xmlreader.InputSource()
2328 source.setByteStream(_StringIO(data))
2329 if hasattr(saxparser, '_ns_stack'):
2330 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2331 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
2332 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2334 saxparser.parse(source)
2335 except Exception, e:
2338 traceback.print_stack()
2339 traceback.print_exc()
2340 sys.stderr.write('xml parsing failed\n')
# prefer the error the handler itself recorded over the raised exception
2342 result['bozo_exception'] = feedparser.exc or e
2343 use_strict_parser = 0
2344 if not use_strict_parser:
2345 feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2346 feedparser.feed(data)
2347 result['feed'] = feedparser.feeddata
2348 result['entries'] = feedparser.entries
2349 result['version'] = result['version'] or feedparser.version
# Command-line entry point (fragment).  NOTE(review): lines 2354-2357 and
# 2360+ are missing from this listing, so the url-list setup and the
# parse/pprint loop are not visible here; only the skeleton remains.
2352 if __name__ == '__main__':
2353 if not sys.argv[1:]:
# zopeCompatibilityHack (defined elsewhere in the file, per the 3.3 change
# notes below) downgrades FeedParserDict to a plain dict so pprint output
# is readable
2358 zopeCompatibilityHack()
2359 from pprint import pprint
2368 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2369 # added Simon Fell's test suite
2370 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2372 # JD - use inchannel to watch out for image and textinput elements which can
2373 # also contain title, link, and description elements
2374 # JD - check for isPermaLink="false" attribute on guid elements
2375 # JD - replaced openAnything with open_resource supporting ETag and
2376 # If-Modified-Since request headers
2377 # JD - parse now accepts etag, modified, agent, and referrer optional
2379 # JD - modified parse to return a dictionary instead of a tuple so that any
2380 # etag or modified information can be returned and cached by the caller
2381 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2382 # because of etag/modified, return the old etag/modified to the caller to
2383 # indicate why nothing is being returned
2384 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
2385 # useless. Fixes the problem JD was addressing by adding it.
2386 #2.1 - 11/14/2002 - MAP - added gzip support
2387 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2388 # start_admingeneratoragent is an example of how to handle elements with
2389 # only attributes, no content.
2390 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2391 # also, make sure we send the User-Agent even if urllib2 isn't available.
2392 # Match any variation of backend.userland.com/rss namespace.
2393 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2394 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2395 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2397 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2398 # removed unnecessary urllib code -- urllib2 should always be available anyway;
2399 # return actual url, status, and full HTTP headers (as result['url'],
2400 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
2401 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2402 # added the latest namespace-of-the-week for RSS 2.0
2403 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2404 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
2405 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2406 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2407 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2408 # textInput, and also to return the character encoding (if specified)
2409 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2410 # nested divs within content (JohnD); fixed missing sys import (JohanS);
2411 # fixed regular expression to capture XML character encoding (Andrei);
2412 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
2413 # added support for cloud (MartijnP); added support for multiple
2414 # category/dc:subject (MartijnP); normalize content model: "description" gets
2415 # description (which can come from description, summary, or full content if no
2416 # description), "content" gets dict of base/language/type/value (which can come
2417 # from content:encoded, xhtml:body, content, or fullitem);
2418 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2419 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2420 # <content> element is not in default namespace (like Pocketsoap feed);
2421 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2422 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2423 # description, xhtml:body, content, content:encoded, title, subtitle,
2424 # summary, info, tagline, and copyright; added support for pingback and
2425 # trackback namespaces
2426 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2427 # namespaces, as opposed to 2.6 when I said I did but didn't really;
2428 # sanitize HTML markup within some elements; added mxTidy support (if
2429 # installed) to tidy HTML markup within some elements; fixed indentation
2430 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2431 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2432 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2433 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2434 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2435 #2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
2436 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2437 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2438 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2439 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2440 # fixed relative URI processing for guid (skadz); added ICBM support; added
2442 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2443 # blogspot.com sites); added _debug variable
2444 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2445 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2446 # added several new supported namespaces; fixed bug tracking naked markup in
2447 # description; added support for enclosure; added support for source; re-added
2448 # support for cloud which got dropped somehow; added support for expirationDate
2449 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2450 # xml:base URI, one for documents that don't define one explicitly and one for
2451 # documents that define an outer and an inner xml:base that goes out of scope
2452 # before the end of the document
2453 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2454 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
2455 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2456 # added support for creativeCommons:license and cc:license; added support for
2457 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
2458 # with gzip encoding (not always telling server we support it when we do)
2459 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2460 # (dictionary of "name", "url", "email"); map author to author_detail if author
2461 # contains name + email address
2462 #3.0b8 - 1/28/2004 - MAP - added support for contributor
2463 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2464 # support for summary
2465 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2467 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2468 # dangerous markup; fiddled with decodeEntities (not right); liberalized
2469 # date parsing even further
2470 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2471 # added support to Atom 0.2 subtitle; added support for Atom content model
2472 # in copyright; better sanitizing of dangerous HTML elements with end tags
2473 # (script, frameset)
2474 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2475 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2476 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2478 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2479 # fixed bug capturing author and contributor URL; fixed bug resolving relative
2480 # links in author and contributor URL; fixed bug resolving relative links in
2481 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2482 # namespace tests, and included them permanently in the test suite with his
2483 # permission; fixed namespace handling under Python 2.1
2484 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2485 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2486 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2487 # use libxml2 (if available)
2488 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2489 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2490 # workaround crash in PyXML/expat when encountering invalid entities
2491 # (MarkMoraes); support for textinput/textInput
2492 #3.0b20 - 4/7/2004 - MAP - added CDF support
2493 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2494 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2495 # results dict; changed results dict to allow getting values with results.key
2496 # as well as results[key]; work around embedded illformed HTML with half
2497 # a DOCTYPE; work around malformed Content-Type header; if character encoding
2498 # is wrong, try several common ones before falling back to regexes (if this
2499 # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2500 # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2501 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2502 # convert each value in results to Unicode (if possible), even if using
2503 # regex-based parsing
2504 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2505 # high-bit characters in attributes in embedded HTML in description (thanks
2506 # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2507 # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2508 # about a mapped key
2509 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2510 # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2511 # cause the same encoding to be tried twice (even if it failed the first time);
2512 # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2513 # better textinput and image tracking in illformed RSS 1.0 feeds
2514 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2515 # my blink tag tests
2516 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2517 # failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2518 # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2519 # added support for image; refactored parse() fallback logic to try other
2520 # encodings if SAX parsing fails (previously it would only try other encodings
2521 # if re-encoding failed); remove unichr madness in normalize_attrs now that
2522 # we're properly tracking encoding in and out of BaseHTMLProcessor; set
2523 # feed.language from root-level xml:lang; set entry.id from rdf:about;
2524 # send Accept header
2525 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2526 # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2527 # windows-1252); fixed regression that could cause the same encoding to be
2528 # tried twice (even if it failed the first time)
2529 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2530 # recover from malformed content-type header parameter with no equals sign
2531 # ("text/xml; charset:iso-8859-1")
2532 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2533 # to Unicode equivalents in illformed feeds (aaronsw); added and
2534 # passed tests for converting character entities to Unicode equivalents
2535 # in illformed feeds (aaronsw); test for valid parsers when setting
2536 # XML_AVAILABLE; make version and encoding available when server returns
2537 # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2538 # digest auth or proxy support); add code to parse username/password
2539 # out of url and send as basic authentication; expose downloading-related
2540 # exceptions in bozo_exception (aaronsw); added __contains__ method to
2541 # FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2542 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2543 # convert feed to UTF-8 before passing to XML parser; completely revamped
2544 # logic for determining character encoding and attempting XML parsing
2545 # (much faster); increased default timeout to 20 seconds; test for presence
2546 # of Location header on redirects; added tests for many alternate character
2547 # encodings; support various EBCDIC encodings; support UTF-16BE and
2548 # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
2549 # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2550 # XML parsers are available; added support for "Content-encoding: deflate";
2551 # send blank "Accept-encoding: " header if neither gzip nor zlib modules
2553 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
2554 # problem tracking xml:base and xml:lang if element declares it, child
2555 # doesn't, first grandchild redeclares it, and second grandchild doesn't;
2556 # refactored date parsing; defined public registerDateHandler so callers
2557 # can add support for additional date formats at runtime; added support
2558 # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
2559 # zopeCompatibilityHack() which turns FeedParserDict into a regular
2560 # dictionary, required for Zope compatibility, and also makes command-
2561 # line debugging easier because pprint module formats real dictionaries
2562 # better than dictionary-like objects; added NonXMLContentType exception,
2563 # which is stored in bozo_exception when a feed is served with a non-XML
2564 # media type such as "text/plain"; respect Content-Language as default
2565 # language if not xml:lang is present; cloud dict is now FeedParserDict;
2566 # generator dict is now FeedParserDict; better tracking of xml:lang,
2567 # including support for xml:lang="" to unset the current language;
2568 # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
2569 # namespace; don't overwrite final status on redirects (scenarios:
2570 # redirecting to a URL that returns 304, redirecting to a URL that
2571 # redirects to another URL with a different type of redirect); add
2572 # support for HTTP 303 redirects