2 """Universal feed parser
4 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
6 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
8 Required: Python 2.1 or later
9 Recommended: Python 2.3 or later
10 Recommended: libxml2 <http://xmlsoft.org/python.html>
13 __version__ = "3.0-beta-22"
14 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
15 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
16 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
17 "John Beimler <http://john.beimler.org/>",
18 "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
19 __license__ = "Python"
21 _debug_never_use_libxml2 = 0
23 # if you are embedding feedparser in a larger application, you should change this to your application name and URL
24 USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
26 # If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1.
27 # This is off by default because of reports of crashing on some platforms. If it crashes
28 # for you, please submit a bug report with your OS platform, Python version, and the URL
29 # of the feed you were attempting to parse.
30 # Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
33 # ---------- required modules (should come with any Python distribution) ----------
34 import sgmllib, re, sys, copy, urlparse, time, rfc822, types
36 from cStringIO import StringIO as _StringIO
38 from StringIO import StringIO as _StringIO
40 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
42 # gzip is included with most Python distributions, but may not be available if you compiled your own
48 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
49 # Python 2.3 now has this functionality available in the standard socket library, so under
50 # 2.3 you don't need to install anything. But you probably should anyway, because the socket
51 # module is buggy and timeoutsocket is better.
53 import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
54 timeoutsocket.setDefaultSocketTimeout(10)
57 if hasattr(socket, 'setdefaulttimeout'):
58 socket.setdefaulttimeout(10)
64 from mx.Tidy import Tidy as _mxtidy
68 # If a real XML parser is available, feedparser will attempt to use it. feedparser works
69 # with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python
70 # distribution does not come with an XML parser (such as Mac OS X 10.2 and some versions of
71 # FreeBSD), feedparser will just fall back on regex-based parsing. If XML libraries are
72 # available but the feed turns out not to be well-formed XML, feedparser will fall back
73 # on regex-based parsing and set the "bozo" bit in the results to indicate that the feed
74 # author is a bozo who can't generate well-formed XML. The two advantages of using a real
75 # XML parser are (1) Unicode support, and (2) to get people to stop yelling at me for not
79 from xml.sax.saxutils import escape as _xmlescape
80 class CharacterEncodingOverride(xml.sax.SAXException): pass
85 data = data.replace("&", "&")
86 data = data.replace(">", ">")
87 data = data.replace("<", "<")
90 # base64 support for Atom feeds that contain embedded binary data
92 import base64, binascii
94 base64 = binascii = None
96 # ---------- don't touch these ----------
97 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
98 sgmllib.special = re.compile('<!')
100 SUPPORTED_VERSIONS = {'': 'unknown',
101 'rss090': 'RSS 0.90',
102 'rss091n': 'RSS 0.91 (Netscape)',
103 'rss091u': 'RSS 0.91 (Userland)',
104 'rss092': 'RSS 0.92',
105 'rss093': 'RSS 0.93',
106 'rss094': 'RSS 0.94',
109 'rss': 'RSS (unknown version)',
110 'atom01': 'Atom 0.1',
111 'atom02': 'Atom 0.2',
112 'atom03': 'Atom 0.3',
113 'atom': 'Atom (unknown version)',
121 # Python 2.1 does not have a built-in dict() function
128 from UserDict import UserDict
class FeedParserDict(UserDict):
    """Dictionary with feedparser's backward-compatibility aliases.

    'channel' is an alias for 'feed' and 'items' for 'entries', so code
    written against RSS-centric versions keeps working.  Keys are also
    reachable as attributes (d.title == d['title']).
    """
    def __getitem__(self, key):
        # map legacy RSS-style names onto the canonical keys
        if key == 'channel': key = 'feed'
        if key == 'items': key = 'entries'
        return UserDict.__getitem__(self, key)

    def __getattr__(self, key):
        # real instance attributes win; fall back to dictionary keys
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            return self.__getitem__(key)
        except KeyError:
            # narrow except (was a bare except) so genuine bugs propagate
            raise AttributeError("object has no attribute '%s'" % key)
145 class _FeedParserMixin:
146 namespaces = {"": "",
147 "http://backend.userland.com/rss": "",
148 "http://blogs.law.harvard.edu/tech/rss": "",
149 "http://purl.org/rss/1.0/": "",
150 "http://my.netscape.com/rdf/simple/0.9/": "",
151 "http://example.com/newformat#": "",
152 "http://example.com/necho": "",
153 "http://purl.org/echo/": "",
154 "uri/of/echo/namespace#": "",
155 "http://purl.org/pie/": "",
156 "http://purl.org/atom/ns#": "",
157 "http://purl.org/rss/1.0/modules/rss091#": "",
159 "http://webns.net/mvcb/": "admin",
160 "http://purl.org/rss/1.0/modules/aggregation/": "ag",
161 "http://purl.org/rss/1.0/modules/annotate/": "annotate",
162 "http://media.tangent.org/rss/1.0/": "audio",
163 "http://backend.userland.com/blogChannelModule": "blogChannel",
164 "http://web.resource.org/cc/": "cc",
165 "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
166 "http://purl.org/rss/1.0/modules/company": "co",
167 "http://purl.org/rss/1.0/modules/content/": "content",
168 "http://my.theinfo.org/changed/1.0/rss/": "cp",
169 "http://purl.org/dc/elements/1.1/": "dc",
170 "http://purl.org/dc/terms/": "dcterms",
171 "http://purl.org/rss/1.0/modules/email/": "email",
172 "http://purl.org/rss/1.0/modules/event/": "ev",
173 "http://postneo.com/icbm/": "icbm",
174 "http://purl.org/rss/1.0/modules/image/": "image",
175 "http://xmlns.com/foaf/0.1/": "foaf",
176 "http://freshmeat.net/rss/fm/": "fm",
177 "http://purl.org/rss/1.0/modules/link/": "l",
178 "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
179 "http://prismstandard.org/namespaces/1.2/basic/": "prism",
180 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
181 "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
182 "http://purl.org/rss/1.0/modules/reference/": "ref",
183 "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
184 "http://purl.org/rss/1.0/modules/search/": "search",
185 "http://purl.org/rss/1.0/modules/slash/": "slash",
186 "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
187 "http://hacks.benhammersley.com/rss/streaming/": "str",
188 "http://purl.org/rss/1.0/modules/subscription/": "sub",
189 "http://purl.org/rss/1.0/modules/syndication/": "sy",
190 "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
191 "http://purl.org/rss/1.0/modules/threading/": "thr",
192 "http://purl.org/rss/1.0/modules/textinput/": "ti",
193 "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
194 "http://wellformedweb.org/CommentAPI/": "wfw",
195 "http://purl.org/rss/1.0/modules/wiki/": "wiki",
196 "http://schemas.xmlsoap.org/soap/envelope/": "soap",
197 "http://www.w3.org/1999/xhtml": "xhtml",
198 "http://www.w3.org/XML/1998/namespace": "xml"
201 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments']
202 can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
203 can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
204 html_types = ['text/html', 'application/xhtml+xml']
def __init__(self, baseuri=None, encoding='utf-8'):
    """Initialize parser state.

    baseuri  -- base URI for resolving relative links (xml:base)
    encoding -- character encoding of the document, default UTF-8
    """
    if _debug: sys.stderr.write("initializing FeedParser\n")
    self.feeddata = FeedParserDict() # feed-level data
    self.encoding = encoding         # character encoding
    self.entries = []                # list of entry-level data
    self.version = ''                # feed type/version, see SUPPORTED_VERSIONS

    # The following are used internally to track state; some of this is kind
    # of out of control and should probably be refactored into a finite
    # state machine.  The in* flag initializations were lost in this copy
    # and are reconstructed here -- they are read throughout the class
    # (e.g. unknown_starttag checks self.incontent) and must start at 0.
    self.infeed = 0
    self.inentry = 0
    self.incontent = 0
    self.intextinput = 0
    self.inimage = 0
    self.inauthor = 0
    self.incontributor = 0
    self.contentparams = FeedParserDict()
    self.namespacemap = {}
    self.elementstack = []
    self.basestack = []
    self.langstack = []
    self.baseuri = baseuri or ''
    self.lang = None
def unknown_starttag(self, tag, attrs):
    """Dispatch a start tag to its _start_* handler (or push a generic element).

    Several guard/scaffolding lines were lost in this copy and are
    reconstructed from the visible control flow -- verify against upstream.
    """
    if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
    # normalize attrs: lowercase names, decode numeric charrefs in values,
    # and lowercase rel/type values
    attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
    attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

    # track xml:base and xml:lang
    attrsD = dict(attrs)
    baseuri = attrsD.get('xml:base', attrsD.get('base'))
    if baseuri:
        if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
        self.baseuri = baseuri
    lang = attrsD.get('xml:lang', attrsD.get('lang'))
    if lang:
        self.lang = lang
    self.basestack.append(baseuri)
    self.langstack.append(lang)

    # track namespaces declared on this element
    for prefix, uri in attrs:
        if prefix.startswith('xmlns:'):
            self.trackNamespace(prefix[6:], uri)
        elif prefix == 'xmlns':
            self.trackNamespace(None, uri)

    # track inline content
    if self.incontent and self.contentparams.get('mode') == 'escaped':
        # element declared itself as escaped markup, but it isn't really
        self.contentparams['mode'] = 'xml'
    if self.incontent and self.contentparams.get('mode') == 'xml':
        # Note: probably shouldn't simply recreate localname here, but
        # our namespace handling isn't actually 100% correct in cases where
        # the feed redefines the default namespace (which is actually
        # the usual case for inline content, thanks Sam), so here we
        # cheat and just reconstruct the element based on localname
        # because that compensates for the bugs in our namespace handling.
        # This will horribly munge inline content with non-empty qnames,
        # but nobody actually does that, so I'm not fixing it.
        tag = tag.split(':')[-1]
        return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)

    # match namespaces
    if tag.find(':') != -1:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'

    # call special handler (if defined) or default handler
    methodname = '_start_' + prefix + suffix
    try:
        method = getattr(self, methodname)
        return method(attrsD)
    except AttributeError:
        return self.push(prefix + suffix, 1)
def unknown_endtag(self, tag):
    """Dispatch an end tag to its _end_* handler (or pop the generic element).

    Lost try:/guard lines reconstructed -- verify against upstream.
    """
    if _debug: sys.stderr.write('end %s\n' % tag)
    # match namespaces
    if tag.find(':') != -1:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'

    # call special handler (if defined) or default handler
    methodname = '_end_' + prefix + suffix
    try:
        method = getattr(self, methodname)
        method()
    except AttributeError:
        self.pop(prefix + suffix)

    # track inline content
    if self.incontent and self.contentparams.get('mode') == 'escaped':
        # element declared itself as escaped markup, but it isn't really
        self.contentparams['mode'] = 'xml'
    if self.incontent and self.contentparams.get('mode') == 'xml':
        tag = tag.split(':')[-1]
        self.handle_data("</%s>" % tag, escape=0)

    # track xml:base and xml:lang going out of scope
    if self.basestack:
        self.basestack.pop()
        if self.basestack and self.basestack[-1]:
            baseuri = self.basestack[-1]
            if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
            self.baseuri = baseuri
    if self.langstack:
        lang = self.langstack.pop()
        if lang:
            self.lang = lang
def handle_charref(self, ref):
    # called for each character reference, e.g. for "&#160;", ref will be "160"
    # Reconstruct the original character reference.
    if not self.elementstack: return
    text = "&#%s;" % ref
    self.elementstack[-1][2].append(text)

def handle_entityref(self, ref):
    # called for each entity reference, e.g. for "&copy;", ref will be "copy"
    # Reconstruct the original entity reference.
    if not self.elementstack: return
    text = "&%s;" % ref
    self.elementstack[-1][2].append(text)

def handle_data(self, text, escape=1):
    # called for each block of plain text, i.e. outside of any tag and
    # not containing any character or entity references
    if not self.elementstack: return
    if escape and self.contentparams.get('mode') == 'xml':
        text = _xmlescape(text)
    self.elementstack[-1][2].append(text)

def handle_comment(self, text):
    # called for each comment, e.g. <!-- insert message here -->
    pass

def handle_pi(self, text):
    # called for each processing instruction, e.g. <?instruction>
    pass

def handle_decl(self, text):
    pass

def parse_declaration(self, i):
    # override internal declaration handler to handle CDATA blocks
    if _debug: sys.stderr.write("entering parse_declaration\n")
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1: k = len(self.rawdata)
        self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
        return k+3
    else:
        k = self.rawdata.find('>', i)
        return k+1
def trackNamespace(self, prefix, uri):
    """Record a declared namespace and map its prefix onto our canonical one."""
    if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
        self.version = 'rss090'
    if (prefix, uri) == (None, 'http://purl.org/rss/1.0/') and not self.version:
        self.version = 'rss10'
    if not prefix: return
    if uri.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
    if self.namespaces.has_key(uri):
        self.namespacemap[prefix] = self.namespaces[uri]

def resolveURI(self, uri):
    """Resolve a possibly-relative URI against the current xml:base."""
    return urlparse.urljoin(self.baseuri or '', uri)

def decodeEntities(self, element, data):
    """Decode the five predefined XML entities in escaped-mode content.

    The entity names were collapsed to their literal characters in this
    copy, turning every replace into a no-op; restored here.
    """
    if self.contentparams.get('mode') == 'escaped':
        data = data.replace('&lt;', '<')
        data = data.replace('&gt;', '>')
        data = data.replace('&amp;', '&')
        data = data.replace('&quot;', '"')
        data = data.replace('&apos;', "'")
    return data
def push(self, element, expectingText):
    """Open an element: remember its name, whether character data is
    expected, and an accumulator list for the text pieces."""
    self.elementstack.append([element, expectingText, []])

def pop(self, element):
    """Close an element, post-process its accumulated text, and store it.

    Returns the processed text.  Several branch/guard lines were lost in
    this copy and are reconstructed from the visible control flow --
    verify against upstream feedparser 3.0-beta-22.
    """
    if not self.elementstack: return
    if self.elementstack[-1][0] != element: return

    element, expectingText, pieces = self.elementstack.pop()
    output = "".join(pieces)
    output = output.strip()
    if not expectingText: return output

    # decode base64 content
    if self.contentparams.get('mode') == 'base64' and base64:
        try:
            output = base64.decodestring(output)
        except binascii.Error:
            pass
        except binascii.Incomplete:
            pass

    # resolve relative URIs
    if (element in self.can_be_relative_uri) and output:
        output = self.resolveURI(output)

    # decode entities within embedded markup
    output = self.decodeEntities(element, output)

    # resolve relative URIs within embedded markup
    if element in self.can_contain_relative_uris:
        output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

    # sanitize embedded markup
    if element in self.can_contain_dangerous_markup:
        output = _sanitizeHTML(output, self.encoding)

    # convert byte strings to Unicode using the detected encoding
    if type(output) == types.StringType:
        try:
            output = unicode(output, self.encoding)
        except:
            pass

    # store output in appropriate place(s)
    if self.inentry:
        if element == 'content':
            self.entries[-1].setdefault(element, [])
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            self.entries[-1][element].append(contentparams)
        elif element == 'category':
            self.entries[-1][element] = output
            domain = self.entries[-1]['categories'][-1][0]
            self.entries[-1]['categories'][-1] = (domain, output)
        elif element == 'source':
            self.entries[-1]['source']['value'] = output
        elif element == 'link':
            self.entries[-1][element] = output
            if output:
                self.entries[-1]['links'][-1]['href'] = output
        else:
            self.entries[-1][element] = output
            if self.incontent:
                # entry-level 'description' detail is exposed as 'summary'
                if element == 'description':
                    element = 'summary'
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element + '_detail'] = contentparams
    elif self.infeed and (not self.intextinput) and (not self.inimage):
        self.feeddata[element] = output
        if element == 'category':
            domain = self.feeddata['categories'][-1][0]
            self.feeddata['categories'][-1] = (domain, output)
        elif element == 'link':
            self.feeddata['links'][-1]['href'] = output
        elif self.incontent:
            # feed-level 'description' detail is exposed as 'tagline'
            if element == 'description':
                element = 'tagline'
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            self.feeddata[element + '_detail'] = contentparams
    return output
483 def _mapToStandardPrefix(self, name):
484 colonpos = name.find(':')
486 prefix = name[:colonpos]
487 suffix = name[colonpos+1:]
488 prefix = self.namespacemap.get(prefix, prefix)
489 name = prefix + ':' + suffix
492 def _getAttribute(self, attrsD, name):
493 return attrsD.get(self._mapToStandardPrefix(name))
495 def _save(self, key, value):
498 self.entries[-1].setdefault(key, value)
500 self.feeddata.setdefault(key, value)
502 def _start_rss(self, attrsD):
503 versionmap = {'0.91': 'rss091u',
508 attr_version = attrsD.get('version', '')
509 version = versionmap.get(attr_version)
511 self.version = version
512 elif attr_version.startswith('2.'):
513 self.version = 'rss20'
517 def _start_dlhottitles(self, attrsD):
518 self.version = 'hotrss'
520 def _start_channel(self, attrsD):
522 self._cdf_common(attrsD)
523 _start_feedinfo = _start_channel
525 def _cdf_common(self, attrsD):
526 if attrsD.has_key('lastmod'):
527 if _debug: sys.stderr.write(attrsD['lastmod'] + '\n')
528 self._start_modified({})
529 self.elementstack[-1][-1] = attrsD['lastmod']
531 if attrsD.has_key('href'):
533 self.elementstack[-1][-1] = attrsD['href']
536 def _start_feed(self, attrsD):
538 versionmap = {'0.1': 'atom01',
542 attr_version = attrsD.get('version')
543 version = versionmap.get(attr_version)
545 self.version = version
547 self.version = 'atom'
549 def _end_channel(self):
551 _end_feed = _end_channel
553 def _start_image(self, attrsD):
556 def _end_image(self):
559 def _start_textinput(self, attrsD):
561 self.push('textinput', 0)
562 context = self._getContext()
563 context.setdefault('textinput', FeedParserDict())
564 _start_textInput = _start_textinput
566 def _end_textinput(self):
567 self.pop('textinput')
569 _end_textInput = _end_textinput
571 def _start_author(self, attrsD):
573 self.push('author', 1)
574 _start_managingeditor = _start_author
575 _start_dc_author = _start_author
576 _start_dc_creator = _start_author
578 def _end_author(self):
581 self._sync_author_detail()
582 _end_managingeditor = _end_author
583 _end_dc_author = _end_author
584 _end_dc_creator = _end_author
586 def _start_contributor(self, attrsD):
587 self.incontributor = 1
588 context = self._getContext()
589 context.setdefault('contributors', [])
590 context['contributors'].append(FeedParserDict())
591 self.push('contributor', 0)
593 def _end_contributor(self):
594 self.pop('contributor')
595 self.incontributor = 0
def _start_name(self, attrsD):
    self.push('name', 0)

def _end_name(self):
    value = self.pop('name')
    if self.inauthor:
        self._save_author('name', value)
    elif self.incontributor:
        self._save_contributor('name', value)
    elif self.intextinput:
        context = self._getContext()
        context['textinput']['name'] = value

def _start_url(self, attrsD):
    self.push('url', 1)
_start_homepage = _start_url
_start_uri = _start_url

def _end_url(self):
    value = self.pop('url')
    if self.inauthor:
        self._save_author('url', value)
    elif self.incontributor:
        self._save_contributor('url', value)
    elif self.inimage:
        # NOTE(review): this branch was lost in this copy and is
        # reconstructed by analogy with the textinput branch -- verify.
        context = self._getContext()
        context['image']['url'] = value
    elif self.intextinput:
        context = self._getContext()
        context['textinput']['link'] = value
_end_homepage = _end_url
_end_uri = _end_url

def _start_email(self, attrsD):
    self.push('email', 0)

def _end_email(self):
    value = self.pop('email')
    if self.inauthor:
        self._save_author('email', value)
    elif self.incontributor:
        self._save_contributor('email', value)
641 def _getContext(self):
643 context = self.entries[-1]
645 context = self.feeddata
648 def _save_author(self, key, value):
649 context = self._getContext()
650 context.setdefault('author_detail', FeedParserDict())
651 context['author_detail'][key] = value
652 self._sync_author_detail()
654 def _save_contributor(self, key, value):
655 context = self._getContext()
656 context.setdefault('contributors', [FeedParserDict()])
657 context['contributors'][-1][key] = value
659 def _sync_author_detail(self):
660 context = self._getContext()
661 detail = context.get('author_detail')
663 name = detail.get('name')
664 email = detail.get('email')
666 context['author'] = "%s (%s)" % (name, email)
668 context['author'] = name
670 context['author'] = email
672 author = context.get('author')
673 if not author: return
674 emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
675 if not emailmatch: return
676 email = emailmatch.group(0)
677 # probably a better way to do the following, but it passes all the tests
678 author = author.replace(email, '')
679 author = author.replace('()', '')
680 author = author.strip()
681 if author and (author[0] == '('):
683 if author and (author[-1] == ')'):
685 author = author.strip()
686 context.setdefault('author_detail', FeedParserDict())
687 context['author_detail']['name'] = author
688 context['author_detail']['email'] = email
def _start_tagline(self, attrsD):
    # NOTE(review): the incontent counter lines were lost in this copy and
    # are reconstructed throughout this family of handlers -- verify.
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('tagline', 1)
_start_subtitle = _start_tagline

def _end_tagline(self):
    value = self.pop('tagline')
    self.incontent -= 1
    self.contentparams.clear()
    if self.infeed:
        # tagline doubles as the feed-level description
        self.feeddata['description'] = value
_end_subtitle = _end_tagline

def _start_copyright(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('copyright', 1)
_start_dc_rights = _start_copyright

def _end_copyright(self):
    self.pop('copyright')
    self.incontent -= 1
    self.contentparams.clear()
_end_dc_rights = _end_copyright

def _start_item(self, attrsD):
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item

def _end_item(self):
    self.pop('item')
    self.inentry = 0
_end_entry = _end_item
def _start_dc_language(self, attrsD):
    self.push('language', 1)
_start_language = _start_dc_language

def _end_dc_language(self):
    # remember the feed language for later content params
    self.lang = self.pop('language')
_end_language = _end_dc_language

def _start_dc_publisher(self, attrsD):
    self.push('publisher', 1)
_start_webmaster = _start_dc_publisher

def _end_dc_publisher(self):
    self.pop('publisher')
_end_webmaster = _end_dc_publisher

def _start_dcterms_issued(self, attrsD):
    self.push('issued', 1)
_start_issued = _start_dcterms_issued

def _end_dcterms_issued(self):
    value = self.pop('issued')
    self._save('issued_parsed', _parse_date(value))
_end_issued = _end_dcterms_issued

def _start_dcterms_created(self, attrsD):
    self.push('created', 1)
_start_created = _start_dcterms_created

def _end_dcterms_created(self):
    value = self.pop('created')
    self._save('created_parsed', _parse_date(value))
_end_created = _end_dcterms_created

def _start_dcterms_modified(self, attrsD):
    self.push('modified', 1)
_start_modified = _start_dcterms_modified
_start_dc_date = _start_dcterms_modified
_start_pubdate = _start_dcterms_modified

def _end_dcterms_modified(self):
    value = self.pop('modified')
    if _debug: sys.stderr.write('_end_dcterms_modified, value=' + value + '\n')
    parsed_value = _parse_date(value)
    # the same timestamp is exposed under several legacy keys
    self._save('date', value)
    self._save('date_parsed', parsed_value)
    self._save('modified_parsed', parsed_value)
_end_modified = _end_dcterms_modified
_end_dc_date = _end_dcterms_modified
_end_pubdate = _end_dcterms_modified

def _start_expirationdate(self, attrsD):
    self.push('expired', 1)

def _end_expirationdate(self):
    self._save('expired_parsed', _parse_date(self.pop('expired')))
def _start_cc_license(self, attrsD):
    # Creative Commons license given as an rdf:resource attribute;
    # synthesize push/data/pop so it flows through the normal pipeline
    self.push('license', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('license')

def _start_creativecommons_license(self, attrsD):
    self.push('license', 1)

def _end_creativecommons_license(self):
    self.pop('license')

def _start_category(self, attrsD):
    self.push('category', 1)
    domain = self._getAttribute(attrsD, 'domain')
    # record a placeholder; the value is filled in when the element is popped
    cats = []
    if self.inentry:
        cats = self.entries[-1].setdefault('categories', [])
    elif self.infeed:
        cats = self.feeddata.setdefault('categories', [])
    cats.append((domain, None))
_start_dc_subject = _start_category
_start_keywords = _start_category

def _end_category(self):
    self.pop('category')
_end_dc_subject = _end_category
_end_keywords = _end_category
822 def _start_cloud(self, attrsD):
823 self.feeddata['cloud'] = attrsD
825 def _start_link(self, attrsD):
826 attrsD.setdefault('rel', 'alternate')
827 attrsD.setdefault('type', 'text/html')
828 if attrsD.has_key('href'):
829 attrsD['href'] = self.resolveURI(attrsD['href'])
830 expectingText = self.infeed or self.inentry
832 self.entries[-1].setdefault('links', [])
833 self.entries[-1]['links'].append(attrsD)
835 self.feeddata.setdefault('links', [])
836 self.feeddata['links'].append(attrsD)
837 if attrsD.has_key('href'):
839 if attrsD.get('type', '') in self.html_types:
841 self.entries[-1]['link'] = attrsD['href']
843 self.feeddata['link'] = attrsD['href']
845 self.push('link', expectingText)
846 _start_producturl = _start_link
849 value = self.pop('link')
851 context = self._getContext()
852 context['textinput']['link'] = value
853 _end_producturl = _end_link
855 def _start_guid(self, attrsD):
856 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
860 value = self.pop('guid')
861 self._save('id', value)
863 # guid acts as link, but only if "ispermalink" is not present or is "true",
864 # and only if the item doesn't already have a link element
865 self._save('link', value)
867 def _start_id(self, attrsD):
871 value = self.pop('id')
872 self._save('guid', value)
def _start_title(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('title', self.infeed or self.inentry)
_start_dc_title = _start_title

def _end_title(self):
    value = self.pop('title')
    self.incontent -= 1
    self.contentparams.clear()
    if self.intextinput:
        context = self._getContext()
        context['textinput']['title'] = value
_end_dc_title = _end_title

def _start_description(self, attrsD, default_content_type='text/html'):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', default_content_type),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('description', self.infeed or self.inentry)

def _start_abstract(self, attrsD):
    # CDF <abstract> is plain text, not HTML
    return self._start_description(attrsD, 'text/plain')

def _end_description(self):
    value = self.pop('description')
    self.incontent -= 1
    self.contentparams.clear()
    context = self._getContext()
    if self.intextinput:
        context['textinput']['description'] = value
    elif self.inentry:
        context['summary'] = value
    elif self.infeed:
        context['tagline'] = value
_end_abstract = _end_description

def _start_info(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('info', 1)

def _end_info(self):
    self.pop('info')
    self.incontent -= 1
    self.contentparams.clear()
def _start_generator(self, attrsD):
    if attrsD:
        if attrsD.has_key('url'):
            attrsD['url'] = self.resolveURI(attrsD['url'])
        self.feeddata['generator_detail'] = attrsD
    self.push('generator', 1)

def _end_generator(self):
    value = self.pop('generator')
    if self.feeddata.has_key('generator_detail'):
        self.feeddata['generator_detail']['name'] = value

def _start_admin_generatoragent(self, attrsD):
    # value lives in the rdf:resource attribute; synthesize the data event
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('generator')

def _start_admin_errorreportsto(self, attrsD):
    self.push('errorreportsto', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('errorreportsto')
def _start_summary(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('summary', 1)

def _end_summary(self):
    value = self.pop('summary')
    if self.inentry:
        # summary doubles as the entry description
        self.entries[-1]['description'] = value
    self.incontent -= 1
    self.contentparams.clear()

def _start_enclosure(self, attrsD):
    if self.inentry:
        self.entries[-1].setdefault('enclosures', [])
        self.entries[-1]['enclosures'].append(attrsD)

def _start_source(self, attrsD):
    if self.inentry:
        self.entries[-1]['source'] = attrsD
    self.push('source', 1)

def _end_source(self):
    self.pop('source')

def _start_content(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
                                         'type': attrsD.get('type', 'text/plain'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('content', 1)

def _start_prodlink(self, attrsD):
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
                                         'type': attrsD.get('type', 'text/html'),
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('content', 1)

def _start_body(self, attrsD):
    # inline XHTML body is always well-formed XML
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': 'xml',
                                         'type': 'application/xhtml+xml',
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('content', 1)
_start_xhtml_body = _start_body

def _start_content_encoded(self, attrsD):
    # RSS content:encoded is always escaped HTML
    self.incontent += 1
    self.contentparams = FeedParserDict({'mode': 'escaped',
                                         'type': 'text/html',
                                         'language': attrsD.get('xml:lang', self.lang),
                                         'base': attrsD.get('xml:base', self.baseuri)})
    self.push('content', 1)
_start_fullitem = _start_content_encoded

def _end_content(self):
    value = self.pop('content')
    # plain-text or HTML content doubles as the description
    if self.contentparams.get('type') in (['text/plain'] + self.html_types):
        self._save('description', value)
    self.incontent -= 1
    self.contentparams.clear()
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
_end_prodlink = _end_content
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler):
    """SAX-based parser used when a real XML parser is available and the
    feed is well-formed.  Lost guard/body lines reconstructed -- verify
    against upstream."""

    def __init__(self, baseuri, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        namespace = str(namespace or '')
        if namespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
        prefix = self.namespaces.get(namespace, 'unknown')
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            prefix = self.namespaces.get(namespace, '')
            if prefix:
                attrlocalname = prefix + ":" + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def resolveEntity(self, publicId, systemId):
        # NOTE(review): body was lost in this copy; upstream returns an
        # empty stream so external entities are never fetched -- verify.
        return _StringIO()

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        namespace = str(namespace)
        prefix = self.namespaces.get(namespace, '')
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # non-fatal parse error: flag the feed as bozo but keep going
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    # Base class for the HTML filters below (_RelativeURIResolver,
    # _HTMLSanitizer).  Each sgmllib callback re-emits what it parsed into
    # self.pieces so output() can reassemble the (possibly transformed)
    # markup as a single string.

    # Elements re-emitted self-closed ("<br />") instead of as open/close pairs.
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
        'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        # encoding: charset used to encode unicode input before parsing.
        self.encoding = encoding
        sgmllib.SGMLParser.__init__(self)

        # NOTE(review): the `def reset(self):` header for the next line is
        # elided in this chunk; this call belongs to the reset() override
        # (which also initializes self.pieces).
        sgmllib.SGMLParser.reset(self)

    def feed(self, data):
        # Pre-process raw markup before handing it to sgmllib: normalize
        # declarations and expand improperly self-closed tags ("<br/>")
        # into open/close pairs.
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        # NOTE(review): the two literals below appear to be extraction
        # artifacts of '&apos;' and '&quot;' (cf. the 2.7.1 changelog entry
        # about handling those entities) -- confirm against the real source.
        data = data.replace(''', "'")
        data = data.replace('"', '"')
        if type(data) == types.UnicodeType:
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        # Lowercase attribute names, resolve numeric character references in
        # values, and lowercase the values of rel/type attributes.
        attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        # NOTE(review): the `return attrs` line is elided in this chunk.

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
            # NOTE(review): an `else:` line is elided between these two
            # branches in this chunk.
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre"
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append("<!%(text)s>" % locals())

    # More lenient declaration-name pattern than sgmllib's default (allows
    # '-', '_', '.', ':' inside names).
    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # Override of sgmllib's declaration-name scanner.
        # NOTE(review): several lines of this method are elided in this chunk
        # (the bindings of `n`, `s` and `name`, and the `if m:`/`else:`
        # skeleton around the tail) -- confirm against the full source.
        rawdata = self.rawdata
        if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata))
        m = self._new_declname_match(rawdata, i)
        if (i + len(s)) == n:
            return None, -1 # end of buffer
        return name.lower(), m.end()
        self.handle_data(rawdata)
        # self.updatepos(declstartpos, i)

    # NOTE(review): the `def output(self):` header for the lines below is
    # elided in this chunk, as are the `if _debug:` guards around the loop.
    """Return processed HTML as a single string"""
    for p in self.pieces:
        sys.stderr.write('\n')
    return "".join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    # Regex/sgmllib-based fallback parser, used when no XML library is
    # available or the feed is not well-formed XML.  All element handling
    # comes from _FeedParserMixin; the tolerant tokenizing comes from
    # _BaseHTMLProcessor's sgmllib machinery.
    def __init__(self, baseuri, encoding):
        # Initialize the SGML machinery first, then the shared feed state.
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, encoding)
class _RelativeURIResolver(_BaseHTMLProcessor):
    # Rewrites relative URIs inside HTML markup against a base URI.

    # (tag, attribute) pairs whose values are URIs and should be resolved.
    # NOTE(review): several entries and the closing bracket of this list are
    # elided in this chunk -- confirm against the full source.
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('frame', 'longdesc'),
                     ('iframe', 'longdesc'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('input', 'usemap'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'usemap'),

    def __init__(self, baseuri, encoding):
        # baseuri: document base against which relative URIs are resolved.
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Join a possibly-relative URI with the document base.
        return urlparse.urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        # Normalize attributes, resolve any attribute registered in
        # relative_uris, then re-emit the tag via the base class.
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    # Module-level convenience wrapper around _RelativeURIResolver.
    # NOTE(review): the p.feed(htmlSource)/return p.output() lines are
    # elided in this chunk -- confirm against the full source.
    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
    p = _RelativeURIResolver(baseURI, encoding)
    if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n')
class _HTMLSanitizer(_BaseHTMLProcessor):
    # Strips potentially dangerous markup: only whitelisted elements and
    # attributes are re-emitted, and everything between a suppressed
    # element's start and end tag (script/applet) is dropped.

    # Elements allowed to pass through the sanitizer.
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
        'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
        'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
        'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
        'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
        'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
        'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
        'thead', 'tr', 'tt', 'u', 'ul', 'var']

    # Attributes allowed on whitelisted elements; everything else is dropped.
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
        'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
        'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
        'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
        'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
        'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
        'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
        'usemap', 'valign', 'value', 'vspace', 'width']

    # Elements whose entire content is suppressed until their end tag.
    unacceptable_elements_with_end_tag = ['script', 'applet']

    # NOTE(review): the `def reset(self):` header for the next two lines is
    # elided in this chunk; they belong to the reset() override.
    _BaseHTMLProcessor.reset(self)
    self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        # Track entry into suppressed elements; drop non-whitelisted tags.
        # NOTE(review): the early `return` that normally ends this branch is
        # elided in this chunk.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        # Track exit from suppressed elements; drop non-whitelisted end tags.
        # NOTE(review): the early `return` for this branch is elided here.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # Processing instructions are not re-emitted; body elided in this chunk
        # (presumably `pass` -- confirm).

    def handle_decl(self, text):
        # Declarations are not re-emitted; body elided in this chunk
        # (presumably `pass` -- confirm).

    def handle_data(self, text):
        # Only emit text when not inside a suppressed (script/applet) element.
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def _sanitizeHTML(htmlSource, encoding):
    # Module-level wrapper: sanitize markup with _HTMLSanitizer, then
    # optionally clean it up further with mxTidy (extracting only the
    # contents of the <body> element from tidy's full-document output).
    # NOTE(review): the p.feed()/p.output() lines, an `if data.count('>'):`
    # guard, and the final `return data` are elided in this chunk.
    p = _HTMLSanitizer(encoding)
    if _mxtidy and TIDY_MARKUP:
        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        if data.count('<body'):
            # Keep only what follows the opening <body ...> tag.
            data = data.split('<body', 1)[1]
            data = data.split('>', 1)[1]
        if data.count('</body'):
            data = data.split('</body', 1)[0]
    # Normalize line endings.
    data = data.strip().replace('\r\n', '\n')
class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    # urllib2 handler that follows redirects and records the final HTTP
    # status code on the returned file-like object (infourl.status), so
    # parse() can report it to the caller instead of raising.

    def http_error_default(self, req, fp, code, msg, headers):
        # Treat any 3xx other than 304 as a redirect; for everything else,
        # wrap the response instead of raising and remember the status.
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        from urllib import addinfourl
        infourl = addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        # NOTE(review): the `return infourl` line is elided in this chunk.

    def http_error_302(self, req, fp, code, msg, headers):
        # Delegate the redirect, then stamp the status on the result.
        infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        infourl.status = code
        # NOTE(review): the `return infourl` line is elided in this chunk.

    def http_error_301(self, req, fp, code, msg, headers):
        infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        infourl.status = code
        # NOTE(review): the `return infourl` line is elided in this chunk.

    # 300 (multiple choices) and 307 (temporary redirect) behave like 302.
    http_error_300 = http_error_302
    http_error_307 = http_error_302
def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.
    """
    # NOTE(review): several guard lines are elided in this chunk: the stdin
    # handler under the "-" case, the `if etag:`/`if modified:`/`if referrer:`
    # guards around the header calls, and the try/except skeleton around
    # opener.open()/opener.close() -- confirm against the full source.

    # Already a file-like object: pass it straight through.
    if hasattr(url_file_stream_or_string, "read"):
        return url_file_stream_or_string

    # "-" conventionally means stdin.
    if url_file_stream_or_string == "-":

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header("User-Agent", agent)
        request.add_header("If-None-Match", etag)
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
        months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        request.add_header("Referer", referrer)
        # Ask for gzip; parse() transparently decompresses the response.
        request.add_header("Accept-encoding", "gzip")
        opener = urllib2.build_opener(_FeedURLHandler())
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        return opener.open(request)
        # except ValueError:
        #     # not a valid URL, but might be a valid filename
        # except AssertionError:
        #     # under Python 2.1, non-URLs will fail with an AssertionError;
        #     # still might be a valid filename, so fall through
        return _StringIO('')
        opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    return open(url_file_stream_or_string)

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
1415 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
1416 # Drake and licensed under the Python license. Removed all range checking
1417 # for month, day, hour, minute, and second, since mktime will normalize
def _w3dtf_parse(s):
    # Parse a W3DTF (W3C profile of ISO 8601) date string and return seconds
    # since the epoch in UTC, or fall through if the string does not match.
    # NOTE(review): this function is heavily elided in this chunk -- many
    # guard lines, else-branches, and intermediate bindings (e.g. `jday`,
    # the month/day int conversions, the tzd sign handling) are not visible.
    # Confirm against the full source before changing anything here.

    def __extract_date(m):
        # Extract (year, month, day), handling two-digit years and ordinal
        # ("julian") day-of-year dates, which mktime does not normalize.
        year = int(m.group("year"))
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
        julian = m.group("julian")
        julian = int(julian)
        # First guess the month/day from the ordinal, then iterate with
        # mktime/gmtime until the computed day-of-year matches.
        month = julian / 30 + 1
        day = julian % 30 + 1
        while jday != julian:
            t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
            jday = time.gmtime(t)[-2]
            diff = abs(jday - julian)
            return year, month, day
        month = m.group("month")
        day = m.group("day")
        return year, month, day

    def __extract_time(m):
        # Extract (hours, minutes, seconds); missing components default.
        hours = m.group("hours")
        minutes = int(m.group("minutes"))
        seconds = m.group("seconds")
        seconds = int(seconds)
        return hours, minutes, seconds

    def __extract_tzd(m):
        """Return the Time Zone Designator as an offset in seconds from UTC."""
        tzd = m.group("tzd")
        hours = int(m.group("tzdhours"))
        minutes = m.group("tzdminutes")
        minutes = int(minutes)
        offset = (hours*60 + minutes) * 60

    # Component regexes for the W3DTF grammar; NOTE(review): the separator
    # alternation line of __date_re and the __tzd_re suffix of __time_re are
    # elided in this chunk.
    __date_re = ("(?P<year>\d\d\d\d)"
        "(?:(?P<julian>\d\d\d)"
        "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
    __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
        "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
    __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)

    # The whole string must match, not just a prefix.
    m = __datetime_rx.match(s)
    if m is None or m.group() != s:
    # Assemble a UTC time tuple and convert to epoch seconds, adjusting for
    # the declared time zone and the local mktime offset.
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.mktime(gmt) + __extract_tzd(m) - time.timezone
1514 # Additional ISO-8601 date parsing routines written by Fazal Majid
1515 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1516 # parser is beyond the scope of feedparser and would be a worthwhile addition
1517 # to the Python library
1518 # A single regular expression cannot parse ISO 8601 date formats into groups
1519 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1520 # 0301-04-01), so we use templates instead
1521 # Please note the order in templates is significant because we need a
1523 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1524 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1525 '-YY-?MM', '-OOO', '-YY',
1531 'YYYY', r'(?P<year>\d{4})').replace(
1532 'YY', r'(?P<year>\d\d)').replace(
1533 'MM', r'(?P<month>[01]\d)').replace(
1534 'DD', r'(?P<day>[0123]\d)').replace(
1535 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1536 'CC', r'(?P<century>\d\d$)')
1537 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1538 + r'(:(?P<second>\d{2}))?'
1539 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1540 for tmpl in _iso8601_tmpl]
1543 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
# rfc822.py defines several time zones, but we define some extra ones.
# "ET" is equivalent to "EST", etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
# Merge into rfc822's private table so parsedate_tz understands them too.
rfc822._timezones.update(_additional_timezones)
def _parse_date(date):
    """Parses a variety of date formats into a tuple of 9 integers"""
    # NOTE(review): this function is heavily elided in this chunk -- the
    # try/except skeletons, many `if`/`else:` lines, and bindings such as
    # `m = None`, `weekday = 0`, and the int() conversions of year/month/day
    # are not visible.  Confirm against the full source before editing.

    # try the standard rfc822 library, which handles
    # RFC822, RFC1123, RFC2822, and asctime
    tm = rfc822.parsedate_tz(date)
    return time.gmtime(rfc822.mktime_tz(tm))

    # not a RFC2822 date, try W3DTF profile of ISO-8601
    tm = _w3dtf_parse(date)
    return time.gmtime(tm)

    # try various non-W3DTF ISO-8601-compatible formats like 20040105
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(date)
    # catch truly malformed strings
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get("ordinal", 0)
    ordinal = int(ordinal)
    year = params.get("year", "--")
    if not year or year == "--":
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    month = params.get("month", "-")
    if not month or month == "-":
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        month = time.gmtime()[1]
    day = params.get("day", 0)
    elif params.get("century", 0) or \
        params.get("year", 0) or params.get("month", 0):
        day = time.gmtime()[2]
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if "century" in params.keys():
        year = (int(params["century"]) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
        if not params.get(field, None):
    hour = int(params.get("hour", 0))
    minute = int(params.get("minute", 0))
    second = int(params.get("second", 0))
    # weekday is normalized by mktime(), we can ignore it
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # and most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
        ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get("tz")
    if tz and tz != "Z":
        # Shift the naive time by the declared offset before normalizing.
        tm[3] += int(params.get("tzhour", 0))
        tm[4] += int(params.get("tzmin", 0))
        tm[3] -= int(params.get("tzhour", 0))
        tm[4] -= int(params.get("tzmin", 0))
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s
    # many implementations have bugs, but we'll pretend they don't
    return time.localtime(time.mktime(tm))
def _getCharacterEncoding(http_headers, xml_data):
    """Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds,
    it's not even funny. According to RFC 3023 ("XML Media Types"), if
    the HTTP Content-Type is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to "utf-8" if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to "us-ascii" if not specified. If
    Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    "utf-8" as per the XML specification.
    """
    # NOTE(review): the if/else skeleton lines of both this function's tail
    # and the nested helper below are elided in this chunk -- the visible
    # `return`/assignment lines belong to guarded branches.  Confirm against
    # the full source before editing.

    def _parseHTTPContentType(content_type):
        """takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        """
        if not content_type:
        content_type = content_type.strip()
        paramstr = content_type.split(';')[1:]
        return content_type, ''
        content_type = content_type.split(';', 1)[0].strip().lower()
        # declaration like "text/xml;" (note ending semicolon)
        # dunno if this is malformed but it sure was hard to track down
        return content_type, ''
        # NOTE(review): relies on the `string` module (string.lower/strip);
        # it is not among the imports visible at the top of this file --
        # confirm it is imported.
        params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
        charset = params.get('charset')
        return content_type, ''
        # Strip matching quotes around the charset value.
        if charset[0] in ('"', "'"):
            charset = charset[1:]
        if charset and charset[-1] in ('"', "'"):
            charset = charset[:-1]
        charset = charset.strip()
        return content_type, charset

    true_encoding = None
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
    # Sniff the encoding pseudo-attribute from the <?xml ...?> declaration.
    xml_encoding_match = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or ''
    # application/*: HTTP charset wins, then XML declaration, then utf-8.
    if (http_content_type == 'application/xml') or \
       (http_content_type == 'application/xml-dtd') or \
       (http_content_type == 'application/xml-external-parsed-entity') or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        true_encoding = http_encoding
        true_encoding = xml_encoding
        true_encoding = 'utf-8'
    # text/*: only the HTTP charset counts; default us-ascii per RFC 3023.
    elif (http_content_type == 'text/xml') or \
         (http_content_type == 'text/xml-external-parsed-entity') or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        true_encoding = http_encoding
        true_encoding = 'us-ascii'
        # No usable Content-Type: trust the XML declaration, default utf-8.
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding
def _changeEncodingDeclaration(data, encoding):
    """Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    """
    if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n')
    if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding)
    #import cjkcodecs.aliases
    # Decode with the proposed encoding (raises if the bytes don't actually
    # decode -- the caller uses that failure to try the next candidate),
    # then rewrite or insert the <?xml ...?> declaration and re-encode.
    data = unicode(data, encoding)
    declmatch = re.compile(u'^<\?xml[^>]*?>')
    newdecl = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
    if declmatch.search(data):
        data = declmatch.sub(newdecl, data)
        # NOTE(review): an `else:` line is elided here in this chunk; the
        # next line is the no-existing-declaration branch.
        data = newdecl + u'\n' + data
    return data.encode(encoding)
def _stripDoctype(data):
    """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be "rss091n" or None
    stripped_data is the same XML document, minus the DOCTYPE
    """
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    # A Netscape DOCTYPE identifies the nonstandard RSS 0.91 variant.
    if doctype.lower().count('netscape'):
        # NOTE(review): the `version = ...` assignments for both branches are
        # elided in this chunk -- `version` is bound in the missing lines.
    data = doctype_pattern.sub('', data)
    return version, data
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
    """Parse a feed from a URL, file, stream, or string"""
    # NOTE(review): this function is heavily elided in this chunk -- the
    # `data = f.read()` line, try/except skeletons, `f.close()`, the 304
    # early `return result`, and several if/else lines are not visible.
    # Confirm against the full source before editing.
    result = FeedParserDict()
    f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
    # Transparently decompress gzip-encoded HTTP responses.
    if hasattr(f, "headers"):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            # some feeds claim to be gzipped but they're not, so we get garbage
    # Record caching metadata so the caller can pass it back next time.
    if hasattr(f, "info"):
        result["etag"] = info.getheader("ETag")
        last_modified = info.getheader("Last-Modified")
        result["modified"] = _parse_date(last_modified)
    if hasattr(f, "url"):
        result["url"] = f.url
        result["status"] = 200 # default, may be overridden later
    if hasattr(f, "status"):
        result["status"] = f.status
    if hasattr(f, "headers"):
        result["headers"] = f.headers.dict
    # 304 means "not modified": return empty feed/entries with an explanation.
    if result.get("status", 0) == 304:
        result['feed'] = FeedParserDict()
        result['entries'] = []
        result['debug_message'] = "The feed has not changed since you last checked, so the server sent no data. This is a feature, not a bug!"
    result['encoding'], http_encoding, xml_encoding = _getCharacterEncoding(result.get("headers", {}), data)
    result['version'], data = _stripDoctype(data)
    # Base URI for resolving relative links: Content-Location beats the
    # request URL.
    baseuri = result.get('headers', {}).get('content-location', result.get('url'))
    # try true XML parser first
    if not _XML_AVAILABLE:
        if _debug: sys.stderr.write('no xml libraries available\n')
    use_strict_parser = _XML_AVAILABLE
    if use_strict_parser:
        if _debug: sys.stderr.write('using xml library\n')
        feedparser = _StrictFeedParser(baseuri, result['encoding'])
        if _debug and _debug_never_use_libxml2:
            sys.stderr.write('not using libxml2 (even if available)\n')
        additional_parsers = []
        additional_parsers = ["drv_libxml2"]
        saxparser = xml.sax.make_parser(additional_parsers)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        saxparser.setDTDHandler(feedparser)
        except xml.sax.SAXNotSupportedException:
            # libxml2 driver does not support DTDHandler
            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler (not a big deal)\n')
        saxparser.setEntityResolver(feedparser)
        except xml.sax.SAXNotSupportedException:
            # libxml2 driver does not support EntityResolver
            if _debug: sys.stderr.write('using an xml library that does not support EntityResolver (not a big deal)\n')
    # If the declared encoding disagrees with what we detected, rewrite the
    # XML declaration, trying candidates until one actually decodes.
    encoding_set = (result['encoding'] == xml_encoding)
    if not encoding_set:
        bozo_exception = None
        proposed_encodings = [result['encoding'], xml_encoding, 'utf-8', 'iso-8859-1', 'windows-1252']
        tried_encodings = []
        for proposed_encoding in proposed_encodings:
            # NOTE(review): this tests the whole list `proposed_encodings`
            # against tried_encodings, so the dedupe never fires; it looks
            # like it should be the singular `proposed_encoding` -- confirm.
            if proposed_encodings in tried_encodings: continue
            tried_encodings.append(proposed_encoding)
            data = _changeEncodingDeclaration(data, proposed_encoding)
            except Exception, bozo_exception:
                if _debug: sys.stderr.write('character encoding is wrong\n')
            if proposed_encoding != result['encoding']:
                # Flag the override as a bozo condition but keep going.
                raise CharacterEncodingOverride, "document declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding)
            except CharacterEncodingOverride, bozo_exception:
                result['bozo_exception'] = bozo_exception
            result['encoding'] = proposed_encoding
    if not encoding_set:
        # No candidate encoding worked: report it and fall back to the
        # loose parser.
        result['bozo_exception'] = bozo_exception
        use_strict_parser = 0
    if use_strict_parser:
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        saxparser.parse(source)
        except Exception, e:
            if _debug: sys.stderr.write('xml parsing failed\n')
            feedparser.bozo_exception = feedparser.exc or e
            # feed is not well-formed XML, fall back on regex-based parser
            result['bozo_exception'] = feedparser.bozo_exception
            use_strict_parser = 0
    if not use_strict_parser:
        if _debug: sys.stderr.write('using regexes, now you have two problems\n')
        feedparser = _LooseFeedParser(baseuri, result['encoding'])
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
if __name__ == '__main__':
    # Command-line test harness: parse the URLs given as arguments and
    # pretty-print the results.  Most of its body is elided in this chunk
    # (the no-arguments usage/default branch is not visible).
    if not sys.argv[1:]:
    from pprint import pprint
1894 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
1895 # added Simon Fell's test suite
1896 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
1898 # JD - use inchannel to watch out for image and textinput elements which can
1899 # also contain title, link, and description elements
1900 # JD - check for isPermaLink="false" attribute on guid elements
1901 # JD - replaced openAnything with open_resource supporting ETag and
1902 # If-Modified-Since request headers
1903 # JD - parse now accepts etag, modified, agent, and referrer optional
1905 # JD - modified parse to return a dictionary instead of a tuple so that any
1906 # etag or modified information can be returned and cached by the caller
1907 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
1908 # because of etag/modified, return the old etag/modified to the caller to
1909 # indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
1911 # useless. Fixes the problem JD was addressing by adding it.
1912 #2.1 - 11/14/2002 - MAP - added gzip support
1913 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
1914 # start_admingeneratoragent is an example of how to handle elements with
1915 # only attributes, no content.
1916 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
1917 # also, make sure we send the User-Agent even if urllib2 isn't available.
1918 # Match any variation of backend.userland.com/rss namespace.
1919 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
1920 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
1921 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
1923 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
1924 # removed unnecessary urllib code -- urllib2 should always be available anyway;
1925 # return actual url, status, and full HTTP headers (as result['url'],
1926 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
1927 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
1928 # added the latest namespace-of-the-week for RSS 2.0
1929 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
1930 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
1931 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
1932 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
1933 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
1934 # textInput, and also to return the character encoding (if specified)
1935 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
1936 # nested divs within content (JohnD); fixed missing sys import (JohanS);
1937 # fixed regular expression to capture XML character encoding (Andrei);
1938 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
1939 # added support for cloud (MartijnP); added support for multiple
1940 # category/dc:subject (MartijnP); normalize content model: "description" gets
1941 # description (which can come from description, summary, or full content if no
1942 # description), "content" gets dict of base/language/type/value (which can come
1943 # from content:encoded, xhtml:body, content, or fullitem);
1944 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
1945 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
1946 # <content> element is not in default namespace (like Pocketsoap feed);
1947 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
1948 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
1949 # description, xhtml:body, content, content:encoded, title, subtitle,
1950 # summary, info, tagline, and copyright; added support for pingback and
1951 # trackback namespaces
1952 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
1953 # namespaces, as opposed to 2.6 when I said I did but didn't really;
1954 # sanitize HTML markup within some elements; added mxTidy support (if
1955 # installed) to tidy HTML markup within some elements; fixed indentation
1956 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
1957 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
1958 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
1959 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
1960 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
1961 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
1962 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
1963 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
1964 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
1965 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
1966 # fixed relative URI processing for guid (skadz); added ICBM support; added
1967 # lat/long support (Andrew Grumet)
1968 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
1969 # blogspot.com sites); added _debug variable
1970 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
1971 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
1972 # added several new supported namespaces; fixed bug tracking naked markup in
1973 # description; added support for enclosure; added support for source; re-added
1974 # support for cloud which got dropped somehow; added support for expirationDate
1975 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
1976 # xml:base URI, one for documents that don't define one explicitly and one for
1977 # documents that define an outer and an inner xml:base that goes out of scope
1978 # before the end of the document
1979 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
1980 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
1981 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
1982 # added support for creativeCommons:license and cc:license; added support for
1983 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
1984 # with gzip encoding (not always telling server we support it when we do)
1985 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
1986 # (dictionary of "name", "url", "email"); map author to author_detail if author
1987 # contains name + email address
1988 #3.0b8 - 1/28/2004 - MAP - added support for contributor
1989 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
1990 # support for summary
1991 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
1992 # xml.util.iso8601
1993 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
1994 # dangerous markup; fiddled with decodeEntities (not right); liberalized
1995 # date parsing even further
1996 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
1997 # added support to Atom 0.2 subtitle; added support for Atom content model
1998 # in copyright; better sanitizing of dangerous HTML elements with end tags
1999 # (script, frameset)
2000 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2001 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2002 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2003 # Python 2.1
2004 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2005 # fixed bug capturing author and contributor URL; fixed bug resolving relative
2006 # links in author and contributor URL; fixed bug resolving relative links in
2007 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2008 # namespace tests, and included them permanently in the test suite with his
2009 # permission; fixed namespace handling under Python 2.1
2010 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2011 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2012 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2013 # use libxml2 (if available)
2014 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2015 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2016 # workaround crash in PyXML/expat when encountering invalid entities
2017 # (MarkMoraes); support for textinput/textInput
2018 #3.0b20 - 4/7/2004 - MAP - added CDF support
2019 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2020 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2021 # results dict; changed results dict to allow getting values with results.key
2022 # as well as results[key]; work around embedded illformed HTML with half
2023 # a DOCTYPE; work around malformed Content-Type header; if character encoding
2024 # is wrong, try several common ones before falling back to regexes (if this
2025 # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2026 # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2027 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2028 # convert each value in results to Unicode (if possible), even if using
2029 # regex-based parsing