2 """Universal feed parser
4 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
6 Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
8 Things it handles that choke other parsers:
9 - bastard combinations of RSS 0.9x and RSS 1.0
10 - illegal 8-bit XML characters
11 - naked and/or invalid HTML in description
12 - content:encoded, xhtml:body, fullitem
14 - elements in non-standard namespaces or non-default namespaces
15 - multiple content items per entry (Atom)
16 - multiple links per entry (Atom)
19 - resolves relative URIs in some elements
20 - uses xml:base to define base URI
21 - uses URI of feed if no xml:base is given
22 - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri
23 - resolves relative URIs within embedded markup
24 - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris
25 - sanitizes embedded markup in some elements
26 - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements
27 - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes
28 - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup
29 - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = []
30 - optionally tidies embedded markup
31 - fixes malformed HTML
33 - converts character entities to numeric entities
34 - requires mxTidy <http://www.lemburg.com/files/python/mxTidy.html>
36 Required: Python 2.1 or later
37 Recommended: Python 2.3 or later
# Package metadata, exposed as module-level dunder attributes.
__version__ = "3.0-beta-14"
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
# Contributors of patches/fixes, with their homepages.
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
__license__ = "Python"
# if you are embedding feedparser in a larger application, you should change this to your application name and URL
# NOTE(review): _debug is not assigned anywhere in this copy of the file;
# presumably a module-level 0/1 debug flag is set on a missing earlier line.
USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
52 # ---------- required modules (should come with any Python distribution) ----------
53 import sgmllib, re, sys, copy, urlparse, time, rfc822
55 from cStringIO import StringIO as _StringIO
57 from StringIO import StringIO as _StringIO
59 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
61 # gzip is included with most Python distributions, but may not be available if you compiled your own
67 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
68 # Python 2.3 now has this functionality available in the standard socket library, so under
69 # 2.3 you don't need to install anything.
71 if hasattr(socket, 'setdefaulttimeout'):
72 socket.setdefaulttimeout(10)
75 import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
76 timeoutsocket.setDefaultSocketTimeout(10)
81 # mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
82 # this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
84 from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
88 # If a real XML parser is available, feedparser will attempt to use it. feedparser works
89 # with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python
90 # distribution does not come with an XML parser (such as Mac OS X 10.2 and some versions of
91 # FreeBSD), feedparser will just fall back on regex-based parsing. If XML libraries are
92 # available but the feed turns out not to be well-formed XML, feedparser will fall back
93 # on regex-based parsing and set the "bozo" bit in the results to indicate that the feed
94 # author is a bozo who can't generate well-formed XML. The two advantages of using a real
95 # XML parser are (1) Unicode support, and (2) to get people to stop yelling at me for not
# Prefer the real escaper from the standard library; fall back to a minimal
# hand-rolled one when xml.sax is not available on this platform.
# Fix: in this copy the fallback's replace() calls had been entity-decoded
# into no-ops (e.g. data.replace("&", "&") ), so nothing was ever escaped;
# restore the proper entity replacements, ampersands first so they are not
# double-escaped.
try:
    from xml.sax.saxutils import escape as xmlescape
except ImportError:
    def xmlescape(data):
        """Escape &, < and > for embedding text in XML markup."""
        data = data.replace("&", "&amp;")
        data = data.replace(">", "&gt;")
        data = data.replace("<", "&lt;")
        return data
109 # base64 support for Atom feeds that contain embedded binary data
111 import base64, binascii
113 base64 = binascii = None
# ---------- don't touch these ----------
# Monkeypatch sgmllib so it copes with feeds: allow ':' in tag names (the
# pattern below admits namespaced names like dc:title as a single token)
# and treat only genuine '<!' declarations as special.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
# Internal feed-version identifiers (stored in result['version']) mapped to
# human-readable names.
# Fix: in this copy the dict literal was unterminated and the 'rss10'/'rss20'
# entries were missing, although the parser assigns self.version = 'rss20'
# elsewhere in this file; restore the entries and the closing brace.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss10': 'RSS 1.0',
                      'rss20': 'RSS 2.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom': 'Atom (unknown version)'
                      }
138 # Python 2.1 does not have a built-in dict() function
145 class _FeedParserMixin:
146 namespaces = {"http://backend.userland.com/rss": "",
147 "http://blogs.law.harvard.edu/tech/rss": "",
148 "http://purl.org/rss/1.0/": "",
149 "http://example.com/newformat#": "",
150 "http://example.com/necho": "",
151 "http://purl.org/echo/": "",
152 "uri/of/echo/namespace#": "",
153 "http://purl.org/pie/": "",
154 "http://purl.org/atom/ns#": "",
155 "http://purl.org/rss/1.0/modules/rss091#": "",
157 "http://webns.net/mvcb/": "admin",
158 "http://purl.org/rss/1.0/modules/aggregation/": "ag",
159 "http://purl.org/rss/1.0/modules/annotate/": "annotate",
160 "http://media.tangent.org/rss/1.0/": "audio",
161 "http://backend.userland.com/blogChannelModule": "blogChannel",
162 "http://web.resource.org/cc/": "cc",
163 "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
164 "http://purl.org/rss/1.0/modules/company": "co",
165 "http://purl.org/rss/1.0/modules/content/": "content",
166 "http://my.theinfo.org/changed/1.0/rss/": "cp",
167 "http://purl.org/dc/elements/1.1/": "dc",
168 "http://purl.org/dc/terms/": "dcterms",
169 "http://purl.org/rss/1.0/modules/email/": "email",
170 "http://purl.org/rss/1.0/modules/event/": "ev",
171 "http://postneo.com/icbm/": "icbm",
172 "http://purl.org/rss/1.0/modules/image/": "image",
173 "http://xmlns.com/foaf/0.1/": "foaf",
174 "http://freshmeat.net/rss/fm/": "fm",
175 "http://purl.org/rss/1.0/modules/link/": "l",
176 "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
177 "http://prismstandard.org/namespaces/1.2/basic/": "prism",
178 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
179 "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
180 "http://purl.org/rss/1.0/modules/reference/": "ref",
181 "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
182 "http://purl.org/rss/1.0/modules/search/": "search",
183 "http://purl.org/rss/1.0/modules/slash/": "slash",
184 "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
185 "http://hacks.benhammersley.com/rss/streaming/": "str",
186 "http://purl.org/rss/1.0/modules/subscription/": "sub",
187 "http://purl.org/rss/1.0/modules/syndication/": "sy",
188 "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
189 "http://purl.org/rss/1.0/modules/threading/": "thr",
190 "http://purl.org/rss/1.0/modules/textinput/": "ti",
191 "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
192 "http://wellformedweb.org/CommentAPI/": "wfw",
193 "http://purl.org/rss/1.0/modules/wiki/": "wiki",
194 "http://schemas.xmlsoap.org/soap/envelope/": "soap",
195 "http://www.w3.org/1999/xhtml": "xhtml",
196 "http://www.w3.org/XML/1998/namespace": "xml"
199 can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
200 can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
201 can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
202 html_types = ['text/html', 'application/xhtml+xml']
def __init__(self, baseuri=None):
    """Initialize parser state.

    baseuri, if given, is used to resolve relative URIs found in the feed.
    """
    if _debug: sys.stderr.write("initializing FeedParser\n")
    self.channel = {} # channel- or feed-level data
    self.items = [] # list of item- or entry-level data
    self.version = '' # feed type/version, see SUPPORTED_VERSIONS

    # the following are used internally to track state;
    # some of this is kind of out of control and should
    # probably be refactored into a finite state machine
    # Fix: this copy of the file had lost the initialization of most of
    # the state flags and stacks that other methods read (self.inchannel,
    # self.initem, self.incontent, self.intextinput, self.inimage,
    # self.basestack, self.langstack, self.lang, self.guidislink), which
    # would raise AttributeError at parse time; restore them.
    self.inchannel = 0
    self.initem = 0
    self.incontent = 0
    self.intextinput = 0
    self.inimage = 0
    self.incontributor = 0
    self.guidislink = 0
    self.contentparams = {}
    self.namespacemap = {}
    self.elementstack = []
    self.basestack = []
    self.langstack = []
    self.baseuri = baseuri or ''
    self.lang = None
def unknown_starttag(self, tag, attrs):
    # Entry point for every start tag: normalizes attributes, tracks
    # xml:base / xml:lang and namespace declarations, then dispatches to a
    # _start_<element> handler if one is defined, else pushes the element.
    # NOTE(review): this listing is gap-sampled; several statements are
    # missing (noted inline).
    if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
    # lower-case attribute names, resolve numeric charrefs in values
    attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
    # 'rel' and 'type' values are case-insensitive; fold them
    attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

    # track xml:base and xml:lang
    # NOTE(review): attrsD is never assigned in this copy (presumably
    # attrsD = dict(attrs) on a missing line), and the guards around the
    # baseuri/lang bookkeeping are absent.
    baseuri = attrsD.get('xml:base')
    if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
    self.baseuri = baseuri
    lang = attrsD.get('xml:lang')
    self.basestack.append(baseuri)
    self.langstack.append(lang)

    # track namespace declarations (xmlns / xmlns:prefix attributes)
    for prefix, uri in attrs:
        if prefix.startswith('xmlns:'):
            self.trackNamespace(prefix[6:], uri)
        elif prefix == 'xmlns':
            self.trackNamespace(None, uri)

    # track inline content
    if self.incontent and self.contentparams.get('mode') == 'escaped':
        # element declared itself as escaped markup, but it isn't really
        self.contentparams['mode'] = 'xml'
    if self.incontent and self.contentparams.get('mode') == 'xml':
        # Note: probably shouldn't simply recreate localname here, but
        # our namespace handling isn't actually 100% correct in cases where
        # the feed redefines the default namespace (which is actually
        # the usual case for inline content, thanks Sam), so here we
        # cheat and just reconstruct the element based on localname
        # because that compensates for the bugs in our namespace handling.
        # This will horribly munge inline content with non-empty qnames,
        # but nobody actually does that, so I'm not fixing it.
        tag = tag.split(':')[-1]
        return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)

    # match namespaces
    # NOTE(review): the try/except around the split is missing from this
    # copy; as written the second assignment clobbers the first, and the
    # 'if prefix:' guard before the underscore append is also absent.
    prefix, suffix = tag.split(':', 1)
    prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    prefix = prefix + '_'

    # call special handler (if defined) or default handler
    # NOTE(review): the 'try:' matching the except below is missing.
    methodname = '_start_' + prefix + suffix
    method = getattr(self, methodname)
    return method(attrsD)
    except AttributeError:
        return self.push(prefix + suffix, 1)
def unknown_endtag(self, tag):
    # Entry point for every end tag: dispatch to a _end_<element> handler
    # (or pop the element), emit inline-content markup, and unwind the
    # xml:base / xml:lang stacks.
    # NOTE(review): gap-sampled copy; missing statements noted inline.
    if _debug: sys.stderr.write('end %s\n' % tag)
    # match namespaces
    # NOTE(review): the try/except around the split and the 'if prefix:'
    # guard are missing from this copy.
    prefix, suffix = tag.split(':', 1)
    prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    prefix = prefix + '_'

    # call special handler (if defined) or default handler
    # NOTE(review): the 'try:' and the method() call are missing here.
    methodname = '_end_' + prefix + suffix
    method = getattr(self, methodname)
    except AttributeError:
        self.pop(prefix + suffix)

    # track inline content
    if self.incontent and self.contentparams.get('mode') == 'escaped':
        # element declared itself as escaped markup, but it isn't really
        self.contentparams['mode'] = 'xml'
    if self.incontent and self.contentparams.get('mode') == 'xml':
        tag = tag.split(':')[-1]
        self.handle_data("</%s>" % tag, escape=0)

    # track xml:base and xml:lang going out of scope
    # NOTE(review): the basestack pop and the langstack guards appear to
    # be missing from this copy.
    if self.basestack and self.basestack[-1]:
        baseuri = self.basestack[-1]
        if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
        self.baseuri = baseuri
    lang = self.langstack.pop()
def handle_charref(self, ref):
    """Reconstruct a numeric character reference verbatim and append it to
    the text of the currently open element.

    Fix: in this copy 'text' was used without ever being assigned (the
    reconstruction line was missing), which raised NameError; restore it.
    """
    # called for each character reference, e.g. for "&#160;", ref will be "160"
    if not self.elementstack: return
    text = "&#%s;" % ref
    self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
    """Reconstruct a named entity reference verbatim and append it to the
    text of the currently open element.

    Fix: in this copy 'text' was used without ever being assigned (the
    reconstruction line was missing), which raised NameError; restore it.
    """
    # called for each entity reference, e.g. for "&copy;", ref will be "copy"
    if not self.elementstack: return
    text = "&%s;" % ref
    self.elementstack[-1][2].append(text)
def handle_data(self, text, escape=1):
    """Append a run of plain character data to the open element's text,
    XML-escaping it first when the current content is parsed as XML."""
    if not self.elementstack:
        return
    should_escape = escape and self.contentparams.get('mode') == 'xml'
    if should_escape:
        text = xmlescape(text)
    self.elementstack[-1][2].append(text)
def handle_comment(self, text):
    """Comments inside the feed are deliberately ignored.

    Fix: the no-op body was missing in this copy, leaving the def without
    a statement; restore the 'pass'.
    """
    # called for each comment, e.g. <!-- insert message here -->
    pass
def handle_pi(self, text):
    """Processing instructions inside the feed are deliberately ignored.

    Fix: the no-op body was missing in this copy, leaving the def without
    a statement; restore the 'pass'.
    """
    # called for each processing instruction, e.g. <?instruction>
    pass
def handle_decl(self, text):
    """Inspect a DOCTYPE declaration; if it names the Netscape RSS 0.91
    DTD, record the feed version as 'rss091n'."""
    # called for the DOCTYPE, if present, e.g.
    # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    # "http://www.w3.org/TR/html4/loose.dtd">
    netscape_dtd = 'http://my.netscape.com/publish/formats/rss-0.91.dtd'
    if text.count(netscape_dtd):
        self.version = 'rss091n'
# Looser declaration-name pattern than sgmllib's default: admits ':' so
# namespaced names survive scanning.
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
    # Override of sgmllib's declaration-name scanner using the pattern
    # above.
    # NOTE(review): several lines are missing from this copy -- 's', 'n'
    # and 'name' are never assigned here, and the error branch after the
    # returns is unreachable as written; recover the missing lines before
    # relying on this method.
    rawdata = self.rawdata
    m = self._new_declname_match(rawdata, i)
    if (i + len(s)) == n:
        return None, -1 # end of buffer
    return name.lower(), m.end()
    self.updatepos(declstartpos, i)
    self.error("expected name token")
def parse_declaration(self, i):
    # override internal declaration handler to handle CDATA blocks
    if _debug: sys.stderr.write("entering parse_declaration\n")
    # A Netscape RSS 0.91 feed announces itself with a specific DOCTYPE.
    if re.search(r'^<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', self.rawdata[i:]):
        if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
        self.version = 'rss091n'
    # CDATA sections are passed through as escaped character data.
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1: k = len(self.rawdata)
        self.handle_data(xmlescape(self.rawdata[i+9:k]), 0)
        # NOTE(review): the return for the CDATA branch and the 'else:'
        # introducing the plain-declaration branch are missing from this
        # copy; the find('>') below belongs to that other branch.
        k = self.rawdata.find('>', i)
def trackNamespace(self, prefix, uri):
    """Record a namespace declaration: detect RSS 0.90 by its default
    namespace, normalize Userland namespace variants, and map the
    document's prefix to our canonical prefix when the URI is known.

    Fix: replaced the Python-2-only '<>' operator with '!=' and the
    deprecated dict.has_key() with the 'in' operator (both work on
    Python 2.1+ and are the idiomatic forms).
    """
    if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
        self.version = 'rss090'
    if not prefix: return
    if uri.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
    if uri in self.namespaces:
        self.namespacemap[prefix] = self.namespaces[uri]
def resolveURI(self, uri):
    """Resolve *uri* against the feed's current base URI (empty base if
    none has been established yet)."""
    base = self.baseuri or ''
    return urlparse.urljoin(base, uri)
def decodeEntities(self, element, data):
    """Decode the five XML character entities in *data*, but only when the
    current content mode is 'escaped' (i.e. the markup was entity-encoded
    by the feed producer).

    Fix: in this copy the replace() calls had been entity-decoded into
    no-ops (e.g. data.replace('<', '<')), and the trailing 'return data'
    was missing; restore the '&lt;'/'&gt;'/'&amp;'/'&quot;'/'&apos;'
    source strings and the return.
    """
    if self.contentparams.get('mode') == 'escaped':
        data = data.replace('&lt;', '<')
        data = data.replace('&gt;', '>')
        data = data.replace('&amp;', '&')
        data = data.replace('&quot;', '"')
        data = data.replace('&apos;', "'")
    return data
def push(self, element, expectingText):
    """Open a new element frame on the stack.

    Each frame is a 3-item list: [element name, whether text content is
    expected, accumulated text pieces]."""
    frame = [element, expectingText, []]
    self.elementstack.append(frame)
def pop(self, element):
    # Close the top element on the stack, post-process its accumulated
    # text (base64 decode, relative-URI resolution, entity decoding,
    # sanitizing), then store the result on the current item or channel.
    # NOTE(review): this listing is gap-sampled; guard lines choosing the
    # item vs. channel branches and several try/else lines are missing
    # (noted inline).
    if not self.elementstack: return
    # while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0])
    if self.elementstack[-1][0] != element: return

    element, expectingText, pieces = self.elementstack.pop()
    output = "".join(pieces)
    output = output.strip()
    if not expectingText: return output

    # decode base64 content
    if self.contentparams.get('mode') == 'base64' and base64:
        # NOTE(review): the 'try:' line and the except bodies (presumably
        # 'pass') are missing from this copy.
        output = base64.decodestring(output)
        except binascii.Error:
        except binascii.Incomplete:

    # resolve relative URIs
    if (element in self.can_be_relative_uri) and output:
        output = self.resolveURI(output)

    # decode entities within embedded markup
    output = self.decodeEntities(element, output)

    # resolve relative URIs within embedded markup
    if element in self.can_contain_relative_uris:
        output = _resolveRelativeURIs(output, self.baseuri)

    # sanitize embedded markup
    if element in self.can_contain_dangerous_markup:
        output = _sanitizeHTML(output)

    # store output in appropriate place(s)
    # NOTE(review): the guard selecting the item branch (presumably
    # 'if self.initem:') is absent from this copy, as are the has_key
    # guards on 'categories'/'links'; the orphan if/elif below reflect
    # the missing structure.
    if element == 'content':
        self.items[-1].setdefault(element, [])
        contentparams = copy.deepcopy(self.contentparams)
        contentparams['value'] = output
        self.items[-1][element].append(contentparams)
    elif element == 'category':
        self.items[-1][element] = output
        domain = self.items[-1]['categories'][-1][0]
        self.items[-1]['categories'][-1] = (domain, output)
    elif element == 'source':
        self.items[-1]['source']['value'] = output
    elif element == 'link':
        self.items[-1][element] = output
        self.items[-1]['links'][-1]['href'] = output
    if self.incontent and element != 'description':
        contentparams = copy.deepcopy(self.contentparams)
        contentparams['value'] = output
        self.items[-1][element + '_detail'] = contentparams
    self.items[-1][element] = output
    elif self.inchannel and (not self.intextinput) and (not self.inimage):
        if element == 'category':
            domain = self.channel['categories'][-1][0]
            self.channel['categories'][-1] = (domain, output)
        elif element == 'link':
            self.channel['links'][-1]['href'] = output
        if self.incontent and element != 'description':
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            self.channel[element + '_detail'] = contentparams
        self.channel[element] = output
def _mapToStandardPrefix(self, name):
    # Rewrite 'prefix:suffix' so the prefix becomes the canonical one
    # recorded in namespacemap.
    # NOTE(review): the guard around the rewrite (presumably
    # 'if colonpos != -1:') and the trailing 'return name' are missing
    # from this copy.
    colonpos = name.find(':')
    prefix = name[:colonpos]
    suffix = name[colonpos+1:]
    prefix = self.namespacemap.get(prefix, prefix)
    name = prefix + ':' + suffix

def _getAttribute(self, attrsD, name):
    # Look up an attribute under its namespace-normalized name.
    return attrsD.get(self._mapToStandardPrefix(name))

def _save(self, key, value):
    # Store a value on the current item or on the channel without
    # overwriting an existing value.
    # NOTE(review): the if/else choosing between item and channel is
    # missing from this copy.
    self.items[-1].setdefault(key, value)
    self.channel.setdefault(key, value)
def _start_rss(self, attrsD):
    # Map RSS <rss version="..."> to an internal version identifier.
    # NOTE(review): the rest of the versionmap literal (0.92/0.93/0.94),
    # its closing brace, and the 'if version:' guard are missing from
    # this copy.
    versionmap = {'0.91': 'rss091u',
    attr_version = attrsD.get('version', '')
    version = versionmap.get(attr_version)
    self.version = version
    elif attr_version.startswith('2.'):
        self.version = 'rss20'

def _start_channel(self, attrsD):
    # NOTE(review): body missing from this copy (presumably sets
    # self.inchannel and pushes 'channel').

def _start_feed(self, attrsD):
    # Map Atom <feed version="..."> to an internal version identifier.
    # NOTE(review): the versionmap literal is truncated and the if/else
    # around the two self.version assignments is missing from this copy.
    versionmap = {'0.1': 'atom01',
    attr_version = attrsD.get('version')
    version = versionmap.get(attr_version)
    self.version = version
    self.version = 'atom'

def _end_channel(self):
    # NOTE(review): body missing from this copy (presumably resets
    # self.inchannel).
_end_feed = _end_channel

def _start_image(self, attrsD):
    # NOTE(review): body missing (presumably sets self.inimage).
def _end_image(self):
    # NOTE(review): body missing (presumably resets self.inimage).
def _start_textinput(self, attrsD):
    # NOTE(review): body missing (presumably sets self.intextinput).
_start_textInput = _start_textinput
def _end_textinput(self):
    # NOTE(review): body missing (presumably resets self.intextinput).
_end_textInput = _end_textinput
def _start_author(self, attrsD):
    # NOTE(review): a line before the push (presumably self.inauthor = 1)
    # is missing from this copy.
    self.push('author', 1)
# managingEditor (RSS) and dc:author/dc:creator are synonyms for author.
_start_managingeditor = _start_author
_start_dc_author = _start_author
_start_dc_creator = _start_author

def _end_author(self):
    # NOTE(review): the pop('author') and the inauthor reset are missing
    # from this copy.
    self._sync_author_detail()
_end_managingeditor = _end_author
_end_dc_author = _end_author
_end_dc_creator = _end_author

def _start_contributor(self, attrsD):
    # Open a fresh contributor dict on the current context.
    self.incontributor = 1
    context = self._getContext()
    context.setdefault('contributors', [])
    context['contributors'].append({})
    self.push('contributor', 0)

def _end_contributor(self):
    self.pop('contributor')
    self.incontributor = 0

def _start_name(self, attrsD):
    # NOTE(review): body missing (presumably self.push('name', 0)).

# NOTE(review): the 'def _end_name(self):' line is missing from this
# copy; the statements below belong to it, and the guard before the
# _save_author call (presumably 'if self.inauthor:') is also absent.
    value = self.pop('name')
    self._save_author('name', value)
    elif self.incontributor:
        self._save_contributor('name', value)
    elif self.intextinput:
def _start_url(self, attrsD):
    # NOTE(review): body missing (presumably self.push('url', 1)).
_start_homepage = _start_url
_start_uri = _start_url

# NOTE(review): the 'def _end_url(self):' line is missing from this
# copy; the statements below belong to it, and the 'if self.inauthor:'
# guard before the _save_author call is also absent.
    value = self.pop('url')
    self._save_author('url', value)
    elif self.incontributor:
        self._save_contributor('url', value)
    elif self.intextinput:
_end_homepage = _end_url

def _start_email(self, attrsD):
    self.push('email', 0)

def _end_email(self):
    # NOTE(review): the 'if self.inauthor:' guard and the trailing
    # branches are missing from this copy.
    value = self.pop('email')
    self._save_author('email', value)
    elif self.incontributor:
        self._save_contributor('email', value)
    elif self.intextinput:
def _getContext(self):
    # Return the dict new data should land in: the current item when we
    # are inside one, otherwise the channel.
    # NOTE(review): the if/else guards and the 'return context' are
    # missing from this copy.
    context = self.items[-1]
    context = self.channel
646 def _save_author(self, key, value):
647 context = self._getContext()
648 context.setdefault('author_detail', {})
649 context['author_detail'][key] = value
650 self._sync_author_detail()
652 def _save_contributor(self, key, value):
653 context = self._getContext()
654 context.setdefault('contributors', [{}])
655 context['contributors'][-1][key] = value
def _sync_author_detail(self):
    # Keep the composite 'author' string and the structured
    # 'author_detail' dict in agreement, in both directions:
    # detail -> string first, then string -> detail by extracting an
    # email address from the free-form author text.
    # NOTE(review): the guards around the first block (presumably
    # 'if detail:' and name/email presence checks) are missing from this
    # copy, so the three assignments appear unconditional here.
    context = self._getContext()
    detail = context.get('author_detail')
    name = detail.get('name')
    email = detail.get('email')
    context['author'] = "%s (%s)" % (name, email)
    context['author'] = name
    context['author'] = email
    author = context.get('author')
    if not author: return
    emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
    if not emailmatch: return
    email = emailmatch.group(0)
    # strip the email address (and any leftover parens) out of the
    # free-form author string
    author = author.replace(email, '')
    author = author.replace('()', '')
    author = author.strip()
    context.setdefault('author_detail', {})
    context['author_detail']['name'] = author
    context['author_detail']['email'] = email
def _start_tagline(self, attrsD):
    # NOTE(review): an incontent increment appears to be missing before
    # the contentparams assignment in this copy.
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/plain'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    self.push('tagline', 1)
# Atom 0.3 renamed tagline to subtitle.
_start_subtitle = _start_tagline

def _end_tagline(self):
    # NOTE(review): the incontent decrement and the 'if self.inchannel:'
    # guard before the description assignment are missing from this copy.
    value = self.pop('tagline')
    self.contentparams.clear()
    self.channel['description'] = value
_end_subtitle = _end_tagline

def _start_copyright(self, attrsD):
    # NOTE(review): an incontent increment appears to be missing here.
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/plain'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    self.push('copyright', 1)
_start_dc_rights = _start_copyright

def _end_copyright(self):
    # NOTE(review): an incontent decrement appears to be missing here.
    self.pop('copyright')
    self.contentparams.clear()
_end_dc_rights = _end_copyright
def _start_item(self, attrsD):
    # Open a new item/entry dict.
    # NOTE(review): the rest of the body (presumably push('item', 0) and
    # self.initem = 1) is missing from this copy.
    self.items.append({})
_start_entry = _start_item

# NOTE(review): the 'def _end_item(self):' definition is missing from
# this copy; only its alias survives below.
_end_entry = _end_item

def _start_dc_language(self, attrsD):
    self.push('language', 1)
_start_language = _start_dc_language

def _end_dc_language(self):
    # Remember the feed language for later xml:lang defaults.
    self.lang = self.pop('language')
_end_language = _end_dc_language

def _start_dc_publisher(self, attrsD):
    self.push('publisher', 1)
_start_webmaster = _start_dc_publisher

def _end_dc_publisher(self):
    self.pop('publisher')
_end_webmaster = _end_dc_publisher
# Date elements: each pops its text and stores a parsed form via
# _parse_date (defined elsewhere in this file, not in this view).

def _start_dcterms_issued(self, attrsD):
    self.push('issued', 1)
_start_issued = _start_dcterms_issued

def _end_dcterms_issued(self):
    value = self.pop('issued')
    self._save('issued_parsed', _parse_date(value))
_end_issued = _end_dcterms_issued

def _start_dcterms_created(self, attrsD):
    self.push('created', 1)
_start_created = _start_dcterms_created

def _end_dcterms_created(self):
    value = self.pop('created')
    self._save('created_parsed', _parse_date(value))
_end_created = _end_dcterms_created

def _start_dcterms_modified(self, attrsD):
    self.push('modified', 1)
_start_modified = _start_dcterms_modified
_start_dc_date = _start_dcterms_modified
_start_pubdate = _start_dcterms_modified

def _end_dcterms_modified(self):
    value = self.pop('modified')
    parsed_value = _parse_date(value)
    # 'date' and 'modified' are kept as synonyms in the results
    self._save('date', value)
    self._save('date_parsed', parsed_value)
    self._save('modified_parsed', parsed_value)
_end_modified = _end_dcterms_modified
_end_dc_date = _end_dcterms_modified
_end_pubdate = _end_dcterms_modified

def _start_expirationdate(self, attrsD):
    self.push('expired', 1)

def _end_expirationdate(self):
    self._save('expired_parsed', _parse_date(self.pop('expired')))
def _start_cc_license(self, attrsD):
    # cc:license carries its value in an rdf:resource attribute.
    # NOTE(review): the 'if value:' guard and the closing
    # self.pop('license') are missing from this copy.
    self.push('license', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    self.elementstack[-1][2].append(value)

def _start_creativecommons_license(self, attrsD):
    self.push('license', 1)

def _end_creativecommons_license(self):
    # NOTE(review): body missing (presumably self.pop('license')).

def _start_category(self, attrsD):
    # NOTE(review): the if/else choosing item vs channel for the
    # categories list is missing from this copy.
    self.push('category', 1)
    domain = self._getAttribute(attrsD, 'domain')
    cats = self.items[-1].setdefault('categories', [])
    cats = self.channel.setdefault('categories', [])
    # store (domain, value) pairs; the value is filled in at pop time
    cats.append((domain, None))
_start_dc_subject = _start_category

def _end_category(self):
    # NOTE(review): body missing (presumably self.pop('category')).
_end_dc_subject = _end_category
def _start_cloud(self, attrsD):
    # RSS <cloud> carries its data entirely in attributes.
    self.channel['cloud'] = attrsD

def _start_link(self, attrsD):
    # Normalize the link's rel/type, resolve its href, and append it to
    # the current item's or channel's 'links' list; HTML alternates also
    # populate the singular 'link' key.
    # NOTE(review): the if/elif guards choosing item vs channel and the
    # expectingText reset are missing from this copy.
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.inchannel or self.initem
    self.items[-1].setdefault('links', [])
    self.items[-1]['links'].append(attrsD)
    self.channel.setdefault('links', [])
    self.channel['links'].append(attrsD)
    if attrsD.has_key('href'):
        if attrsD.get('type', '') in self.html_types:
            self.items[-1]['link'] = attrsD['href']
            self.channel['link'] = attrsD['href']
    self.push('link', expectingText)
def _start_guid(self, attrsD):
    # guid is a permalink unless isPermaLink="false".
    # NOTE(review): the push('guid', 1) line appears to be missing.
    self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')

# NOTE(review): the 'def _end_guid(self):' line is missing from this
# copy; the statements below belong to it, and the guard before the
# link save (checking guidislink and an existing link) is also absent.
    value = self.pop('guid')
    self._save('id', value)
    # guid acts as link, but only if "ispermalink" is not present or is "true",
    # and only if the item doesn't already have a link element
    self._save('link', value)

def _start_id(self, attrsD):
    # NOTE(review): body missing (presumably self.push('id', 1)).

# NOTE(review): the 'def _end_id(self):' line is missing from this copy.
    value = self.pop('id')
    self._save('guid', value)
def _start_title(self, attrsD):
    # NOTE(review): an incontent increment appears to be missing before
    # the contentparams assignment.
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/plain'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    self.push('title', self.inchannel or self.initem)
_start_dc_title = _start_title

def _end_title(self):
    # NOTE(review): the pop('title') and incontent decrement are missing
    # from this copy.
    self.contentparams.clear()
_end_dc_title = _end_title

def _start_description(self, attrsD):
    # description defaults to text/html, unlike most text elements.
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/html'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    self.push('description', self.inchannel or self.initem)

def _end_description(self):
    # NOTE(review): the if/elif guards (initem / inchannel) before the
    # two assignments are missing from this copy.
    value = self.pop('description')
    self.items[-1]['summary'] = value
    self.channel['tagline'] = value
    self.contentparams.clear()

def _start_info(self, attrsD):
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/plain'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    # NOTE(review): push('info', ...) and the 'def _end_info(self):'
    # header (with its pop) are missing from this copy; the clear() below
    # belongs to _end_info.
    self.contentparams.clear()
def _start_generator(self, attrsD):
    # NOTE(review): an 'if attrsD:' guard appears to be missing before
    # the generator_detail assignment.
    self.channel['generator_detail'] = attrsD
    self.push('generator', 1)

def _end_generator(self):
    value = self.pop('generator')
    if self.channel.has_key('generator_detail'):
        self.channel['generator_detail']['name'] = value

def _start_admin_generatoragent(self, attrsD):
    # admin:generatorAgent carries its value in rdf:resource; push/pop
    # immediately since the element has no text content.
    # NOTE(review): the 'if value:' guard is missing from this copy.
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    self.elementstack[-1][2].append(value)
    self.pop('generator')

def _start_admin_errorreportsto(self, attrsD):
    # Same rdf:resource pattern as generatorAgent.
    # NOTE(review): the 'if value:' guard is missing from this copy.
    self.push('errorreportsto', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    self.elementstack[-1][2].append(value)
    self.pop('errorreportsto')
def _start_summary(self, attrsD):
    # NOTE(review): an incontent increment appears to be missing here.
    self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
                          'type': attrsD.get('type', 'text/plain'),
                          'language': attrsD.get('xml:lang', self.lang),
                          'base': attrsD.get('xml:base', self.baseuri)}
    self.push('summary', 1)

def _end_summary(self):
    # NOTE(review): the guard before the description assignment
    # (presumably 'if self.initem:') is missing from this copy.
    value = self.pop('summary')
    self.items[-1]['description'] = value
    self.contentparams.clear()

def _start_enclosure(self, attrsD):
    # NOTE(review): the 'if self.initem:' guard is missing.
    self.items[-1].setdefault('enclosures', [])
    self.items[-1]['enclosures'].append(attrsD)

def _start_source(self, attrsD):
    # NOTE(review): the 'if self.initem:' guard is missing.
    self.items[-1]['source'] = attrsD
    self.push('source', 1)

def _end_source(self):
    # NOTE(review): body missing (presumably self.pop('source')).
947 def _end_source(self):
950 def _start_content(self, attrsD):
952 self.contentparams = {'mode': attrsD.get('mode', 'xml'),
953 'type': attrsD.get('type', 'text/plain'),
954 'language': attrsD.get('xml:lang', self.lang),
955 'base': attrsD.get('xml:base', self.baseuri)}
956 self.push('content', 1)
958 def _start_body(self, attrsD):
960 self.contentparams = {'mode': 'xml',
961 'type': 'application/xhtml+xml',
962 'language': attrsD.get('xml:lang', self.lang),
963 'base': attrsD.get('xml:base', self.baseuri)}
964 self.push('content', 1)
965 _start_xhtml_body = _start_body
967 def _start_content_encoded(self, attrsD):
969 self.contentparams = {'mode': 'escaped',
971 'language': attrsD.get('xml:lang', self.lang),
972 'base': attrsD.get('xml:base', self.baseuri)}
973 self.push('content', 1)
974 _start_fullitem = _start_content_encoded
976 def _end_content(self):
977 value = self.pop('content')
978 if self.contentparams.get('type') in (['text/plain'] + self.html_types):
979 self._save('description', value)
981 self.contentparams.clear()
982 _end_body = _end_content
983 _end_xhtml_body = _end_content
984 _end_content_encoded = _end_content
985 _end_fullitem = _end_content
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler):
    # SAX-based parser used when a real XML parser is available and the
    # feed turns out to be well-formed XML.
    # NOTE(review): this listing is gap-sampled; missing statements are
    # noted inline.
    def __init__(self, baseuri):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri)
        # NOTE(review): bozo-flag initialization appears to be missing.

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        # NOTE(review): the 'if prefix:' guard before the localname
        # rewrite and the attrsD = {} initialization are missing.
        namespace, localname = name
        namespace = str(namespace)
        prefix = self.namespaces.get(namespace, '')
        localname = prefix + ':' + localname
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            prefix = self.namespaces.get(namespace, '')
            # NOTE(review): 'if prefix:' guard missing here too.
            attrlocalname = prefix + ":" + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def resolveEntity(self, publicId, systemId):
        # NOTE(review): body missing from this copy.

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        # NOTE(review): the 'if prefix:' guard is missing here as well.
        namespace, localname = name
        namespace = str(namespace)
        prefix = self.namespaces.get(namespace, '')
        localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def fatalError(self, exc):
        # NOTE(review): body missing from this copy (presumably sets the
        # bozo flag and records the exception).
class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
    # sgmllib-based fallback parser, used when no XML parser is available
    # or the feed is not well-formed XML; all element handling is
    # inherited from _FeedParserMixin.  (Base-class __init__ order is
    # significant: SGMLParser first, then the mixin.)
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri)
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    # Base class for HTML post-processing passes (relative-URI resolution,
    # sanitizing): it re-emits the HTML it parses, piece by piece, into
    # self.pieces.
    # HTML "void" elements that take no end tag and are re-emitted in
    # XHTML style (<br />).
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    # NOTE(review): the 'def __init__(self):' and 'def reset(self):'
    # headers, and the self.pieces = [] initialization, are missing from
    # this copy; the two calls below belong to those methods.
        sgmllib.SGMLParser.__init__(self)
    # extend (called by sgmllib.SGMLParser.__init__)
        sgmllib.SGMLParser.reset(self)
def normalize_attrs(self, attrs):
    """Normalize a list of (name, value) attribute tuples: lower-case the
    names, resolve numeric character references in the values, strip
    whitespace, and case-fold 'rel'/'type' values.

    Fix: the trailing 'return attrs' was missing in this copy, so the
    method always returned None.
    """
    # utility method to be called by descendants
    attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
    attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
    return attrs
1067 def unknown_starttag(self, tag, attrs):
1068 # called for each start tag
1069 # attrs is a list of (attr, value) tuples
1070 # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
1071 strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
1072 if tag in self.elements_no_end_tag:
1073 self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
1075 self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
1077 def unknown_endtag(self, tag):
1078 # called for each end tag, e.g. for </pre>, tag will be "pre"
1079 # Reconstruct the original end tag.
1080 if tag not in self.elements_no_end_tag:
1081 self.pieces.append("</%(tag)s>" % locals())
1083 def handle_charref(self, ref):
1084 # called for each character reference, e.g. for " ", ref will be "160"
1085 # Reconstruct the original character reference.
1086 self.pieces.append("&#%(ref)s;" % locals())
1088 def handle_entityref(self, ref):
1089 # called for each entity reference, e.g. for "©", ref will be "copy"
1090 # Reconstruct the original entity reference.
1091 self.pieces.append("&%(ref)s;" % locals())
1093 def handle_data(self, text):
1094 # called for each block of plain text, i.e. outside of any tag and
1095 # not containing any character or entity references
1096 # Store the original text verbatim.
1097 self.pieces.append(text)
1099 def handle_comment(self, text):
1100 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1101 # Reconstruct the original comment.
1102 self.pieces.append("<!--%(text)s-->" % locals())
1104 def handle_pi(self, text):
1105 # called for each processing instruction, e.g. <?instruction>
1106 # Reconstruct original processing instruction.
1107 self.pieces.append("<?%(text)s>" % locals())
1109 def handle_decl(self, text):
1110 # called for the DOCTYPE, if present, e.g.
1111 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1112 # "http://www.w3.org/TR/html4/loose.dtd">
1113 # Reconstruct original DOCTYPE
1114 self.pieces.append("<!%(text)s>" % locals())
1117 """Return processed HTML as a single string"""
1118 return "".join(self.pieces)
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML processor that rewrites relative URIs against a base URI.

    Only attributes registered in relative_uris (as (tag, attribute) pairs)
    are resolved; all other markup passes through unchanged.
    """
    # NOTE(review): this listing appears truncated — the numbered lines show
    # gaps, and upstream lists additional (tag, attribute) pairs here (e.g.
    # form/action, img/src, link/href) plus the closing bracket; confirm
    # against a complete copy before use.
    relative_uris = [('a', 'href'),
        ('applet', 'codebase'),
        ('blockquote', 'cite'),
        ('body', 'background'),
        ('frame', 'longdesc'),
        ('iframe', 'longdesc'),
        ('head', 'profile'),
        ('img', 'longdesc'),
        ('input', 'usemap'),
        ('object', 'classid'),
        ('object', 'codebase'),
        ('object', 'usemap'),

    def __init__(self, baseuri):
        _BaseHTMLProcessor.__init__(self)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # join against the feed's base URI; absolute URIs pass through as-is
        return urlparse.urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        # normalize attributes, then resolve any (tag, attr) pair that is
        # registered in relative_uris before re-emitting the tag
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI):
    """Resolve relative URIs in embedded HTML markup against baseURI.

    NOTE(review): this listing was missing the feed/output lines (gap in the
    numbered lines); they are restored here — _RelativeURIResolver exposes
    its result only via output().
    """
    p = _RelativeURIResolver(baseURI)
    p.feed(htmlSource)
    return p.output()
class _HTMLSanitizer(_BaseHTMLProcessor):
    """HTML processor that drops elements and attributes not on the whitelists.

    Disallowed elements are removed from the output; for the elements in
    unacceptable_elements_with_end_tag (script, applet) the enclosed text is
    suppressed as well, tracked via the unacceptablestack depth counter.

    NOTE(review): this listing was missing "def reset", the two "return"
    statements that actually drop disallowed tags, and the "pass" bodies of
    handle_pi/handle_decl; they are restored here — without the returns the
    sanitizer would re-emit everything.
    """

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
        'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
        'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
        'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
        'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
        'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
        'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
        'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
        'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
        'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
        'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
        'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
        'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
        'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
        'usemap', 'valign', 'value', 'vspace', 'width']

    # elements whose textual content must also be suppressed, not just the tags
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        # extend: track nesting depth inside suppressed elements
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            # disallowed tag: emit nothing
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            # disallowed tag: emit nothing
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are stripped
        pass

    def handle_decl(self, text):
        # DOCTYPE declarations are stripped
        pass

    def handle_data(self, text):
        # drop text that lies inside a suppressed element (script/applet)
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def _sanitizeHTML(htmlSource):
    """Strip unacceptable elements/attributes from htmlSource.

    If mxTidy is available (_mxtidy), the sanitized markup is additionally
    tidied into XHTML and the <body> wrapper that tidy adds is peeled off.
    Returns the cleaned markup with normalized line endings.

    NOTE(review): this listing was missing the feed/output lines, the
    "if _mxtidy:" guard, the inner "if data.count('>'):" check, and the final
    "return data"; they are restored here from the surviving lines and the
    module's feature description.
    """
    p = _HTMLSanitizer()
    p.feed(htmlSource)
    data = p.output()
    if _mxtidy:
        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        # tidy wraps the fragment in <html><body>...</body></html>; peel it off
        if data.count('<body'):
            data = data.split('<body', 1)[1]
            if data.count('>'):
                data = data.split('>', 1)[1]
        if data.count('</body'):
            data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that records the HTTP status on the returned stream.

    Redirects (any 3xx except 304 Not Modified) are followed, and the final
    status code is stored on the file-like result as .status so parse() can
    report it to the caller.

    NOTE(review): this listing was missing each handler's "return infourl"
    (gaps in the numbered lines); they are restored here — without them every
    fetch would return None.
    """

    def http_error_default(self, req, fp, code, msg, headers):
        # treat all 3xx responses (except 304) as redirects
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        from urllib import addinfourl
        infourl = addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_307 = http_error_302
def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    NOTE(review): this listing was missing the stdin shortcut, the agent
    default, the etag/modified/referrer/gzip guards, and the try/except
    fallback scaffolding; they are reconstructed here from the surviving
    lines and comments — confirm against a complete copy.
    """
    # already a file-like object: use it as-is
    if hasattr(url_file_stream_or_string, "read"):
        return url_file_stream_or_string

    if url_file_stream_or_string == "-":
        return sys.stdin

    if not agent:
        agent = USER_AGENT

    # try to open with urllib2 (to use optional headers)
    request = urllib2.Request(url_file_stream_or_string)
    if etag:
        request.add_header("If-None-Match", etag)
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
        months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
        request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    request.add_header("User-Agent", agent)
    if referrer:
        request.add_header("Referer", referrer)
    if gzip:
        request.add_header("Accept-encoding", "gzip")
    opener = urllib2.build_opener(_FeedURLHandler())
    opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
    try:
        try:
            return opener.open(request)
        except:
            # url_file_stream_or_string is not a valid URL, but it might be a valid filename
            # (deliberate best-effort: fall through to the filename/string cases)
            pass
    finally:
        opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
1324 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
1325 # Drake and licensed under the Python license. Removed all range checking
1326 # for month, day, hour, minute, and second, since mktime will normalize
def _w3dtf_parse(s):
    """Parse a W3DTF (W3C profile of ISO-8601) date string into a Unix
    timestamp in UTC, or return None if s does not match.

    Adapted from PyXML xml.utils.iso8601 (see module header comment).
    NOTE(review): many lines of this function are missing from this listing
    (the julian-day refinement loop, month/day branch guards, __extract_time
    and __extract_tzd guards, and parts of the regexes); the code below is
    reproduced as-is and the comments describe only what is visible —
    confirm against a complete copy before relying on it.
    """
    def __extract_date(m):
        year = int(m.group("year"))
        # two-digit years are pinned to the current century
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
        julian = m.group("julian")
        julian = int(julian)
        # initial month/day guess from the ordinal (julian) day number
        month = julian / 30 + 1
        day = julian % 30 + 1
        # refine the guess until mktime round-trips to the requested ordinal
        # NOTE(review): the jday initialization and the loop's adjustment
        # logic are missing from this listing
        while jday != julian:
            t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
            jday = time.gmtime(t)[-2]
            diff = abs(jday - julian)
        return year, month, day
        # NOTE(review): the branch guards separating the julian path from the
        # month/day path are missing; the lines below belong to the latter
        month = m.group("month")
        day = m.group("day")
        return year, month, day

    def __extract_time(m):
        hours = m.group("hours")
        minutes = int(m.group("minutes"))
        seconds = m.group("seconds")
        seconds = int(seconds)
        return hours, minutes, seconds

    def __extract_tzd(m):
        """Return the Time Zone Designator as an offset in seconds from UTC."""
        tzd = m.group("tzd")
        hours = int(m.group("tzdhours"))
        minutes = m.group("tzdminutes")
        minutes = int(minutes)
        offset = (hours*60 + minutes) * 60

    # regexes for the date, time, and timezone-designator portions
    __date_re = ("(?P<year>\d\d\d\d)"
                 "(?:(?P<julian>\d\d\d)"
                 "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
    __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
                 "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
    __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(s)
    # require the whole string to match, not just a prefix
    if m is None or m.group() != s:
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    # convert the GMT struct to an epoch timestamp, then apply the TZ offset
    return time.mktime(gmt) + __extract_tzd(m) - time.timezone
1423 # Additional ISO-8601 date parsing routines written by Fazal Majid
1424 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1425 # parser is beyond the scope of feedparser and would be a worthwhile addition
1426 # to the Python library
1427 # A single regular expression cannot parse ISO 8601 date formats into groups
1428 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1429 # 0301-04-01), so we use templates instead
1430 # Please note the order in templates is significant because we need a
# Templates are expanded (via the .replace chain below) into named-group
# regular expressions; order is significant because the first matching
# template wins, so longer/more specific forms must come first.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
    'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
    '-YY-?MM', '-OOO', '-YY',
    # NOTE(review): the remaining templates and the head of the
    # "_iso8601_re = [tmpl.replace(" list comprehension are missing from
    # this listing; the .replace chain below belongs to that statement
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# bound .match methods, one per compiled template regex
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
# rfc822.py defines several time zones, but we define some extra ones.
# "ET" is equivalent to "EST", etc.
# Offsets use rfc822's hours*100 convention (e.g. -500 means UTC-5).
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
def _parse_date(date):
    """Parses a variety of date formats into a tuple of 9 integers"""
    # NOTE(review): several lines of this function are missing from this
    # listing (guards around the rfc822/W3DTF attempts, the weekday
    # initialization, ordinal/day branch guards, and the timezone sign
    # branches); the code is reproduced as-is and comments describe only
    # the visible lines — confirm against a complete copy.
    # try the standard rfc822 library, which handles
    # RFC822, RFC1123, RFC2822, and asctime
    tm = rfc822.parsedate_tz(date)
        return time.gmtime(rfc822.mktime_tz(tm))
    # not a RFC2822 date, try W3DTF profile of ISO-8601
        tm = _w3dtf_parse(date)
        return time.gmtime(tm)
    # try various non-W3DTF ISO-8601-compatible formats like 20040105
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(date)
        # catch truly malformed strings
        if m.span() == (0, 0): return
        params = m.groupdict()
        ordinal = params.get("ordinal", 0)
            ordinal = int(ordinal)
        year = params.get("year", "--")
        if not year or year == "--":
            # missing year defaults to the current year
            year = time.gmtime()[0]
        elif len(year) == 2:
            # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        month = params.get("month", "-")
        if not month or month == "-":
            # ordinals are NOT normalized by mktime, we simulate them
            # by setting month=1, day=ordinal
            month = time.gmtime()[1]
        day = params.get("day", 0)
        elif params.get("century", 0) or \
            params.get("year", 0) or params.get("month", 0):
            day = time.gmtime()[2]
        # special case of the century - is the first year of the 21st century
        # 2000 or 2001 ? The debate goes on...
        if "century" in params.keys():
            year = (int(params["century"]) - 1) * 100 + 1
        # in ISO 8601 most fields are optional
        for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
            if not params.get(field, None):
        hour = int(params.get("hour", 0))
        minute = int(params.get("minute", 0))
        second = int(params.get("second", 0))
        # weekday is normalized by mktime(), we can ignore it
        # daylight savings is complex, but not needed for feedparser's purposes
        # as time zones, if specified, include mention of whether it is active
        # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
        # most implementations have DST bugs
        daylight_savings_flag = 0
        tm = [year, month, day, hour, minute, second, weekday,
            ordinal, daylight_savings_flag]
        # ISO 8601 time zone adjustments
        tz = params.get("tz")
        if tz and tz != "Z":
            # NOTE(review): the sign branches ("-" vs "+") are missing here;
            # the add/subtract pairs below belong to those branches
                tm[3] += int(params.get("tzhour", 0))
                tm[4] += int(params.get("tzmin", 0))
                tm[3] -= int(params.get("tzhour", 0))
                tm[4] -= int(params.get("tzmin", 0))
        # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
        # which is guaranteed to normalize d/m/y/h/m/s
        # many implementations have bugs, but we'll pretend they don't
        return time.localtime(time.mktime(tm))
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
    """Parse a feed from a URL, file, stream, or string"""
    # Returns a dict with 'channel', 'items', 'version', plus HTTP metadata
    # ('etag', 'modified', 'url', 'status', 'headers', 'encoding') when the
    # source was fetched over HTTP.
    # NOTE(review): a number of lines are missing from this listing (the
    # result-dict initialization, data = f.read(), f.close(), several
    # guards, and the try/except scaffolding around the SAX parse); the code
    # is reproduced as-is and comments describe only the visible lines.
    f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
    if hasattr(f, "headers"):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            # transparently decompress gzip-encoded responses
            data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            # some feeds claim to be gzipped but they're not, so we get garbage
    if hasattr(f, "info"):
        # propagate HTTP caching validators back to the caller
        result["etag"] = info.getheader("ETag")
        last_modified = info.getheader("Last-Modified")
            result["modified"] = _parse_date(last_modified)
    if hasattr(f, "url"):
        result["url"] = f.url
        result["status"] = 200 # default, may be overridden later
    if hasattr(f, "status"):
        result["status"] = f.status
    if hasattr(f, "headers"):
        result["headers"] = f.headers.dict
    # get the xml encoding
    xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
    match = xmlheaderRe.match(data)
        result["encoding"] = match.groups()[0].lower()
    result['channel'] = {}
    result['items'] = {}
    baseuri = result.get('headers', {}).get('content-location', result.get('url'))
    # try true XML parser first
        if _debug: sys.stderr.write('using xml library\n')
        feedparser = _StrictFeedParser(baseuri)
        # Netscape's RSS 0.91 DTD is the only reliable marker for that version
        if re.search(r'<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', data):
            feedparser.version = 'rss091n'
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        saxparser = xml.sax.make_parser()#["drv_libxml2"])
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
            saxparser.setDTDHandler(feedparser)
            saxparser.setEntityResolver(feedparser)
        except xml.sax.SAXNotSupportedException:
            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n')
            # libxml2 driver does not currently support DTDHandler or EntityResolver
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
            saxparser.parse(source)
        except Exception, e:
            # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
            # which captures them. For some reason, some Unicode-related errors go
            # uncaught on some combination of platform, XML library, Python version,
            # and phase of the moon.
            feedparser.bozo_exception = e
        # feed is not well-formed XML, fall back on regex-based parser
        if _debug: sys.stderr.write('xml parsing failed, using regexes. now you have two problems...\n')
        result['bozo_exception'] = feedparser.exc
        # munge short tags, e.g. <description/> becomes <description></description>
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        feedparser = _LooseFeedParser(baseuri)
        feedparser.feed(data)
        # no strict XML parser available at all: regex-based parse only
        if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        feedparser = _LooseFeedParser(baseuri)
        feedparser.feed(data)
    result['channel'] = feedparser.channel
    result['items'] = feedparser.items
    result['version'] = feedparser.version
# Sample feeds (Simon Fell's namespace test suite, see revision history for
# 1.0) exercised by the command-line test harness below.
_TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
    'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
    'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
    'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
1652 if __name__ == '__main__':
1657 from pprint import pprint
1667 #- textinput/textInput
1672 #- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
1673 #- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
1674 #- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
1677 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
1678 # added Simon Fell's test suite
1679 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
1681 # JD - use inchannel to watch out for image and textinput elements which can
1682 # also contain title, link, and description elements
1683 # JD - check for isPermaLink="false" attribute on guid elements
1684 # JD - replaced openAnything with open_resource supporting ETag and
1685 # If-Modified-Since request headers
1686 # JD - parse now accepts etag, modified, agent, and referrer optional
1688 # JD - modified parse to return a dictionary instead of a tuple so that any
1689 # etag or modified information can be returned and cached by the caller
1690 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
1691 # because of etag/modified, return the old etag/modified to the caller to
1692 # indicate why nothing is being returned
1693 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
1694 # useless. Fixes the problem JD was addressing by adding it.
1695 #2.1 - 11/14/2002 - MAP - added gzip support
1696 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
1697 # start_admingeneratoragent is an example of how to handle elements with
1698 # only attributes, no content.
1699 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
1700 # also, make sure we send the User-Agent even if urllib2 isn't available.
1701 # Match any variation of backend.userland.com/rss namespace.
1702 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
1703 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
1704 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
1706 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
1707 # removed unnecessary urllib code -- urllib2 should always be available anyway;
1708 # return actual url, status, and full HTTP headers (as result['url'],
1709 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
1710 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
1711 # added the latest namespace-of-the-week for RSS 2.0
1712 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
1713 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
1714 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
1715 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
1716 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
1717 # textInput, and also to return the character encoding (if specified)
1718 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
1719 # nested divs within content (JohnD); fixed missing sys import (JohanS);
1720 # fixed regular expression to capture XML character encoding (Andrei);
1721 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
1722 # added support for cloud (MartijnP); added support for multiple
1723 # category/dc:subject (MartijnP); normalize content model: "description" gets
1724 # description (which can come from description, summary, or full content if no
1725 # description), "content" gets dict of base/language/type/value (which can come
1726 # from content:encoded, xhtml:body, content, or fullitem);
1727 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
1728 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
1729 # <content> element is not in default namespace (like Pocketsoap feed);
1730 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
1731 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
1732 # description, xhtml:body, content, content:encoded, title, subtitle,
1733 # summary, info, tagline, and copyright; added support for pingback and
1734 # trackback namespaces
1735 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
1736 # namespaces, as opposed to 2.6 when I said I did but didn't really;
1737 # sanitize HTML markup within some elements; added mxTidy support (if
1738 # installed) to tidy HTML markup within some elements; fixed indentation
1739 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
1740 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
1741 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
1742 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
1743 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
1744 #2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
1745 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
1746 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
1747 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
1748 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
1749 # fixed relative URI processing for guid (skadz); added ICBM support; added
1751 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
1752 # blogspot.com sites); added _debug variable
1753 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
1754 #3.0 - MAP - parse entire feed with real XML parser (if available); added several
1755 # new supported namespaces; fixed bug tracking naked markup in description;
1756 # added support for enclosure; added support for source; re-added support for
1757 # cloud which got dropped somehow; added support for expirationDate; fixed
1758 # xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
1759 # documents that don't define one explicitly and one for documents that define
1760 # an outer and an inner xml:base that goes out of scope before the end of the
1761 # document; fixed bug parsing multiple links at feed level; added feed type and
1762 # version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
1763 # or empty string if unrecognized; added support for creativeCommons:license and
1764 # cc:license; added support for full Atom content model in title, tagline, info,
1765 # copyright, summary; fixed bug with gzip encoding (not always telling server
1766 # we support it when we do); support Atom-style author element in author_detail
1767 # (dictionary of "name", "url", "email"); map author to author_detail if author
1768 # contains name + email address; better handling of empty HTML tags (br, hr, img,
1769 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
1770 # fixed CDATA handling in non-wellformed feeds under Python 2.1