2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.4 or later
10 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
14 __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
16 Redistribution and use in source and binary forms, with or without modification,
17 are permitted provided that the following conditions are met:
19 * Redistributions of source code must retain the above copyright notice,
20 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright notice,
22 this list of conditions and the following disclaimer in the documentation
23 and/or other materials provided with the distribution.
25 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 POSSIBILITY OF SUCH DAMAGE."""
# Package authorship metadata (informational only; not used by the parser).
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>",
                    "Kurt McKee <http://kurtmckee.org/>"]
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
# (Interpolates __version__, which is defined earlier in this module.)
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
79 # ---------- Python 3 modules (make it work if possible) ----------
83 from email import _parseaddr as rfc822
86 # Python 3.1 introduces bytes.maketrans and simultaneously
87 # deprecates string.maketrans; use bytes.maketrans if possible
88 _maketrans = bytes.maketrans
89 except (NameError, AttributeError):
91 _maketrans = string.maketrans
93 # base64 support for Atom feeds that contain embedded binary data
95 import base64, binascii
96 # Python 3.1 deprecates decodestring in favor of decodebytes
97 _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
99 base64 = binascii = None
102 # Convert a UTF-8 str to bytes if the interpreter is Python 3
104 return bytes(s, 'utf8')
105 except (NameError, TypeError):
106 # In Python 2.5 and below, bytes doesn't exist (NameError)
107 # In Python 2.6 and above, bytes and str are the same (TypeError)
111 # Convert a list of ints to bytes if the interpreter is Python 3
114 # In Python 2.6 and above, this call won't raise an exception
115 # but it will return bytes([65]) as '[65]' instead of 'A'
119 return ''.join(map(chr, l))
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
# http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
# NOTE(review): 'mms' and 'svn' appear twice below -- harmless for
# membership tests, but one of each could be dropped.
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
#ACCEPTABLE_URI_SCHEMES = ()
137 # ---------- required modules (should come with any Python distribution) ----------
138 import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime
140 from io import BytesIO as _StringIO
143 from cStringIO import StringIO as _StringIO
145 from StringIO import StringIO as _StringIO
147 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
149 # gzip is included with most Python distributions, but may not be available if you compiled your own
159 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
160 # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
161 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
162 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
165 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
166 from xml.sax.saxutils import escape as _xmlescape
170 def _xmlescape(data,entities={}):
171 data = data.replace('&', '&')
172 data = data.replace('>', '>')
173 data = data.replace('<', '<')
174 for char, entity in entities:
175 data = data.replace(char, entity)
178 # cjkcodecs and iconv_codec provide support for more character encodings.
179 # Both are available from http://cjkpython.i18n.org/
181 import cjkcodecs.aliases
189 # chardet library auto-detects character encodings
190 # Download from http://chardet.feedparser.org/
194 import chardet.constants
195 chardet.constants._debug = 1
199 # reversable htmlentitydefs mappings for Python 2.2
201 from htmlentitydefs import name2codepoint, codepoint2name
203 import htmlentitydefs
206 for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
207 if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
208 name2codepoint[name]=ord(codepoint)
209 codepoint2name[ord(codepoint)]=name
211 # BeautifulSoup parser used for parsing microformats from embedded HTML content
212 # http://www.crummy.com/software/BeautifulSoup/
213 # feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
214 # older 2.x series. If it doesn't, and you can figure out why, I'll accept a
215 # patch and modify the compatibility statement accordingly.
221 # ---------- don't touch these ----------
# Internal exception hierarchy for advisory parser conditions.
class ThingsNobodyCaresAboutButMe(Exception):
    """Base class for benign, non-fatal parser notices."""

class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """The declared character encoding was wrong and was overridden."""

class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """The document's character encoding could not be determined."""

class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """The served Content-Type did not indicate an XML media type."""

class UndeclaredNamespace(Exception):
    """An XML namespace prefix was used without being declared."""
# Loosen sgmllib's scanning regexes so the parser copes with markup
# commonly found in real-world (often ill-formed) feeds.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
if sgmllib.endbracket.search(' <').start(0):
    class EndBracketRegEx:
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self,string,index=0):
            match = self.endbracket.match(string,index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
    # Thin wrapper exposing only the match's end() position.
    class EndBracketMatch:
        def __init__(self, match):
            return self.match.end(n)
    sgmllib.endbracket = EndBracketRegEx()
# Maps feedparser's internal version identifiers to human-readable
# feed-format names.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
274 # Python 2.1 does not have dict
275 from UserDict import UserDict
class FeedParserDict(UserDict):
    """Dictionary whose keys are also reachable as attributes, and in
    which legacy element names ('modified', 'issued', 'tagline', ...)
    transparently alias their modern equivalents via ``keymap``."""
    # old name -> new name; a list value means "first of these that exists".
    keymap = {'channel': 'feed',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        # 'category' is synthesized from the first entry in 'tags'.
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        # 'enclosures' is synthesized from links with rel='enclosure',
        # with the 'rel' key itself stripped out.
        if key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
            for link in UserDict.__getitem__(self, 'links'):
                if link['rel']=='license' and link.has_key('href'):
        # 'categories' is synthesized as (scheme, term) pairs from 'tags'.
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        # Fall back to keymap aliasing, preferring a directly stored key.
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
                if UserDict.__contains__(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.__contains__(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)
    def __setitem__(self, key, value):
        # Store under the modern name for any legacy alias.
        for k in self.keymap.keys():
                if type(key) == types.ListType:
        return UserDict.__setitem__(self, key, value)
    def get(self, key, default=None):
        if self.has_key(key):
    def setdefault(self, key, value):
        if not self.has_key(key):
    def has_key(self, key):
        # True if *key* is present as an attribute or as a stored key.
            return hasattr(self, key) or UserDict.__contains__(self, key)
        except AttributeError:
    # This alias prevents the 2to3 tool from changing the semantics of the
    # __contains__ function below and exhausting the maximum recursion depth
    def __getattr__(self, key):
            # Attribute access falls through to item access for public names.
            return self.__dict__[key]
            assert not key.startswith('_')
            return self.__getitem__(key)
            raise AttributeError, "object has no attribute '%s'" % key
    def __setattr__(self, key, value):
        # Private names and the UserDict 'data' slot stay real attributes;
        # everything else is stored as a dictionary item.
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
            return self.__setitem__(key, value)
    def __contains__(self, key):
        return self.__has_key(key)
def zopeCompatibilityHack():
    # Replace the FeedParserDict class with a plain factory function.
    # NOTE(review): presumably needed for environments (e.g. Zope) that
    # cannot handle the dict subclass -- confirm against upstream docs.
    global FeedParserDict
    def FeedParserDict(aDict=None):
# Lazily built 256-byte EBCDIC -> ASCII translation table.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    # Translate an EBCDIC-encoded byte string *s* to ASCII, building the
    # translation table on first use and caching it at module level.
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
    # Windows-1252 "smart punctuation" (code points 0x80-0x9f) mapped to
    # the Unicode characters they actually represent.
    unichr(128): unichr(8364), # euro sign
    unichr(130): unichr(8218), # single low-9 quotation mark
    unichr(131): unichr( 402), # latin small letter f with hook
    unichr(132): unichr(8222), # double low-9 quotation mark
    unichr(133): unichr(8230), # horizontal ellipsis
    unichr(134): unichr(8224), # dagger
    unichr(135): unichr(8225), # double dagger
    unichr(136): unichr( 710), # modifier letter circumflex accent
    unichr(137): unichr(8240), # per mille sign
    unichr(138): unichr( 352), # latin capital letter s with caron
    unichr(139): unichr(8249), # single left-pointing angle quotation mark
    unichr(140): unichr( 338), # latin capital ligature oe
    unichr(142): unichr( 381), # latin capital letter z with caron
    unichr(145): unichr(8216), # left single quotation mark
    unichr(146): unichr(8217), # right single quotation mark
    unichr(147): unichr(8220), # left double quotation mark
    unichr(148): unichr(8221), # right double quotation mark
    unichr(149): unichr(8226), # bullet
    unichr(150): unichr(8211), # en dash
    unichr(151): unichr(8212), # em dash
    unichr(152): unichr( 732), # small tilde
    unichr(153): unichr(8482), # trade mark sign
    unichr(154): unichr( 353), # latin small letter s with caron
    unichr(155): unichr(8250), # single right-pointing angle quotation mark
    unichr(156): unichr( 339), # latin small ligature oe
    unichr(158): unichr( 382), # latin small letter z with caron
    unichr(159): unichr( 376)} # latin capital letter y with diaeresis
# Collapses redundant slashes immediately following a URI scheme.
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    # Join *uri* against *base*; if the straightforward join blows up,
    # percent-quote each parsed component and try once more.
    uri = _urifixer.sub(r'\1\3', uri)
        return urlparse.urljoin(base, uri)
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)
class _FeedParserMixin:
    # Namespace URI -> handler-name prefix.  An empty-string prefix marks
    # the core feed vocabularies (the many flavors of RSS and Atom); all
    # other namespaces map to the short prefix used to build element
    # handler method names.
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',
                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
                  'http://search.yahoo.com/mrss/': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/1999/xlink': 'xlink',
                  'http://www.w3.org/XML/1998/namespace': 'xml'
    # Lowercased-URI lookup table; populated lazily in __init__.
    _matchnamespaces = {}
    # Elements whose text value may be a relative URI needing resolution.
    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
    # Elements whose embedded markup may contain relative URIs.
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    # Elements whose embedded markup must be sanitized.
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        # *baseuri* seeds xml:base resolution, *baselang* seeds xml:lang,
        # *encoding* is the document's character encoding.
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            # Build the shared lowercased namespace lookup table once.
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.incontributor = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
            self.feeddata['language'] = baselang.replace('_','-')
    def unknown_starttag(self, tag, attrs):
        # SGML handler: called for every opening tag without a dedicated
        # handler.  Normalizes attributes, tracks xml:base / xml:lang /
        # namespaces, buffers inline XHTML, then dispatches to a
        # _start_<prefix><tag> method if one exists.
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attribute names; lowercase rel/type values
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        # the sgml parser doesn't handle entities in attributes, but
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            # NOTE(review): this replace is a no-op as written; upstream
            # replaces '&amp;' with '&' -- the entity text appears to
            # have been lost here.  Verify against the original source.
            attrs = [(k, v.replace('&', '&')) for k, v in attrs]
        # track xml:base and xml:lang
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if type(baseuri) != type(u''):
                baseuri = unicode(baseuri, self.encoding)
                baseuri = unicode(baseuri, 'iso-8859-1')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
            # xml:lang could be explicitly set to '', we need to capture that
            # if no xml:lang is specified, use parent lang
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)
        # track namespace declarations on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)
        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg': self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
        # split the tag into a namespace prefix and local suffix
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
            prefix = prefix + '_'
        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
    def unknown_endtag(self, tag):
        # SGML handler: called for every closing tag without a dedicated
        # handler.  Mirrors unknown_starttag: dispatches to an
        # _end_<prefix><tag> method, then unwinds inline content and the
        # xml:base / xml:lang stacks.
        if _debug: sys.stderr.write('end %s\n' % tag)
        # split the tag into a namespace prefix and local suffix
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
            if self.svgOK: raise AttributeError()
            method = getattr(self, methodname)
        except AttributeError:
            self.pop(prefix + suffix)
        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)
        # track xml:base and xml:lang going out of scope
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        # the five XML-special characters are kept as references;
        # everything else is decoded to UTF-8 text
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        # the XML predefined entities stay as references
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        elif ref in self.entities.keys():
            # DTD-declared entity: recurse if it expands to a charref
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
            # otherwise decode via the HTML named-entity table
            try: name2codepoint[ref]
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
710 def handle_data(self, text, escape=1):
711 # called for each block of plain text, i.e. outside of any tag and
712 # not containing any character or entity references
713 if not self.elementstack: return
714 if escape and self.contentparams.get('type') == 'application/xhtml+xml':
715 text = _xmlescape(text)
716 self.elementstack[-1][2].append(text)
    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
    def handle_decl(self, text):
        # called for each declaration, e.g. <!DOCTYPE ...>
    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
                # CDATA block began but didn't finish
                k = len(self.rawdata)
            # feed the CDATA payload through as escaped character data
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            # not CDATA: skip to the end of the declaration
            k = self.rawdata.find('>', i)
            # We have an incomplete CDATA block.
748 def mapContentType(self, contentType):
749 contentType = contentType.lower()
750 if contentType == 'text' or contentType == 'plain':
751 contentType = 'text/plain'
752 elif contentType == 'html':
753 contentType = 'text/html'
754 elif contentType == 'xhtml':
755 contentType = 'application/xhtml+xml'
    def trackNamespace(self, prefix, uri):
        # Record a namespace declaration and sniff the feed version from
        # well-known namespace URIs (first match wins via 'not self.version').
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
        if self._matchnamespaces.has_key(loweruri):
            # known namespace: remember its canonical internal prefix
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
            self.namespacesInUse[prefix or ''] = uri
776 def resolveURI(self, uri):
777 return _urljoin(self.baseuri or '', uri)
    def decodeEntities(self, element, data):
        # Hook point: subclasses (strict vs. loose parsers) override this
        # to decode character entities in *data* for the given element.
782 def strattrs(self, attrs):
783 return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
785 def push(self, element, expectingText):
786 self.elementstack.append([element, expectingText, []])
    def pop(self, element, stripWhitespace=1):
        """Close *element*: join its buffered text fragments, run the
        post-processing pipeline (base64 decoding, relative-URI
        resolution, entity decoding, microformats, HTML sanitizing) and
        store the result in the current feed/entry context."""
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return
        element, expectingText, pieces = self.elementstack.pop()
        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            # <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
            while pieces and len(pieces)>1 and not pieces[0].strip():
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                    elif piece.startswith('<') and not piece.endswith('/>'):
                    pieces = pieces[1:-1]
        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, basestring):
                pieces[i] = v.decode('utf-8')
        output = ''.join(pieces)
            output = output.strip()
        if not expectingText: return output
        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
                output = _base64decode(output)
            except binascii.Error:
            except binascii.Incomplete:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')
        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)
        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)
        # nominally plain-text elements that actually contain markup get
        # retyped so they go through the HTML pipeline below
        if self.lookslikehtml(output):
            self.contentparams['type']='text/html'
        # remove temporary cruft from contentparams
            del self.contentparams['mode']
            del self.contentparams['base64']
        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                    self._getContext()['vcard'] = vcard
        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
        if self.encoding and type(output) != type(u''):
                output = unicode(output, self.encoding)
        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''):
                output = unicode(output.encode('iso-8859-1'), 'utf-8')
        # map win-1252 extensions to the proper code points
        if type(output) == type(u''):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
        if element == 'title' and self.hasTitle:
        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    # NOTE(review): the pattern below matches a bare '&';
                    # upstream matches '&amp;' -- escaping appears lost here.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                        self.entries[-1]['links'][-1]['href'] = output
                if element == 'description':
                self.entries[-1][element] = output
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                # NOTE(review): same bare-'&' pattern as above -- verify.
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        # Begin a content-bearing element: normalize the language tag to use
        # hyphens (RFC 3066 style) and snapshot the per-element content
        # parameters (type / language / base URI) used while popping.
        # NOTE(review): upstream feedparser also increments an `incontent`
        # counter here; a line appears to have been lost -- confirm.
        if self.lang: self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        # decide up front whether the element body is base64-encoded
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
960 def popContent(self, tag):
961 value = self.pop(tag)
963 self.contentparams.clear()
966 # a number of elements in a number of RSS variants are nominally plain
967 # text, but this is routinely ignored. This is an attempt to detect
968 # the most common cases. As false positives often result in silent
969 # data loss, this function errs on the conservative side.
970 def lookslikehtml(self, s):
971 if self.version.startswith('atom'): return
972 if self.contentparams.get('type','text/html') != 'text/plain': return
974 # must have a close tag or a entity reference to qualify
975 if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return
977 # all tags must be in a restricted subset of valid HTML tags
978 if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
979 re.findall(r'</?(\w+)',s)): return
981 # all entities must have been defined as valid HTML entities
982 from htmlentitydefs import entitydefs
983 if filter(lambda e: e not in entitydefs.keys(),
984 re.findall(r'&(\w+);',s)): return
988 def _mapToStandardPrefix(self, name):
989 colonpos = name.find(':')
991 prefix = name[:colonpos]
992 suffix = name[colonpos+1:]
993 prefix = self.namespacemap.get(prefix, prefix)
994 name = prefix + ':' + suffix
997 def _getAttribute(self, attrsD, name):
998 return attrsD.get(self._mapToStandardPrefix(name))
1000 def _isBase64(self, attrsD, contentparams):
1001 if attrsD.get('mode', '') == 'base64':
1003 if self.contentparams['type'].startswith('text/'):
1005 if self.contentparams['type'].endswith('+xml'):
1007 if self.contentparams['type'].endswith('/xml'):
1011 def _itsAnHrefDamnIt(self, attrsD):
1012 href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1022 attrsD['href'] = href
1025 def _save(self, key, value, overwrite=False):
1026 context = self._getContext()
1028 context[key] = value
1030 context.setdefault(key, value)
1032 def _start_rss(self, attrsD):
1033 versionmap = {'0.91': 'rss091u',
1037 #If we're here then this is an RSS feed.
1038 #If we don't have a version or have a version that starts with something
1039 #other than RSS then there's been a mistake. Correct it.
1040 if not self.version or not self.version.startswith('rss'):
1041 attr_version = attrsD.get('version', '')
1042 version = versionmap.get(attr_version)
1044 self.version = version
1045 elif attr_version.startswith('2.'):
1046 self.version = 'rss20'
1048 self.version = 'rss'
1050 def _start_dlhottitles(self, attrsD):
1051 self.version = 'hotrss'
1053 def _start_channel(self, attrsD):
1055 self._cdf_common(attrsD)
1056 _start_feedinfo = _start_channel
1058 def _cdf_common(self, attrsD):
1059 if attrsD.has_key('lastmod'):
1060 self._start_modified({})
1061 self.elementstack[-1][-1] = attrsD['lastmod']
1062 self._end_modified()
1063 if attrsD.has_key('href'):
1064 self._start_link({})
1065 self.elementstack[-1][-1] = attrsD['href']
1068 def _start_feed(self, attrsD):
1070 versionmap = {'0.1': 'atom01',
1073 if not self.version:
1074 attr_version = attrsD.get('version')
1075 version = versionmap.get(attr_version)
1077 self.version = version
1079 self.version = 'atom'
1081 def _end_channel(self):
1083 _end_feed = _end_channel
1085 def _start_image(self, attrsD):
1086 context = self._getContext()
1087 if not self.inentry:
1088 context.setdefault('image', FeedParserDict())
1091 self.push('image', 0)
1093 def _end_image(self):
1097 def _start_textinput(self, attrsD):
1098 context = self._getContext()
1099 context.setdefault('textinput', FeedParserDict())
1100 self.intextinput = 1
1102 self.push('textinput', 0)
1103 _start_textInput = _start_textinput
1105 def _end_textinput(self):
1106 self.pop('textinput')
1107 self.intextinput = 0
1108 _end_textInput = _end_textinput
1110 def _start_author(self, attrsD):
1112 self.push('author', 1)
1113 # Append a new FeedParserDict when expecting an author
1114 context = self._getContext()
1115 context.setdefault('authors', [])
1116 context['authors'].append(FeedParserDict())
1117 _start_managingeditor = _start_author
1118 _start_dc_author = _start_author
1119 _start_dc_creator = _start_author
1120 _start_itunes_author = _start_author
1122 def _end_author(self):
1125 self._sync_author_detail()
1126 _end_managingeditor = _end_author
1127 _end_dc_author = _end_author
1128 _end_dc_creator = _end_author
1129 _end_itunes_author = _end_author
1131 def _start_itunes_owner(self, attrsD):
1132 self.inpublisher = 1
1133 self.push('publisher', 0)
1135 def _end_itunes_owner(self):
1136 self.pop('publisher')
1137 self.inpublisher = 0
1138 self._sync_author_detail('publisher')
1140 def _start_contributor(self, attrsD):
1141 self.incontributor = 1
1142 context = self._getContext()
1143 context.setdefault('contributors', [])
1144 context['contributors'].append(FeedParserDict())
1145 self.push('contributor', 0)
1147 def _end_contributor(self):
1148 self.pop('contributor')
1149 self.incontributor = 0
1151 def _start_dc_contributor(self, attrsD):
1152 self.incontributor = 1
1153 context = self._getContext()
1154 context.setdefault('contributors', [])
1155 context['contributors'].append(FeedParserDict())
1156 self.push('name', 0)
1158 def _end_dc_contributor(self):
1160 self.incontributor = 0
1162 def _start_name(self, attrsD):
1163 self.push('name', 0)
1164 _start_itunes_name = _start_name
1166 def _end_name(self):
1167 value = self.pop('name')
1168 if self.inpublisher:
1169 self._save_author('name', value, 'publisher')
1171 self._save_author('name', value)
1172 elif self.incontributor:
1173 self._save_contributor('name', value)
1174 elif self.intextinput:
1175 context = self._getContext()
1176 context['name'] = value
1177 _end_itunes_name = _end_name
1179 def _start_width(self, attrsD):
1180 self.push('width', 0)
1182 def _end_width(self):
1183 value = self.pop('width')
1189 context = self._getContext()
1190 context['width'] = value
1192 def _start_height(self, attrsD):
1193 self.push('height', 0)
1195 def _end_height(self):
1196 value = self.pop('height')
1202 context = self._getContext()
1203 context['height'] = value
1205 def _start_url(self, attrsD):
1206 self.push('href', 1)
1207 _start_homepage = _start_url
1208 _start_uri = _start_url
1211 value = self.pop('href')
1213 self._save_author('href', value)
1214 elif self.incontributor:
1215 self._save_contributor('href', value)
1216 _end_homepage = _end_url
1219 def _start_email(self, attrsD):
1220 self.push('email', 0)
1221 _start_itunes_email = _start_email
1223 def _end_email(self):
1224 value = self.pop('email')
1225 if self.inpublisher:
1226 self._save_author('email', value, 'publisher')
1228 self._save_author('email', value)
1229 elif self.incontributor:
1230 self._save_contributor('email', value)
1231 _end_itunes_email = _end_email
1233 def _getContext(self):
1235 context = self.sourcedata
1236 elif self.inimage and self.feeddata.has_key('image'):
1237 context = self.feeddata['image']
1238 elif self.intextinput:
1239 context = self.feeddata['textinput']
1241 context = self.entries[-1]
1243 context = self.feeddata
1246 def _save_author(self, key, value, prefix='author'):
1247 context = self._getContext()
1248 context.setdefault(prefix + '_detail', FeedParserDict())
1249 context[prefix + '_detail'][key] = value
1250 self._sync_author_detail()
1251 context.setdefault('authors', [FeedParserDict()])
1252 context['authors'][-1][key] = value
1254 def _save_contributor(self, key, value):
1255 context = self._getContext()
1256 context.setdefault('contributors', [FeedParserDict()])
1257 context['contributors'][-1][key] = value
1259 def _sync_author_detail(self, key='author'):
1260 context = self._getContext()
1261 detail = context.get('%s_detail' % key)
1263 name = detail.get('name')
1264 email = detail.get('email')
1266 context[key] = '%s (%s)' % (name, email)
1270 context[key] = email
1272 author, email = context.get(key), None
1273 if not author: return
1274 emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1276 email = emailmatch.group(0)
1277 # probably a better way to do the following, but it passes all the tests
1278 author = author.replace(email, '')
1279 author = author.replace('()', '')
1280 author = author.replace('<>', '')
1281 author = author.replace('<>', '')
1282 author = author.strip()
1283 if author and (author[0] == '('):
1285 if author and (author[-1] == ')'):
1286 author = author[:-1]
1287 author = author.strip()
1289 context.setdefault('%s_detail' % key, FeedParserDict())
1291 context['%s_detail' % key]['name'] = author
1293 context['%s_detail' % key]['email'] = email
1295 def _start_subtitle(self, attrsD):
1296 self.pushContent('subtitle', attrsD, 'text/plain', 1)
1297 _start_tagline = _start_subtitle
1298 _start_itunes_subtitle = _start_subtitle
1300 def _end_subtitle(self):
1301 self.popContent('subtitle')
1302 _end_tagline = _end_subtitle
1303 _end_itunes_subtitle = _end_subtitle
1305 def _start_rights(self, attrsD):
1306 self.pushContent('rights', attrsD, 'text/plain', 1)
1307 _start_dc_rights = _start_rights
1308 _start_copyright = _start_rights
1310 def _end_rights(self):
1311 self.popContent('rights')
1312 _end_dc_rights = _end_rights
1313 _end_copyright = _end_rights
1315 def _start_item(self, attrsD):
1316 self.entries.append(FeedParserDict())
1317 self.push('item', 0)
1321 id = self._getAttribute(attrsD, 'rdf:about')
1323 context = self._getContext()
1325 self._cdf_common(attrsD)
1326 _start_entry = _start_item
1327 _start_product = _start_item
1329 def _end_item(self):
1332 _end_entry = _end_item
1334 def _start_dc_language(self, attrsD):
1335 self.push('language', 1)
1336 _start_language = _start_dc_language
1338 def _end_dc_language(self):
1339 self.lang = self.pop('language')
1340 _end_language = _end_dc_language
1342 def _start_dc_publisher(self, attrsD):
1343 self.push('publisher', 1)
1344 _start_webmaster = _start_dc_publisher
1346 def _end_dc_publisher(self):
1347 self.pop('publisher')
1348 self._sync_author_detail('publisher')
1349 _end_webmaster = _end_dc_publisher
1351 def _start_published(self, attrsD):
1352 self.push('published', 1)
1353 _start_dcterms_issued = _start_published
1354 _start_issued = _start_published
1356 def _end_published(self):
1357 value = self.pop('published')
1358 self._save('published_parsed', _parse_date(value), overwrite=True)
1359 _end_dcterms_issued = _end_published
1360 _end_issued = _end_published
1362 def _start_updated(self, attrsD):
1363 self.push('updated', 1)
1364 _start_modified = _start_updated
1365 _start_dcterms_modified = _start_updated
1366 _start_pubdate = _start_updated
1367 _start_dc_date = _start_updated
1368 _start_lastbuilddate = _start_updated
1370 def _end_updated(self):
1371 value = self.pop('updated')
1372 parsed_value = _parse_date(value)
1373 self._save('updated_parsed', parsed_value, overwrite=True)
1374 _end_modified = _end_updated
1375 _end_dcterms_modified = _end_updated
1376 _end_pubdate = _end_updated
1377 _end_dc_date = _end_updated
1378 _end_lastbuilddate = _end_updated
1380 def _start_created(self, attrsD):
1381 self.push('created', 1)
1382 _start_dcterms_created = _start_created
1384 def _end_created(self):
1385 value = self.pop('created')
1386 self._save('created_parsed', _parse_date(value), overwrite=True)
1387 _end_dcterms_created = _end_created
1389 def _start_expirationdate(self, attrsD):
1390 self.push('expired', 1)
1392 def _end_expirationdate(self):
1393 self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1395 def _start_cc_license(self, attrsD):
1396 context = self._getContext()
1397 value = self._getAttribute(attrsD, 'rdf:resource')
1398 attrsD = FeedParserDict()
1399 attrsD['rel']='license'
1400 if value: attrsD['href']=value
1401 context.setdefault('links', []).append(attrsD)
1403 def _start_creativecommons_license(self, attrsD):
1404 self.push('license', 1)
1405 _start_creativeCommons_license = _start_creativecommons_license
1407 def _end_creativecommons_license(self):
1408 value = self.pop('license')
1409 context = self._getContext()
1410 attrsD = FeedParserDict()
1411 attrsD['rel']='license'
1412 if value: attrsD['href']=value
1413 context.setdefault('links', []).append(attrsD)
1414 del context['license']
1415 _end_creativeCommons_license = _end_creativecommons_license
1417 def _addXFN(self, relationships, href, name):
1418 context = self._getContext()
1419 xfn = context.setdefault('xfn', [])
1420 value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
1421 if value not in xfn:
1424 def _addTag(self, term, scheme, label):
1425 context = self._getContext()
1426 tags = context.setdefault('tags', [])
1427 if (not term) and (not scheme) and (not label): return
1428 value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1429 if value not in tags:
1432 def _start_category(self, attrsD):
1433 if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1434 term = attrsD.get('term')
1435 scheme = attrsD.get('scheme', attrsD.get('domain'))
1436 label = attrsD.get('label')
1437 self._addTag(term, scheme, label)
1438 self.push('category', 1)
1439 _start_dc_subject = _start_category
1440 _start_keywords = _start_category
1442 def _start_media_category(self, attrsD):
1443 attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
1444 self._start_category(attrsD)
1446 def _end_itunes_keywords(self):
1447 for term in self.pop('itunes_keywords').split():
1448 self._addTag(term, 'http://www.itunes.com/', None)
1450 def _start_itunes_category(self, attrsD):
1451 self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1452 self.push('category', 1)
1454 def _end_category(self):
1455 value = self.pop('category')
1456 if not value: return
1457 context = self._getContext()
1458 tags = context['tags']
1459 if value and len(tags) and not tags[-1]['term']:
1460 tags[-1]['term'] = value
1462 self._addTag(value, None, None)
1463 _end_dc_subject = _end_category
1464 _end_keywords = _end_category
1465 _end_itunes_category = _end_category
1466 _end_media_category = _end_category
1468 def _start_cloud(self, attrsD):
1469 self._getContext()['cloud'] = FeedParserDict(attrsD)
1471 def _start_link(self, attrsD):
1472 attrsD.setdefault('rel', 'alternate')
1473 if attrsD['rel'] == 'self':
1474 attrsD.setdefault('type', 'application/atom+xml')
1476 attrsD.setdefault('type', 'text/html')
1477 context = self._getContext()
1478 attrsD = self._itsAnHrefDamnIt(attrsD)
1479 if attrsD.has_key('href'):
1480 attrsD['href'] = self.resolveURI(attrsD['href'])
1481 expectingText = self.infeed or self.inentry or self.insource
1482 context.setdefault('links', [])
1483 if not (self.inentry and self.inimage):
1484 context['links'].append(FeedParserDict(attrsD))
1485 if attrsD.has_key('href'):
1487 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1488 context['link'] = attrsD['href']
1490 self.push('link', expectingText)
1491 _start_producturl = _start_link
1493 def _end_link(self):
1494 value = self.pop('link')
1495 context = self._getContext()
1496 _end_producturl = _end_link
1498 def _start_guid(self, attrsD):
1499 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1502 def _end_guid(self):
1503 value = self.pop('id')
1504 self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1506 # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1507 # and only if the item doesn't already have a link element
1508 self._save('link', value)
1510 def _start_title(self, attrsD):
1511 if self.svgOK: return self.unknown_starttag('title', attrsD.items())
1512 self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1513 _start_dc_title = _start_title
1514 _start_media_title = _start_title
1516 def _end_title(self):
1517 if self.svgOK: return
1518 value = self.popContent('title')
1519 if not value: return
1520 context = self._getContext()
1522 _end_dc_title = _end_title
1524 def _end_media_title(self):
1525 hasTitle = self.hasTitle
1527 self.hasTitle = hasTitle
1529 def _start_description(self, attrsD):
1530 context = self._getContext()
1531 if context.has_key('summary'):
1532 self._summaryKey = 'content'
1533 self._start_content(attrsD)
1535 self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1536 _start_dc_description = _start_description
1538 def _start_abstract(self, attrsD):
1539 self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1541 def _end_description(self):
1542 if self._summaryKey == 'content':
1545 value = self.popContent('description')
1546 self._summaryKey = None
1547 _end_abstract = _end_description
1548 _end_dc_description = _end_description
1550 def _start_info(self, attrsD):
1551 self.pushContent('info', attrsD, 'text/plain', 1)
1552 _start_feedburner_browserfriendly = _start_info
1554 def _end_info(self):
1555 self.popContent('info')
1556 _end_feedburner_browserfriendly = _end_info
1558 def _start_generator(self, attrsD):
1560 attrsD = self._itsAnHrefDamnIt(attrsD)
1561 if attrsD.has_key('href'):
1562 attrsD['href'] = self.resolveURI(attrsD['href'])
1563 self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1564 self.push('generator', 1)
1566 def _end_generator(self):
1567 value = self.pop('generator')
1568 context = self._getContext()
1569 if context.has_key('generator_detail'):
1570 context['generator_detail']['name'] = value
1572 def _start_admin_generatoragent(self, attrsD):
1573 self.push('generator', 1)
1574 value = self._getAttribute(attrsD, 'rdf:resource')
1576 self.elementstack[-1][2].append(value)
1577 self.pop('generator')
1578 self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1580 def _start_admin_errorreportsto(self, attrsD):
1581 self.push('errorreportsto', 1)
1582 value = self._getAttribute(attrsD, 'rdf:resource')
1584 self.elementstack[-1][2].append(value)
1585 self.pop('errorreportsto')
1587 def _start_summary(self, attrsD):
1588 context = self._getContext()
1589 if context.has_key('summary'):
1590 self._summaryKey = 'content'
1591 self._start_content(attrsD)
1593 self._summaryKey = 'summary'
1594 self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1595 _start_itunes_summary = _start_summary
1597 def _end_summary(self):
1598 if self._summaryKey == 'content':
1601 self.popContent(self._summaryKey or 'summary')
1602 self._summaryKey = None
1603 _end_itunes_summary = _end_summary
1605 def _start_enclosure(self, attrsD):
1606 attrsD = self._itsAnHrefDamnIt(attrsD)
1607 context = self._getContext()
1608 attrsD['rel']='enclosure'
1609 context.setdefault('links', []).append(FeedParserDict(attrsD))
1611 def _start_source(self, attrsD):
1613 # This means that we're processing a source element from an RSS 2.0 feed
1614 self.sourcedata['href'] = attrsD[u'url']
1615 self.push('source', 1)
1619 def _end_source(self):
1621 value = self.pop('source')
1623 self.sourcedata['title'] = value
1624 self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1625 self.sourcedata.clear()
1627 def _start_content(self, attrsD):
1628 self.pushContent('content', attrsD, 'text/plain', 1)
1629 src = attrsD.get('src')
1631 self.contentparams['src'] = src
1632 self.push('content', 1)
1634 def _start_prodlink(self, attrsD):
1635 self.pushContent('content', attrsD, 'text/html', 1)
1637 def _start_body(self, attrsD):
1638 self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1639 _start_xhtml_body = _start_body
1641 def _start_content_encoded(self, attrsD):
1642 self.pushContent('content', attrsD, 'text/html', 1)
1643 _start_fullitem = _start_content_encoded
1645 def _end_content(self):
1646 copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1647 value = self.popContent('content')
1649 self._save('summary', value)
1651 _end_body = _end_content
1652 _end_xhtml_body = _end_content
1653 _end_content_encoded = _end_content
1654 _end_fullitem = _end_content
1655 _end_prodlink = _end_content
1657 def _start_itunes_image(self, attrsD):
1658 self.push('itunes_image', 0)
1659 if attrsD.get('href'):
1660 self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1661 _start_itunes_link = _start_itunes_image
1663 def _end_itunes_block(self):
1664 value = self.pop('itunes_block', 0)
1665 self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1667 def _end_itunes_explicit(self):
1668 value = self.pop('itunes_explicit', 0)
1669 # Convert 'yes' -> True, 'clean' to False, and any other value to None
1670 # False and None both evaluate as False, so the difference can be ignored
1671 # by applications that only need to know if the content is explicit.
1672 self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1674 def _start_media_content(self, attrsD):
1675 context = self._getContext()
1676 context.setdefault('media_content', [])
1677 context['media_content'].append(attrsD)
1679 def _start_media_thumbnail(self, attrsD):
1680 context = self._getContext()
1681 context.setdefault('media_thumbnail', [])
1682 self.push('url', 1) # new
1683 context['media_thumbnail'].append(attrsD)
1685 def _end_media_thumbnail(self):
1686 url = self.pop('url')
1687 context = self._getContext()
1688 if url != None and len(url.strip()) != 0:
1689 if not context['media_thumbnail'][-1].has_key('url'):
1690 context['media_thumbnail'][-1]['url'] = url
1692 def _start_media_player(self, attrsD):
1693 self.push('media_player', 0)
1694 self._getContext()['media_player'] = FeedParserDict(attrsD)
1696 def _end_media_player(self):
1697 value = self.pop('media_player')
1698 context = self._getContext()
1699 context['media_player']['content'] = value
1701 def _start_newlocation(self, attrsD):
1702 self.push('newlocation', 1)
1704 def _end_newlocation(self):
1705 url = self.pop('newlocation')
1706 context = self._getContext()
1707 # don't set newlocation if the context isn't right
1708 if context is not self.feeddata:
1710 context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    # SAX ContentHandler driving _FeedParserMixin; used only while the feed
    # parses as well-formed XML.
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        # NOTE(review): initialization of self.bozo / self.exc / self.decls
        # appears to be missing in this copy; startPrefixMapping below writes
        # to self.decls -- confirm against upstream.

    def startPrefixMapping(self, prefix, uri):
        # remember every declared namespace; xlink declarations are queued in
        # self.decls so they can be re-emitted on embedded SVG/MathML
        self.trackNamespace(prefix, uri)
        if uri == 'http://www.w3.org/1999/xlink':
            self.decls['xmlns:'+prefix] = uri

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        # NOTE(review): an `else: givenprefix = None` branch appears to be
        # missing here.
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD, self.decls = self.decls, {}
        # embedded MathML/SVG must carry their namespace declaration inline
        if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
            attrsD['xmlns']=namespace
        if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
            attrsD['xmlns']=namespace
        # NOTE(review): an `if prefix:` guard appears to be missing before the
        # next line, leaving the elif below dangling -- confirm upstream.
            localname = prefix.lower() + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
        # NOTE(review): a `break` after the match appears to be missing.
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            # NOTE(review): an `if prefix:` guard appears to be missing here.
            attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        # forward character data to the mixin's shared handler
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        # NOTE(review): an `else: givenprefix = ''` branch appears missing.
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        # NOTE(review): an `if prefix:` guard appears to be missing here,
        # leaving the elif below dangling -- confirm upstream.
            localname = prefix + ':' + localname
        elif namespace and not qname: #Expat
            for name,value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # NOTE(review): body missing in this copy; upstream records the
        # exception (bozo flag) and continues -- confirm.

    def fatalError(self, exc):
        # NOTE(review): body missing in this copy; upstream calls
        # self.error(exc) and re-raises -- confirm.
1804 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1805 special = re.compile('''[<>'"]''')
1806 bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
1807 elements_no_end_tag = [
1808 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
1809 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
1810 'source', 'track', 'wbr'
1813 def __init__(self, encoding, _type):
1814 self.encoding = encoding
1816 if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1817 sgmllib.SGMLParser.__init__(self)
1821 sgmllib.SGMLParser.reset(self)
1823 def _shorttag_replace(self, match):
1824 tag = match.group(1)
1825 if tag in self.elements_no_end_tag:
1826 return '<' + tag + ' />'
1828 return '<' + tag + '></' + tag + '>'
1830 def parse_starttag(self,i):
1831 j=sgmllib.SGMLParser.parse_starttag(self, i)
1832 if self._type == 'application/xhtml+xml':
1833 if j>2 and self.rawdata[j-2:j]=='/>':
1834 self.unknown_endtag(self.lasttag)
1837 def feed(self, data):
1838 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
1839 #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1840 data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
1841 data = data.replace(''', "'")
1842 data = data.replace('"', '"')
1847 self.encoding = self.encoding + '_INVALID_PYTHON_3'
1849 if self.encoding and type(data) == type(u''):
1850 data = data.encode(self.encoding)
1851 sgmllib.SGMLParser.feed(self, data)
1852 sgmllib.SGMLParser.close(self)
1854 def normalize_attrs(self, attrs):
1855 if not attrs: return attrs
1856 # utility method to be called by descendants
1857 attrs = dict([(k.lower(), v) for k, v in attrs]).items()
1858 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1862 def unknown_starttag(self, tag, attrs):
1863 # called for each start tag
1864 # attrs is a list of (attr, value) tuples
1865 # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1866 if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1870 for key, value in attrs:
1871 value=value.replace('>','>').replace('<','<').replace('"','"')
1872 value = self.bare_ampersand.sub("&", value)
1873 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1874 if type(value) != type(u''):
1876 value = unicode(value, self.encoding)
1878 value = unicode(value, 'iso-8859-1')
1880 # Currently, in Python 3 the key is already a str, and cannot be decoded again
1881 uattrs.append((unicode(key, self.encoding), value))
1883 uattrs.append((key, value))
1884 strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
1887 strattrs=strattrs.encode(self.encoding)
1890 if tag in self.elements_no_end_tag:
1891 self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1893 self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1895 def unknown_endtag(self, tag):
1896 # called for each end tag, e.g. for </pre>, tag will be 'pre'
1897 # Reconstruct the original end tag.
1898 if tag not in self.elements_no_end_tag:
1899 self.pieces.append("</%(tag)s>" % locals())
1901 def handle_charref(self, ref):
1902 # called for each character reference, e.g. for ' ', ref will be '160'
1903 # Reconstruct the original character reference.
1904 if ref.startswith('x'):
1905 value = unichr(int(ref[1:],16))
1907 value = unichr(int(ref))
1909 if value in _cp1252.keys():
1910 self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
1912 self.pieces.append('&#%(ref)s;' % locals())
1914 def handle_entityref(self, ref):
1915 # called for each entity reference, e.g. for '©', ref will be 'copy'
1916 # Reconstruct the original entity reference.
1917 if name2codepoint.has_key(ref):
1918 self.pieces.append('&%(ref)s;' % locals())
1920 self.pieces.append('&%(ref)s' % locals())
1922 def handle_data(self, text):
1923 # called for each block of plain text, i.e. outside of any tag and
1924 # not containing any character or entity references
1925 # Store the original text verbatim.
1926 if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
1927 self.pieces.append(text)
1929 def handle_comment(self, text):
1930 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1931 # Reconstruct the original comment.
1932 self.pieces.append('<!--%(text)s-->' % locals())
1934 def handle_pi(self, text):
1935 # called for each processing instruction, e.g. <?instruction>
1936 # Reconstruct original processing instruction.
1937 self.pieces.append('<?%(text)s>' % locals())
1939 def handle_decl(self, text):
1940 # called for the DOCTYPE, if present, e.g.
1941 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1942 # "http://www.w3.org/TR/html4/loose.dtd">
1943 # Reconstruct original DOCTYPE
1944 self.pieces.append('<!%(text)s>' % locals())
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
    # Scan self.rawdata for the name of a declaration starting at offset i.
    # Returns (lowercased name, end offset), or (None, -1) when the buffer
    # ends before the name can be known to be complete.
    # NOTE: the garbled listing had dropped the length/'if m:'/'else' lines;
    # restored from the upstream feedparser source.
    rawdata = self.rawdata
    n = len(rawdata)
    if i == n:
        return None, -1
    m = self._new_declname_match(rawdata, i)
    if m:
        s = m.group()
        name = s.strip()
        # if the match runs to the very end of the buffer, the name may be
        # truncated -- wait for more data
        if (i + len(s)) == n:
            return None, -1 # end of buffer
        return name.lower(), m.end()
    else:
        self.handle_data(rawdata)
#        self.updatepos(declstartpos, i)
        return None, -1
def convert_charref(self, name):
    # Render a numeric character reference from its reference string.
    return '&#' + name + ';'
def convert_entityref(self, name):
    # Render a named entity reference from its entity name.
    return '&' + name + ';'
def output(self):
    '''Return processed HTML as a single string'''
    # NOTE: the 'def output(self):' header line was missing from the garbled
    # listing; restored.  Each accumulated piece is coerced with str() before
    # joining, since handlers may append non-string fragments.
    return ''.join([str(p) for p in self.pieces])
def parse_declaration(self, i):
    # Delegate to sgmllib's declaration parser.  If the declaration is
    # malformed, escape its leading '<' and resume parsing just after it.
    # NOTE: the garbled listing had dropped the 'try:'/'return i+1' lines and
    # decoded the '&lt;' literal to '<'; restored from upstream feedparser.
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # escape the doctype declaration and continue parsing
        self.handle_data('&lt;')
        return i+1
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """Feed parser for ill-formed (non-XML) feeds, driving the sgmllib-based
    _BaseHTMLProcessor instead of a strict XML parser."""

    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        # Normalize numeric references for the five XML special characters to
        # their named-entity forms; in non-XML content, decode the named
        # entities all the way to literal characters.
        # NOTE: the garbled listing had collapsed every entity literal (e.g.
        # "data.replace('&#60;', '&lt;')" became the no-op
        # "data.replace('<', '<')") and dropped the final return; restored
        # from the upstream feedparser source.
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data

    def strattrs(self, attrs):
        # Serialize (name, value) attribute pairs, escaping embedded quotes.
        # NOTE: the listing's v.replace('"','"') was a garbled no-op; the
        # replacement target is the '&quot;' entity.
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])
# NOTE(review): this listing embeds original-file line numbers and has gaps --
# several statements are absent (e.g. the STRING/DATE/URI/NODE/EMAIL
# property-type constants referenced below as self.STRING etc., various
# if/else and return lines, and the 'def findTags'/'def findXFN' headers).
# Code is left byte-identical; recover the missing lines from upstream
# feedparser before running.  Parses hCard/rel-tag/rel-enclosure/XFN
# microformats out of an HTML document via BeautifulSoup.
2013 class _MicroformatsParser:
# rel values recognized as XFN (XHTML Friends Network) relationships
2020 known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
# file extensions treated as downloadable binaries by isProbablyDownloadable
2021 known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
# Parse `data` with BeautifulSoup; keep base URI and encoding for later
# resolution/decoding.  (Initializers for self.tags/self.xfn appear to be
# among the dropped lines.)
2023 def __init__(self, data, baseuri, encoding):
2024 self.document = BeautifulSoup.BeautifulSoup(data)
2025 self.baseuri = baseuri
2026 self.encoding = encoding
2027 if type(data) == type(u''):
2028 data = data.encode(encoding)
2030 self.enclosures = []
# Backslash-escape vCard-significant characters (comma, semicolon, newline).
2034 def vcardEscape(self, s):
2035 if type(s) in (type(''), type(u'')):
2036 s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
# Fold a vCard line per RFC 2426 (continuation lines; iMax/sPrefix/sFolded
# setup lines appear to be missing from this listing).
2039 def vcardFold(self, s):
2040 s = re.sub(';+$', '', s)
2044 while len(s) > iMax:
2045 sFolded += sPrefix + s[:iMax] + '\n'
2049 sFolded += sPrefix + s
# Collapse runs of whitespace to single spaces and trim.
2052 def normalize(self, s):
2053 return re.sub(r'\s+', ' ', s).strip()
# Order-preserving de-duplication ('results = []' line missing here).
2055 def unique(self, aList):
2057 for element in aList:
2058 if element not in results:
2059 results.append(element)
# Format a struct_time as an ISO 8601 UTC timestamp.
2062 def toISO8601(self, dt):
2063 return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
# Core property extractor: find elements under elmRoot whose class matches
# sProperty and coerce them per iPropertyType (default 4 -- presumably
# self.NODE; constants not visible in this listing -- TODO confirm).
2065 def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
2067 sProperty = sProperty.lower()
2070 propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
2071 if bAllowMultiple and (iPropertyType != self.NODE):
2073 containers = elmRoot(['ul', 'ol'], propertyMatch)
2074 for container in containers:
2075 snapResults.extend(container('li'))
2076 bFound = (len(snapResults) != 0)
2078 snapResults = elmRoot(all, propertyMatch)
2079 bFound = (len(snapResults) != 0)
2080 if (not bFound) and (sProperty == 'value'):
2081 snapResults = elmRoot('pre')
2082 bFound = (len(snapResults) != 0)
2083 bNormalize = not bFound
2085 snapResults = [elmRoot]
2086 bFound = (len(snapResults) != 0)
# For vcard lookups, filter out nested vcards so inner cards don't leak
# properties into the outer one.
2088 if sProperty == 'vcard':
2089 snapFilter = elmRoot(all, propertyMatch)
2090 for node in snapFilter:
2091 if node.findParent(all, propertyMatch):
2092 arFilter.append(node)
2094 for node in snapResults:
2095 if node not in arFilter:
2096 arResults.append(node)
2097 bFound = (len(arResults) != 0)
# Empty-result defaults, keyed by requested property type.
2099 if bAllowMultiple: return []
2100 elif iPropertyType == self.STRING: return ''
2101 elif iPropertyType == self.DATE: return None
2102 elif iPropertyType == self.URI: return ''
2103 elif iPropertyType == self.NODE: return None
2106 for elmResult in arResults:
2108 if iPropertyType == self.NODE:
2110 arValues.append(elmResult)
2114 sNodeName = elmResult.name.lower()
# Value extraction fallbacks: mailto href, abbr@title, URI attributes,
# img@alt, then rendered text content.
2115 if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
2116 sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
2118 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2119 if (not sValue) and (sNodeName == 'abbr'):
2120 sValue = elmResult.get('title')
2122 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2123 if (not sValue) and (iPropertyType == self.URI):
2124 if sNodeName == 'a': sValue = elmResult.get('href')
2125 elif sNodeName == 'img': sValue = elmResult.get('src')
2126 elif sNodeName == 'object': sValue = elmResult.get('data')
2128 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2129 if (not sValue) and (sNodeName == 'img'):
2130 sValue = elmResult.get('alt')
2132 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2134 sValue = elmResult.renderContents()
2135 sValue = re.sub(r'<\S[^>]*>', '', sValue)
2136 sValue = sValue.replace('\r\n', '\n')
2137 sValue = sValue.replace('\r', '\n')
2139 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2140 if not sValue: continue
2141 if iPropertyType == self.DATE:
2142 sValue = _parse_date_iso8601(sValue)
2144 arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
2146 return bAutoEscape and self.vcardEscape(sValue) or sValue
# Walk hCard elements under elmRoot and serialize them as RFC 2426 vCard
# 3.0 text.  bAgentParsing=1 is used for embedded AGENT cards.
2149 def findVCards(self, elmRoot, bAgentParsing=0):
2152 if not bAgentParsing:
2153 arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
2157 for elmCard in arCards:
# helper: single string-valued property -> one folded vCard line
2160 def processSingleString(sProperty):
2161 sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
2163 arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
2164 return sValue or u''
# helper: single URI-valued property, with data: URI inlining
2166 def processSingleURI(sProperty):
2167 sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
2172 if sValue.startswith('data:'):
2173 sEncoding = ';ENCODING=b'
2174 sContentType = sValue.split(';')[0].split('/').pop()
2175 sValue = sValue.split(',', 1).pop()
2177 elmValue = self.getPropertyValue(elmCard, sProperty)
2179 if sProperty != 'url':
2180 sValueKey = ';VALUE=uri'
2181 sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
2182 sContentType = sContentType.upper()
2183 if sContentType == 'OCTET-STREAM':
2186 sContentType = ';TYPE=' + sContentType.upper()
2187 arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
# helper: TYPE=... parameterized properties (tel, email, label)
2189 def processTypeValue(sProperty, arDefaultType, arForceType=None):
2190 arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
2191 for elmResult in arResults:
2192 arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
2194 arType = self.unique(arForceType + arType)
2196 arType = arDefaultType
2197 sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
2199 arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
# AGENT
2202 # must do this before all other properties because it is destructive
2203 # (removes nested class="vcard" nodes so they don't interfere with
2204 # this vcard's other properties)
2205 arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
2206 for elmAgent in arAgent:
2207 if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
2208 sAgentValue = self.findVCards(elmAgent, 1) + '\n'
2209 sAgentValue = sAgentValue.replace('\n', '\\n')
2210 sAgentValue = sAgentValue.replace(';', '\\;')
2212 arLines.append(self.vcardFold('AGENT:' + sAgentValue))
2213 # Completely remove the agent element from the parse tree
2216 sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
2218 arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
# FN (formatted name) and structured N
2221 sFN = processSingleString('fn')
2224 elmName = self.getPropertyValue(elmCard, 'n')
2226 sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
2227 sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
2228 arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
2229 arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
2230 arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
2231 arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
2233 ','.join(arAdditionalNames) + ';' +
2234 ','.join(arHonorificPrefixes) + ';' +
2235 ','.join(arHonorificSuffixes)))
2237 # implied "N" optimization
2238 # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
2239 arNames = self.normalize(sFN).split()
2240 if len(arNames) == 2:
# heuristic: "Last, First", a one-letter second token, or an initial
# like "Q." means the family name came first
2241 bFamilyNameFirst = (arNames[0].endswith(',') or
2242 len(arNames[1]) == 1 or
2243 ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
2244 if bFamilyNameFirst:
2245 arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
2247 arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
# SORT-STRING
2250 sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
2252 arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
# NICKNAME
2255 arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
2257 arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
# PHOTO
2260 processSingleURI('photo')
# BDAY
2263 dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
2265 arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
# ADR (address)
2268 arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
2269 for elmAdr in arAdr:
2270 arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
2272 arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
2273 sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
2274 sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
2275 sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
2276 sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
2277 sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
2278 sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
2279 sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
2280 arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
2281 sPostOfficeBox + ';' +
2282 sExtendedAddress + ';' +
2283 sStreetAddress + ';' +
# LABEL
2290 processTypeValue('label', ['intl','postal','parcel','work'])
2292 # TEL (phone number)
2293 processTypeValue('tel', ['voice'])
# EMAIL
2296 processTypeValue('email', ['internet'], ['internet'])
# MAILER
2299 processSingleString('mailer')
# TZ (timezone)
2302 processSingleString('tz')
2304 # GEO (geographical information)
2305 elmGeo = self.getPropertyValue(elmCard, 'geo')
2307 sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
2308 sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
2309 arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
# TITLE
2312 processSingleString('title')
# ROLE
2315 processSingleString('role')
# LOGO
2318 processSingleURI('logo')
2320 # ORG (organization)
2321 elmOrg = self.getPropertyValue(elmCard, 'org')
2323 sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
2324 if not sOrganizationName:
2325 # implied "organization-name" optimization
2326 # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
2327 sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
2328 if sOrganizationName:
2329 arLines.append(self.vcardFold('ORG:' + sOrganizationName))
2331 arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
2332 arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
# CATEGORIES
2335 arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
2337 arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
# NOTE
2340 processSingleString('note')
# REV
2343 processSingleString('rev')
# SOUND
2346 processSingleURI('sound')
# UID
2349 processSingleString('uid')
# URL
2352 processSingleURI('url')
# CLASS
2355 processSingleString('class')
# KEY
2358 processSingleURI('key')
# wrap this card and append it to the accumulated output
2361 arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
2362 sVCards += u'\n'.join(arLines) + u'\n'
2364 return sVCards.strip()
# Heuristic: does this anchor look like a direct link to a binary download?
# (audio/video/application MIME type, or a known binary file extension)
2366 def isProbablyDownloadable(self, elm):
2367 attrsD = elm.attrMap
2368 if not attrsD.has_key('href'): return 0
2369 linktype = attrsD.get('type', '').strip()
2370 if linktype.startswith('audio/') or \
2371 linktype.startswith('video/') or \
2372 (linktype.startswith('application/') and not linktype.endswith('xml')):
2374 path = urlparse.urlparse(attrsD['href'])[2]
2375 if path.find('.') == -1: return 0
2376 fileext = path.split('.').pop().lower()
2377 return fileext in self.known_binary_extensions
# NOTE(review): the 'def findTags(self):' header appears to be among the
# dropped lines -- the loop below collects rel-tag links into self.tags.
2381 for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
2382 href = elm.get('href')
2383 if not href: continue
2384 urlscheme, domain, path, params, query, fragment = \
2385 urlparse.urlparse(_urljoin(self.baseuri, href))
2386 segments = path.split('/')
2387 tag = segments.pop()
2389 tag = segments.pop()
2390 tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
2391 if not tagscheme.endswith('/'):
2393 self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
# Collect rel-enclosure links (or probable binary downloads) into
# self.enclosures, de-duplicated by attribute map.
2395 def findEnclosures(self):
2397 enclosure_match = re.compile(r'\benclosure\b')
2398 for elm in self.document(all, {'href': re.compile(r'.+')}):
2399 if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
2400 if elm.attrMap not in self.enclosures:
2401 self.enclosures.append(elm.attrMap)
2402 if elm.string and not elm.get('title'):
2403 self.enclosures[-1]['title'] = elm.string
# NOTE(review): the 'def findXFN(self):' header appears to be among the
# dropped lines -- the loop below collects known XFN rel values per link.
2407 for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
2408 rels = elm.get('rel', '').split()
2411 if rel in self.known_xfn_relationships:
2412 xfn_rels.append(rel)
2414 self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
def _parseMicroformats(htmlSource, baseURI, encoding):
    # Extract microformats (hCard, rel-tag, rel-enclosure, XFN) from an HTML
    # document.  Returns None when BeautifulSoup is unavailable or the source
    # cannot be parsed; otherwise a dict of the collected structures.
    # NOTE: the garbled listing had dropped the 'try:'/'return' lines and the
    # find* calls; restored from the upstream feedparser source.
    if not BeautifulSoup: return
    if _debug: sys.stderr.write('entering _parseMicroformats\n')
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
class _RelativeURIResolver(_BaseHTMLProcessor):
    # (tag, attribute) pairs whose values are URIs and must be resolved
    # against the document base URI.
    # NOTE: the garbled listing had dropped several entries; restored from
    # the upstream feedparser source.
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Join against the base URI and reject unacceptable schemes.
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        # Resolve every URI-valued attribute of this tag before re-emitting.
        if _debug:
            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    # Rewrite all relative URIs in htmlSource to absolute ones against
    # baseURI and return the processed HTML.
    # NOTE: the garbled listing had dropped the 'if _debug:' header and the
    # feed/return lines; restored from the upstream feedparser source.
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()
def _makeSafeAbsoluteURI(base, rel=None):
    # Join base and rel into an absolute URI, returning u'' when the result's
    # scheme is not in ACCEPTABLE_URI_SCHEMES.
    # NOTE: the garbled listing had dropped the 'if rel is None:' branch and
    # the return statements; restored from the upstream feedparser source.
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if rel is None:
        # only a base was given: accept it if its scheme is safe (or absent)
        scheme = urlparse.urlparse(base)[0]
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
# NOTE(review): this listing embeds original-file line numbers and has gaps --
# e.g. the 'def reset(self):' header above _BaseHTMLProcessor.reset, the
# svgOK/mathmlOK/svg_attr_map initializers, the closing bracket of
# acceptable_svg_properties, and various else/return lines are absent.
# Code is left byte-identical; recover the missing lines from upstream
# feedparser before running.  Whitelist-based HTML sanitizer: strips any
# element, attribute, or CSS property not in the lists below.
2496 class _HTMLSanitizer(_BaseHTMLProcessor):
# HTML elements allowed through the sanitizer
2497 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
2498 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2499 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2500 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2501 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2502 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2503 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2504 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2505 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2506 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2507 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
2508 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
2509 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
# attributes allowed on whitelisted HTML elements
2511 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
2512 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2513 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2514 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2515 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2516 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2517 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2518 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2519 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2520 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2521 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2522 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2523 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2524 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2525 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2526 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
2527 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
2528 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
2529 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
2530 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
2531 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
# elements whose entire content is suppressed until the matching end tag
2534 unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
# CSS properties allowed by sanitize_style
2536 acceptable_css_properties = ['azimuth', 'background-color',
2537 'border-bottom-color', 'border-collapse', 'border-color',
2538 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2539 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2540 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2541 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2542 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2543 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2544 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2545 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2546 'white-space', 'width']
2548 # survey of common keywords found in feeds
2549 acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
2550 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2551 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2552 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2553 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2554 'transparent', 'underline', 'white', 'yellow']
# values accepted for shorthand properties: colors and simple measurements
2556 valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2557 '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
# MathML elements allowed once a namespaced <math> has been seen
2559 mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
2560 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
2561 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
2562 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
2563 'munderover', 'none', 'semantics']
2565 mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
2566 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
2567 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
2568 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
2569 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
2570 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
2571 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
2572 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
2573 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
2575 # svgtiny - foreignObject + linearGradient + radialGradient + stop
2576 svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
2577 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2578 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2579 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2580 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2581 'svg', 'switch', 'text', 'title', 'tspan', 'use']
2583 # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2584 svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
2585 'arabic-form', 'ascent', 'attributeName', 'attributeType',
2586 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2587 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2588 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2589 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2590 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2591 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2592 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2593 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2594 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2595 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2596 'overline-position', 'overline-thickness', 'panose-1', 'path',
2597 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2598 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2599 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2600 'stop-color', 'stop-opacity', 'strikethrough-position',
2601 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2602 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2603 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2604 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2605 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2606 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2607 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2608 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2609 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
# CSS properties permitted inside SVG content (closing bracket missing here)
2615 acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
2616 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
# NOTE(review): 'def reset(self):' header and the svgOK/mathmlOK/
# svg_attr_map/svg_elem_map initializers appear to be among the dropped
# lines; unacceptablestack counts currently-open suppressed elements.
2620 _BaseHTMLProcessor.reset(self)
2621 self.unacceptablestack = 0
# Filter each start tag and its attributes against the whitelists; track
# MathML/SVG context via mathmlOK/svgOK nesting counters.
2625 def unknown_starttag(self, tag, attrs):
2626 acceptable_attributes = self.acceptable_attributes
2628 if not tag in self.acceptable_elements or self.svgOK:
2629 if tag in self.unacceptable_elements_with_end_tag:
2630 self.unacceptablestack += 1
2632 # add implicit namespaces to html5 inline svg/mathml
2633 if self._type.endswith('html'):
2634 if not dict(attrs).get('xmlns'):
2636 attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2638 attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2640 # not otherwise acceptable, perhaps it is MathML or SVG?
2641 if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2643 if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2646 # chose acceptable attributes based on tag class, else bail
2647 if self.mathmlOK and tag in self.mathml_elements:
2648 acceptable_attributes = self.mathml_attributes
2649 elif self.svgOK and tag in self.svg_elements:
2650 # for most vocabularies, lowercasing is a good idea. Many
2651 # svg elements, however, are camel case
# lazily build lowercase->camelCase maps for SVG names (built once)
2652 if not self.svg_attr_map:
2653 lower=[attr.lower() for attr in self.svg_attributes]
2654 mix=[a for a in self.svg_attributes if a not in lower]
2655 self.svg_attributes = lower
2656 self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2658 lower=[attr.lower() for attr in self.svg_elements]
2659 mix=[a for a in self.svg_elements if a not in lower]
2660 self.svg_elements = lower
2661 self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2662 acceptable_attributes = self.svg_attributes
2663 tag = self.svg_elem_map.get(tag,tag)
2664 keymap = self.svg_attr_map
2665 elif not tag in self.acceptable_elements:
2668 # declare xlink namespace, if needed
2669 if self.mathmlOK or self.svgOK:
2670 if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2671 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2672 attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
# keep only whitelisted attributes; URI values are scheme-checked and
# style attributes are run through sanitize_style
2675 for key, value in self.normalize_attrs(attrs):
2676 if key in acceptable_attributes:
2677 key=keymap.get(key,key)
2678 # make sure the uri uses an acceptable uri scheme
2680 value = _makeSafeAbsoluteURI(value)
2681 clean_attrs.append((key,value))
2683 clean_value = self.sanitize_style(value)
2684 if clean_value: clean_attrs.append((key,clean_value))
2685 _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
# Mirror of unknown_starttag for end tags: pop suppression/nesting counters
# and drop end tags of non-whitelisted elements.
2687 def unknown_endtag(self, tag):
2688 if not tag in self.acceptable_elements:
2689 if tag in self.unacceptable_elements_with_end_tag:
2690 self.unacceptablestack -= 1
2691 if self.mathmlOK and tag in self.mathml_elements:
2692 if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
2693 elif self.svgOK and tag in self.svg_elements:
2694 tag = self.svg_elem_map.get(tag,tag)
2695 if tag == 'svg' and self.svgOK: self.svgOK -= 1
2698 _BaseHTMLProcessor.unknown_endtag(self, tag)
# Processing instructions and declarations are dropped entirely (bodies
# appear to be among the missing lines).
2700 def handle_pi(self, text):
2703 def handle_decl(self, text):
# Text inside a suppressed element (script/applet/style) is discarded.
2706 def handle_data(self, text):
2707 if not self.unacceptablestack:
2708 _BaseHTMLProcessor.handle_data(self, text)
# Sanitize a style="" attribute value: strip url(...), validate overall
# shape, then keep only whitelisted properties/keywords.
2710 def sanitize_style(self, style):
2712 style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2715 if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
2716 # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2717 if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
2720 for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2721 if not value: continue
2722 if prop.lower() in self.acceptable_css_properties:
2723 clean.append(prop + ': ' + value + ';')
2724 elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2725 for keyword in value.split():
2726 if not keyword in self.acceptable_css_keywords and \
2727 not self.valid_css_values.match(keyword):
2730 clean.append(prop + ': ' + value + ';')
2731 elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2732 clean.append(prop + ': ' + value + ';')
2734 return ' '.join(clean)
# Guard against unclosed/malformed comments being used to smuggle markup
# past the sanitizer.
2736 def parse_comment(self, i, report=1):
2737 ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2740 # if ret == -1, this may be a malicious attempt to circumvent
2741 # sanitization, or a page-destroying unclosed comment
2742 match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2745 # unclosed comment; deliberately fail to handle_data()
2746 return len(self.rawdata)
def _sanitizeHTML(htmlSource, encoding, _type):
    # Run htmlSource through the _HTMLSanitizer whitelist, optionally pass
    # the result through an installed Tidy interface, and return the
    # cleaned markup.
    # NOTE: the garbled listing had collapsed the CDATA escape into a no-op
    # (replace('<![CDATA[','<![CDATA[')) and dropped the feed/output lines,
    # the TIDY_MARKUP guard and the try/except/break control flow of the
    # tidy-interface loop; restored from the upstream feedparser source.
    p = _HTMLSanitizer(encoding, _type)
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                # this tidy interface is not installed; try the next one
                pass
        if _tidy:
            # tidy operates on byte strings; round-trip through utf-8
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # strip the <body> wrapper tidy adds around the fragment
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that follows redirects, records the final HTTP status
    on the returned file-like object, and retries basic-auth failures with
    digest auth when the server demands it.

    NOTE: the garbled listing had dropped the 'else:' branches and the
    'return infourl'/'return retry' lines; restored from the upstream
    feedparser source."""

    def http_error_default(self, req, fp, code, msg, headers):
        # treat any unexpected 3xx (except 304 Not Modified) as a redirect
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        # follow the redirect only when a Location header is present;
        # either way, expose the original status on the result
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            # anything goes wrong (no basic creds, old Python, bad header):
            # fall back to the default error handling
            return self.http_error_default(req, fp, code, msg, headers)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.
    """
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the stdin branch body and the try/except wrappers around the
    # user:password split, the urllib2 open, and the native open() are
    # absent. Code preserved as-is; confirm against the full file.

    # Already a file-like object -- use it directly.
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        # NOTE(review): the body of this branch (presumably returning
        # stdin) appears to be missing (gap).

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]

        # test for inline user:password for basic auth
        urltype, rest = urllib.splittype(url_file_stream_or_string)
        realhost, rest = urllib.splithost(rest)
        user_passwd, realhost = urllib.splituser(realhost)
        # Rebuild the URL without the credentials; they travel in the
        # Authorization header instead.
        url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
        auth = base64.standard_b64encode(user_passwd).strip()

        # IDNA-encode the URL so non-ASCII hostnames are wire-safe.
        if isinstance(url_file_stream_or_string,unicode):
            url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8')
        # NOTE(review): an `else:` line appears to be missing here (gap);
        # as written the next line always runs and re-encodes the URL.
        url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8')

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        return opener.open(request)
        # NOTE(review): the close below is unreachable as written; it
        # presumably lived in a `finally:` clause in the original (gap).
        opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    return open(url_file_stream_or_string, 'rb')
    # NOTE(review): the fallback below is unreachable as written; it was
    # presumably guarded by a try/except around the open() call (gap).
    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    """Build a urllib2.Request for *url* carrying the conditional-GET
    headers (If-None-Match / If-Modified-Since), identification headers
    (User-Agent, Referer, Authorization) and content negotiation
    (Accept-encoding, Accept, A-IM), plus any caller-supplied headers.
    """
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the `if etag:` / `if modified:` / `if referrer:` / `if auth:` /
    # `if ACCEPT_HEADER:` guards, the gzip/zlib availability checks around
    # the Accept-encoding variants, and the final `return request` are
    # absent. Code preserved as-is; confirm against the full file.
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    request.add_header('If-None-Match', etag)
    # Accept either a date string or a datetime for `modified` and reduce
    # both to a time tuple.
    if type(modified) == type(''):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    # format into an RFC 1123-compliant timestamp. We can't use
    # time.strftime() since the %a and %b directives can be affected
    # by the current locale, but RFC 2616 states that dates must be
    # in English -- hence the hard-coded name tables below.
    short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    request.add_header('Referer', referrer)
    # NOTE(review): the four Accept-encoding variants below were
    # presumably alternative branches selected by gzip/zlib availability;
    # the guard lines appear to be missing (gap).
    request.add_header('Accept-encoding', 'gzip, deflate')
    request.add_header('Accept-encoding', 'gzip')
    request.add_header('Accept-encoding', 'deflate')
    request.add_header('Accept-encoding', '')
    request.add_header('Authorization', 'Basic %s' % auth)
    request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    # NOTE(review): a `return request` appears to be missing here (gap).
def registerDateHandler(func):
    """Prepend *func* to the registry of date handler functions.

    A handler takes a date string and returns a 9-tuple date in GMT.
    Handlers registered later are placed at the front of the list, so
    they are consulted first by _parse_date().
    """
    _date_handlers[:0] = [func]
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# NOTE(review): the tail of the comment above and the tail of the template
# list / head of the regex-building list comprehension (presumably
# `_iso8601_re = [tmpl.replace(...`) appear to be missing from this chunk
# (gaps in the source). Code preserved as-is.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                 '-YY-?MM', '-OOO', '-YY',
    # Each template token below is replaced by a named-group sub-pattern:
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    # Optional time-of-day, fractional seconds and timezone designator
    # appended to every date template:
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# Pre-bound match() methods, one compiled regex per template.
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the loop's match/break handling, several `if`/`else` guards, and
    # initialisations such as `weekday` are absent. Code preserved as-is;
    # confirm against the full file.
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        # NOTE(review): the `if m: break` / no-match handling appears to be
        # missing here (gap).
    # A zero-width match means the template matched nothing useful.
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    # NOTE(review): an `if ordinal:` guard appears to be missing (gap).
    ordinal = int(ordinal)
    year = params.get('year', '--')
    if not year or year == '--':
        # No year given: default to the current year.
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    # NOTE(review): an `else: year = int(year)` branch appears missing (gap).
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        # NOTE(review): the ordinal branch body appears to be missing (gap).
        month = time.gmtime()[1]
    # NOTE(review): a `month = int(month)` conversion appears missing (gap).
    day = params.get('day', 0)
    # NOTE(review): the branch handling a present `day` value appears to be
    # missing (gap); the `elif` below is dangling as written.
    elif params.get('century', 0) or \
        params.get('year', 0) or params.get('month', 0):
        # NOTE(review): this branch body (presumably `day = 1`) appears to
        # be missing (gap).
    day = time.gmtime()[2]
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            # NOTE(review): the body of this guard (presumably defaulting
            # the field to 0) appears to be missing (gap).
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    # NOTE(review): a `weekday = 0` initialisation appears missing (gap);
    # `weekday` is otherwise unbound below.
    daylight_savings_flag = -1
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        # NOTE(review): the +/- sign dispatch appears to be missing (gap);
        # originally only one of the add/subtract pairs below would run,
        # converting local time to UTC.
        tm[3] += int(params.get('tzhour', 0))
        tm[4] += int(params.get('tzmin', 0))
        tm[3] -= int(params.get('tzhour', 0))
        tm[4] -= int(params.get('tzmin', 0))
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
# 8-bit date handling routines written by ytrewq1.
# Unicode characters that appear in Korean date strings.
_korean_year = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day = u'\uc77c' # c0cf in euc-kr
_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# OnBlog style: "YYYY<year> MM<month> DD<day> HH:MM:SS".
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# Nate style: "YYYY-MM-DD <am/pm> H:M:S" (1- or 2-digit time fields).
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    # NOTE(review): an `if not m: return` guard appears to be missing here
    # (gap in the source); as written, non-matching input would raise
    # AttributeError on the None match object.
    # Rewrite into W3DTF form with the Korean timezone (+09:00) and
    # delegate to the W3DTF parser.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the `if not m: return` guard, the line binding `ampm` (presumably
    # `ampm = m.group(4)`), and the PM hour adjustment/zero-padding are
    # absent; `ampm` is otherwise unbound below. Code preserved as-is.
    hour = int(m.group(5))
    if (ampm == _korean_pm):
        # NOTE(review): this branch body (presumably `hour += 12`) appears
        # to be missing (gap).
    # Rewrite into W3DTF form with the Korean timezone (+09:00).
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
# NOTE(review): the assignment line (presumably `_mssql_date_re = \`)
# appears to be missing above this compile call (gap in the source); the
# function below references _mssql_date_re.
re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    # NOTE(review): an `if not m: return` guard appears to be missing here
    # (gap in the source).
    # Rewrite into W3DTF form; note the hard-coded +09:00 zone offset,
    # same as the Korean handlers.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
# NOTE(review): the opening line of this dict (presumably
# `_greek_months = {`) appears to be missing here (gap in the source).
# The entries map Greek month abbreviations -- several spelling
# variants each -- to English three-letter names.
    u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
    u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
    u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
    u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
    u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
    u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
    u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
    u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
    u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
    u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
    u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
    u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
    u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
    u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
    u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
    u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
    u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7

# NOTE(review): the closing `}` of the months dict and the opening of the
# weekday dict (presumably `_greek_wdays = {`) appear to be missing here
# (gap). These entries map Greek weekday abbreviations to English names.
    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
# NOTE(review): the closing `}` of the weekday dict appears missing (gap).

# Matches "Weekday, DD Month YYYY HH:MM:SS TZ" with arbitrary (Greek)
# weekday and month tokens.
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    # NOTE(review): an `if not m: return` guard and a try/except around
    # the two dictionary lookups appear to be missing here (gaps in the
    # source); unknown names would raise KeyError as written.
    wday = _greek_wdays[m.group(1)]
    month = _greek_months[m.group(3)]
    # Translate day/month names to English and re-parse as RFC 822.
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
# Maps Hungarian month names to two-digit month numbers.
_hungarian_months = \
    # NOTE(review): the opening `{` of this dict appears to be missing
    # here (gap in the source).
    u'janu\u00e1r': u'01', # e1 in iso-8859-2
    u'febru\u00e1ri': u'02', # e1 in iso-8859-2
    u'm\u00e1rcius': u'03', # e1 in iso-8859-2
    u'\u00e1prilis': u'04', # e1 in iso-8859-2
    u'm\u00e1ujus': u'05', # e1 in iso-8859-2
    u'j\u00fanius': u'06', # fa in iso-8859-2
    u'j\u00falius': u'07', # fa in iso-8859-2
    u'augusztus': u'08',
    u'szeptember': u'09',
    u'okt\u00f3ber': u'10', # f3 in iso-8859-2
    # NOTE(review): the November/December entries and the closing `}`
    # appear to be missing here (gap).

# Matches "YYYY-<month name>-DD THH:MM+HH:MM"-style dates (1- or 2-digit
# day/hour fields, month spelled out).
_hungarian_date_format_re = \
    re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the `if not m: return` guard and the lines deriving `day` and `hour`
    # (presumably zero-padded m.group(3)/m.group(4)) are absent; `day` and
    # `hour` are otherwise unbound below. Code preserved as-is.
    month = _hungarian_months[m.group(2)]
    # Rewrite into W3DTF form (no seconds in this format) and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license. Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
def _parse_date_w3dtf(dateString):
    # Parse a W3C-DTF / ISO-8601-profile date string into a 9-tuple in GMT.
    # NOTE(review): this chunk appears to be missing source lines (gaps)
    # throughout the nested helpers below -- guards, branch bodies and
    # return statements are absent. Code preserved as-is; confirm against
    # the full file.
    def __extract_date(m):
        # Return (year, month, day); handles 2-digit years and Julian
        # (day-of-year) dates.
        year = int(m.group('year'))
        # NOTE(review): a guard (presumably `if year < 100:`) appears to be
        # missing before the century expansion below (gap).
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
        # NOTE(review): a minimum-year check appears to be missing (gap).
        julian = m.group('julian')
        # NOTE(review): an `if julian:` guard appears to be missing (gap).
        julian = int(julian)
        # First guess for the Julian date, then iterate until mktime's
        # normalized day-of-year agrees.
        month = julian / 30 + 1
        day = julian % 30 + 1
        # NOTE(review): a `jday = None` initialisation appears missing (gap).
        while jday != julian:
            t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
            jday = time.gmtime(t)[-2]
            diff = abs(jday - julian)
            # NOTE(review): the day/month correction lines that adjust
            # `day`/`month` using `diff` appear to be missing (gap); the
            # loop cannot converge as written.
        return year, month, day
        # NOTE(review): the non-Julian month/day handling below is
        # unreachable as written; its guards appear to be missing (gap).
        month = m.group('month')
        day = m.group('day')
        return year, month, day

    def __extract_time(m):
        # Return (hours, minutes, seconds), defaulting missing parts to 0.
        # NOTE(review): guards handling an absent match / absent hours and
        # the int() conversion of `hours` appear to be missing (gaps).
        hours = m.group('hours')
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        # NOTE(review): an `if seconds:` / `else: seconds = 0` pair appears
        # to be missing (gap).
        seconds = int(seconds)
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        # NOTE(review): guards for a missing designator and for 'Z' appear
        # to be missing (gaps).
        tzd = m.group('tzd')
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        # NOTE(review): an `if minutes:` / default-to-0 pair appears to be
        # missing (gap).
        minutes = int(minutes)
        offset = (hours*60 + minutes) * 60
        # NOTE(review): the sign handling and return of +/-offset appear to
        # be missing (gap); the function returns None as written.

    # Regex fragments for the date, time and timezone components.
    __date_re = ('(?P<year>\d\d\d\d)'
                 # NOTE(review): the separator sub-pattern line (defining
                 # (?P<dsep>...)) appears to be missing here (gap).
                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
                 '|(?P<julian>\d\d\d)))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
    # NOTE(review): the final line of this expression (presumably
    # concatenating __tzd_re and closing the parenthesis) appears to be
    # missing here (gap).
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # The whole string must match, not just a prefix.
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    # Apply the parsed timezone offset and cancel mktime()'s local
    # timezone bias to produce a GMT time tuple.
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    # Normalizes oddball inputs before handing off to rfc822.parsedate_tz.
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the leading-weekday removal, the 4-token timezone-split logic that
    # binds `s` and `i`, and two guard lines are absent. Code preserved
    # as-is; confirm against the full file.
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        # NOTE(review): this branch body (presumably dropping the weekday
        # token) appears to be missing (gap).
    # NOTE(review): the enclosing branch header and the lines binding `s`
    # and `i` (splitting a combined time+zone token) appear missing (gap).
            data[3:] = [s[:i], s[i+1:]]
        # NOTE(review): an `else:` alternative appears missing (gap).
        dateString = " ".join(data)
    # Account for the Etc/GMT timezone by stripping 'Etc/'
    elif len(data) == 5 and data[4].lower().startswith('etc/'):
        data[4] = data[4][4:]
        dateString = " ".join(data)
    # NOTE(review): a guard (presumably `if len(data) < 5:`) appears to be
    # missing before the date-only default below (gap).
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    # NOTE(review): an `if tm:` guard appears to be missing here (gap).
    return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Fri, 2006/09/15 08:19:53 EDT
    _my_date_pattern = re.compile( \
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    # NOTE(review): .search() returning None (no match) would raise
    # AttributeError on .groups() -- possibly guarded in the full file.
    dow, year, month, day, hour, minute, second, tz = \
        _my_date_pattern.search(aDateString).groups()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # Rewrite into RFC 822 form so rfc822.parsedate_tz can handle it.
    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
    tm = rfc822.parsedate_tz(dateString)
    # NOTE(review): an `if tm:` guard appears to be missing here (gap in
    # the source).
    return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    # Try each registered handler in order; handlers registered later come
    # first (see registerDateHandler).
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the `try:` that pairs with the `except` below, the raise after the
    # 9-tuple length check, the successful `return date9tuple`, and any
    # final fallback return are absent. Code preserved as-is.
    for handler in _date_handlers:
        date9tuple = handler(dateString)
        if not date9tuple: continue
        if len(date9tuple) != 9:
            if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
            # NOTE(review): a raise appears to be missing here (gap).
        # Sanity check: every field must be convertible to int.
        map(int, date9tuple)
        # NOTE(review): a `return date9tuple` appears missing here (gap).
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii. (We now do this.) And also that it
    must always be flagged as non-well-formed. (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible. Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not). CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # initialisations such as `xml_encoding = ''`, the EBCDIC branch's
    # sniffed-encoding assignment, the try/except around the XML
    # declaration regex, and the final `else:` of the content-type
    # dispatch are absent. Code preserved as-is; confirm against the
    # full file.
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration. This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
        # EBCDIC signature
        xml_data = _ebcdic_to_ascii(xml_data)
    elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
        # UTF-16BE, no BOM ("<?" pattern)
        sniffed_xml_encoding = 'utf-16be'
        xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
    elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
        # UTF-16BE with BOM (BOM stripped before decoding)
        sniffed_xml_encoding = 'utf-16be'
        xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
    elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
        # UTF-16LE, no BOM
        sniffed_xml_encoding = 'utf-16le'
        xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
    elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
        # UTF-16LE with BOM
        sniffed_xml_encoding = 'utf-16le'
        xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
    elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
        # UTF-32BE, no BOM
        sniffed_xml_encoding = 'utf-32be'
        xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
    elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
        # UTF-32LE, no BOM
        sniffed_xml_encoding = 'utf-32le'
        xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
    elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        # UTF-32BE with BOM
        sniffed_xml_encoding = 'utf-32be'
        xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
    elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        # UTF-32LE with BOM
        sniffed_xml_encoding = 'utf-32le'
        xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
    elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        # UTF-8 with BOM
        sniffed_xml_encoding = 'utf-8'
        xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
    # Look for an encoding attribute in the XML declaration.
    # NOTE(review): a try/except (and possibly an `else:` for the sniff
    # chain) appears to be partially missing around the next two lines
    # (gap); as written, the match is computed and then unconditionally
    # discarded by the `= None` assignment.
    xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # A BOM-sniffed concrete encoding beats the generic multi-byte
        # family names a document may declare.
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    # Decide the authoritative encoding per RFC 3023 rules (see docstring).
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or 'iso-8859-1'
    # NOTE(review): a final `else:` line appears to be missing here (gap);
    # the default below presumably applied when no other branch matched.
    true_encoding = xml_encoding or 'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == 'gb2312':
        true_encoding = 'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    # NOTE(review): this chunk appears to be missing source lines (gaps):
    # the `if _debug:` guards around the stderr writes, the
    # `data = data[n:]` BOM-stripping assignments, and the UTF-8 branch's
    # `encoding = 'utf-8'` are absent. Code preserved as-is; confirm
    # against the full file.
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        # UTF-16BE BOM: override the declared encoding if it disagrees.
        sys.stderr.write('stripping BOM\n')
        if encoding != 'utf-16be':
            sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        # NOTE(review): `data = data[2:]` appears to be missing here (gap).
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        # UTF-16LE BOM
        sys.stderr.write('stripping BOM\n')
        if encoding != 'utf-16le':
            sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        # NOTE(review): `data = data[2:]` appears to be missing here (gap).
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        # UTF-8 BOM
        sys.stderr.write('stripping BOM\n')
        if encoding != 'utf-8':
            sys.stderr.write('trying utf-8 instead\n')
        # NOTE(review): `encoding = 'utf-8'` and `data = data[3:]` appear
        # to be missing here (gap).
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        # UTF-32BE BOM
        sys.stderr.write('stripping BOM\n')
        if encoding != 'utf-32be':
            sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        # NOTE(review): `data = data[4:]` appears to be missing here (gap).
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        # UTF-32LE BOM
        sys.stderr.write('stripping BOM\n')
        if encoding != 'utf-32le':
            sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        # NOTE(review): `data = data[4:]` appears to be missing here (gap).
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # Replace (or insert) the XML declaration so it declares utf-8.
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    # NOTE(review): an `else:` line appears to be missing here (gap); the
    # prepend below presumably ran only when no declaration was present.
    newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    # Split the document at the first element: `head` holds the prolog
    # (declaration, DOCTYPE, inline entities), `data` holds the rest.
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if doctype.lower().count(_s2bytes('netscape')):
        # NOTE(review): the version-assignment branches (presumably setting
        # `version` to 'rss091n' here and None otherwise) appear to be
        # missing (gap in the source); `version` is otherwise unbound at
        # the return below. Code preserved as-is.

    # only allow in 'safe' inline entity definitions
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        # An entity is "safe" if its value contains no further entity
        # references other than numeric character references.
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        # NOTE(review): an `if safe_entities:` guard appears to be missing
        # here (gap).
        replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data

    # Also return the retained safe entity name/value pairs as a dict.
    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
3621 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
3622 '''Parse a feed from a URL, file, stream, or string.
3624 request_headers, if given, is a dict from http header name to value to add
3625 to the request; this overrides internally generated values.
3627 result = FeedParserDict()
3628 result['feed'] = FeedParserDict()
3629 result['entries'] = []
3632 if not isinstance(handlers, list):
3633 handlers = [handlers]
3635 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3637 except Exception, e:
3639 result['bozo_exception'] = e
3643 if hasattr(f, 'headers'):
3644 result['headers'] = dict(f.headers)
3645 # overwrite existing headers using response_headers
3646 if 'headers' in result:
3647 result['headers'].update(response_headers)
3648 elif response_headers:
3649 result['headers'] = copy.deepcopy(response_headers)
3651 # if feed is gzip-compressed, decompress it
3652 if f and data and 'headers' in result:
3653 if gzip and result['headers'].get('content-encoding') == 'gzip':
3655 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3656 except Exception, e:
3657 # Some feeds claim to be gzipped but they're not, so
3658 # we get garbage. Ideally, we should re-request the
3659 # feed without the 'Accept-encoding: gzip' header,
3662 result['bozo_exception'] = e
3664 elif zlib and result['headers'].get('content-encoding') == 'deflate':
3666 data = zlib.decompress(data, -zlib.MAX_WBITS)
3667 except Exception, e:
3669 result['bozo_exception'] = e
3673 if 'headers' in result:
3674 if 'etag' in result['headers'] or 'ETag' in result['headers']:
3675 etag = result['headers'].get('etag', result['headers'].get('ETag'))
3677 result['etag'] = etag
3678 if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
3679 modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
3681 result['modified'] = _parse_date(modified)
3682 if hasattr(f, 'url'):
3683 result['href'] = f.url
3684 result['status'] = 200
3685 if hasattr(f, 'status'):
3686 result['status'] = f.status
3687 if hasattr(f, 'close'):
3690 # there are four encodings to keep track of:
3691 # - http_encoding is the encoding declared in the Content-Type HTTP header
3692 # - xml_encoding is the encoding declared in the <?xml declaration
3693 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
3694 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3695 http_headers = result.get('headers', {})
3696 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
3697 _getCharacterEncoding(http_headers, data)
3698 if http_headers and (not acceptable_content_type):
3699 if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
3700 bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
3702 bozo_message = 'no Content-type specified'
3704 result['bozo_exception'] = NonXMLContentType(bozo_message)
3706 if data is not None:
3707 result['version'], data, entities = _stripDoctype(data)
3709 # ensure that baseuri is an absolute uri using an acceptable URI scheme
3710 contentloc = http_headers.get('content-location', http_headers.get('Content-Location', ''))
3711 href = result.get('href', '')
3712 baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3714 baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
3716 # if server sent 304, we're done
3717 if result.get('status', 0) == 304:
3718 result['version'] = ''
3719 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3720 'so the server sent no data. This is a feature, not a bug!'
3723 # if there was a problem downloading, we're done
3727 # determine character encoding
3728 use_strict_parser = 0
3730 tried_encodings = []
3731 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3732 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
3733 if not proposed_encoding: continue
3734 if proposed_encoding in tried_encodings: continue
3735 tried_encodings.append(proposed_encoding)
3737 data = _toUTF8(data, proposed_encoding)
3738 known_encoding = use_strict_parser = 1
3742 # if no luck and we have auto-detection library, try that
3743 if (not known_encoding) and chardet:
3745 proposed_encoding = chardet.detect(data)['encoding']
3746 if proposed_encoding and (proposed_encoding not in tried_encodings):
3747 tried_encodings.append(proposed_encoding)
3748 data = _toUTF8(data, proposed_encoding)
3749 known_encoding = use_strict_parser = 1
3752 # if still no luck and we haven't tried utf-8 yet, try that
3753 if (not known_encoding) and ('utf-8' not in tried_encodings):
3755 proposed_encoding = 'utf-8'
3756 tried_encodings.append(proposed_encoding)
3757 data = _toUTF8(data, proposed_encoding)
3758 known_encoding = use_strict_parser = 1
3761 # if still no luck and we haven't tried windows-1252 yet, try that
3762 if (not known_encoding) and ('windows-1252' not in tried_encodings):
3764 proposed_encoding = 'windows-1252'
3765 tried_encodings.append(proposed_encoding)
3766 data = _toUTF8(data, proposed_encoding)
3767 known_encoding = use_strict_parser = 1
3770 # if still no luck and we haven't tried iso-8859-2 yet, try that.
3771 if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
3773 proposed_encoding = 'iso-8859-2'
3774 tried_encodings.append(proposed_encoding)
3775 data = _toUTF8(data, proposed_encoding)
3776 known_encoding = use_strict_parser = 1
3779 # if still no luck, give up
3780 if not known_encoding:
3782 result['bozo_exception'] = CharacterEncodingUnknown( \
3783 'document encoding unknown, I tried ' + \
3784 '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
3785 (result['encoding'], xml_encoding))
3786 result['encoding'] = ''
3787 elif proposed_encoding != result['encoding']:
3789 result['bozo_exception'] = CharacterEncodingOverride( \
3790 'document declared as %s, but parsed as %s' % \
3791 (result['encoding'], proposed_encoding))
3792 result['encoding'] = proposed_encoding
3794 if not _XML_AVAILABLE:
3795 use_strict_parser = 0
3796 if use_strict_parser:
3797 # initialize the SAX parser
3798 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3799 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3800 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3801 saxparser.setContentHandler(feedparser)
3802 saxparser.setErrorHandler(feedparser)
3803 source = xml.sax.xmlreader.InputSource()
3804 source.setByteStream(_StringIO(data))
3805 if hasattr(saxparser, '_ns_stack'):
3806 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
3807 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
3808 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
3810 saxparser.parse(source)
3811 except Exception, e:
3814 traceback.print_stack()
3815 traceback.print_exc()
3816 sys.stderr.write('xml parsing failed\n')
3818 result['bozo_exception'] = feedparser.exc or e
3819 use_strict_parser = 0
3820 if not use_strict_parser:
3821 feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3822 feedparser.feed(data.decode('utf-8', 'replace'))
3823 result['feed'] = feedparser.feeddata
3824 result['entries'] = feedparser.entries
3825 result['version'] = result['version'] or feedparser.version
3826 result['namespaces'] = feedparser.namespacesInUse
def __init__(self, results):
    """Remember the parse() result dict for write() to render later."""
    self.results = results
# Serializer that flattens the parse() result into 'dotted.path=value'
# text lines, recursing through dicts and lists.
# NOTE(review): this excerpt is mangled -- embedded line numbers jump
# (3839 -> 3843, 3847 -> 3850, 3858 end), so the dict-key loop header,
# the list index initialization, and the tail of the value writer are
# missing lines; comments describe visible intent only.
3833 class TextSerializer(Serializer):
3834 def write(self, stream=sys.stdout):
# Kick off the recursive walk with an empty key prefix.
3835 self._writer(stream, self.results, '')
3837 def _writer(self, stream, node, prefix):
# Mapping nodes: recurse into each key (the `for k in keys:` loop header
# is among the missing lines), skipping redundant derived entries.
3839 if hasattr(node, 'keys'):
3843 if k in ('description', 'link'): continue
3844 if node.has_key(k + '_detail'): continue
3845 if node.has_key(k + '_parsed'): continue
3846 self._writer(stream, node[k], prefix + k + '.')
# List nodes: recurse per element with an '[index].' path component
# (the index counter setup/increment lines are not visible).
3847 elif type(node) == types.ListType:
3850 self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
# Leaf nodes: UTF-8 encode and escape backslashes/newlines before
# writing 'prefix=value' (the write of '=' and the value is missing).
3854 s = str(node).encode('utf-8')
3855 s = s.replace('\\', '\\\\')
3856 s = s.replace('\r', '')
3857 s = s.replace('\n', r'\n')
3858 stream.write(prefix[:-1])
class PprintSerializer(Serializer):
    """Serializer that pretty-prints the whole parse() result dict.

    When the result carries an 'href' (the final URL the feed was
    fetched from), it is written first, followed by a blank line.
    """
    def write(self, stream=sys.stdout):
        # 'in' instead of the deprecated dict.has_key() -- identical
        # behavior on Python 2.4+, and forward-compatible with Python 3.
        if 'href' in self.results:
            stream.write(self.results['href'] + '\n\n')
        from pprint import pprint
        pprint(self.results, stream)
# Command-line entry point: parse each URL/filename argument and dump the
# result with the chosen serializer.
# NOTE(review): this excerpt is mangled -- embedded line numbers jump
# (3875 -> 3880, 3888 -> 3892, etc.), so the Zope-compatibility argument
# handling, the no-arguments exit path, the _Options class/copy loop, and
# the `for url in urls:` loop header are missing lines.
3873 if __name__ == '__main__':
3875 from optparse import OptionParser
# Build the CLI: fetch-related options mirror parse()'s keyword arguments.
3880 optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
3881 optionParser.set_defaults(format="pprint")
3882 optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
3883 optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
3884 optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
3885 optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
3886 optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
3887 optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
3888 (options, urls) = optionParser.parse_args()
# Presumably reached when no URLs were supplied -- show usage and exit
# (the guarding condition is among the missing lines).
3892 optionParser.print_help()
3895 if not sys.argv[1:]:
3899 etag = modified = agent = referrer = None
# _Options is defined on lines not visible in this excerpt.
3901 options = _Options()
3904 zopeCompatibilityHack()
# Pick TextSerializer/PprintSerializer by the --format value, falling
# back to the base Serializer for unknown formats.
3906 serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
# Per-URL parse-and-dump (the `for url in urls:` header is missing here).
3908 results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
3909 serializer(results).write(sys.stdout)