extra/feedparser.py

   1 #!/usr/bin/env python
   2 """Universal feed parser
   3
   4 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
   5
   6 Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
   7
   8 Things it handles that choke other parsers:
   9 - bastard combinations of RSS 0.9x and RSS 1.0
  10 - illegal 8-bit XML characters
  11 - naked and/or invalid HTML in description
  12 - content:encoded, xhtml:body, fullitem
  13 - guid
  14 - elements in non-standard namespaces or non-default namespaces
  15 - multiple content items per entry (Atom)
  16 - multiple links per entry (Atom)
  17
  18 Other features:
  19 - resolves relative URIs in some elements
  20   - uses xml:base to define base URI
  21   - uses URI of feed if no xml:base is given
  22   - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri
  23 - resolves relative URIs within embedded markup
  24   - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris
  25 - sanitizes embedded markup in some elements
  26   - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements
  27   - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes
  28   - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup
  29   - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = []
  30 - optionally tidies embedded markup
  31   - fixes malformed HTML
  32   - converts to XHTML
  33   - converts character entities to numeric entities
  34   - requires mxTidy <http://www.lemburg.com/files/python/mxTidy.html>
  35
  36 Required: Python 2.1 or later
  37 Recommended: Python 2.3 or later
  38 """
  39
  40 __version__ = "3.0-beta-14"
  41 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
  42 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
  43 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
  44                     "John Beimler <http://john.beimler.org/>",
  45                     "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
  46 __license__ = "Python"
  47 _debug = 0
  48
  49 # if you are embedding feedparser in a larger application, you should change this to your application name and URL
  50 USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
  51
  52 # ---------- required modules (should come with any Python distribution) ----------
  53 import sgmllib, re, sys, copy, urlparse, time, rfc822
  54 try:
  55     from cStringIO import StringIO as _StringIO
  56 except:
  57     from StringIO import StringIO as _StringIO
  58
  59 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
  60
  61 # gzip is included with most Python distributions, but may not be available if you compiled your own
  62 try:
  63     import gzip
  64 except:
  65     gzip = None
  66
  67 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
  68 # Python 2.3 now has this functionality available in the standard socket library, so under
  69 # 2.3 you don't need to install anything.
  70 import socket
  71 if hasattr(socket, 'setdefaulttimeout'):
  72     socket.setdefaulttimeout(10)
  73 else:
  74     try:
  75         import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
  76         timeoutsocket.setDefaultSocketTimeout(10)
  77     except ImportError:
  78         pass
  79 import urllib2
  80
  81 # mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
  82 # this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
  83 try:
  84     from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
  85 except:
  86     _mxtidy = None
  87
  88 # If a real XML parser is available, feedparser will attempt to use it.  feedparser works
  89 # with both the built-in SAX parser and PyXML SAX parser.  On platforms where the Python
  90 # distribution does not come with an XML parser (such as Mac OS X 10.2 and some versions of
  91 # FreeBSD), feedparser will just fall back on regex-based parsing.  If XML libraries are
  92 # available but the feed turns out not to be well-formed XML, feedparser will fall back
  93 # on regex-based parsing and set the "bozo" bit in the results to indicate that the feed
  94 # author is a bozo who can't generate well-formed XML.  The two advantages of using a real
  95 # XML parser are (1) Unicode support, and (2) to get people to stop yelling at me for not
  96 # using one.
  97 try:
  98     import xml.sax
  99     from xml.sax.saxutils import escape as xmlescape
 100     _XML_AVAILABLE = 1
 101 except:
 102     _XML_AVAILABLE = 0
 103     def xmlescape(data):
 104         data = data.replace("&", "&amp;")
 105         data = data.replace(">", "&gt;")
 106         data = data.replace("<", "&lt;")
 107         return data
 108
 109 # base64 support for Atom feeds that contain embedded binary data
 110 try:
 111     import base64, binascii
 112 except:
 113     base64 = binascii = None
 114
 115 # ---------- don't touch these ----------
 116 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 117 sgmllib.special = re.compile('<!')
 118
 119 SUPPORTED_VERSIONS = {'': 'unknown',
 120                       'rss090': 'RSS 0.90',
 121                       'rss091n': 'RSS 0.91 (Netscape)',
 122                       'rss091u': 'RSS 0.91 (Userland)',
 123                       'rss092': 'RSS 0.92',
 124                       'rss093': 'RSS 0.93',
 125                       'rss094': 'RSS 0.94',
 126                       'rss20': 'RSS 2.0',
 127                       'rss10': 'RSS 1.0',
 128                       'rss': 'RSS (unknown version)',
 129                       'atom01': 'Atom 0.1',
 130                       'atom02': 'Atom 0.2',
 131                       'atom03': 'Atom 0.3',
 132                       'atom': 'Atom (unknown version)'
 133                       }
 134
 135 try:
 136     dict
 137 except NameError:
 138     # Python 2.1 does not have a built-in dict() function
 139     def dict(aList):
 140         rc = {}
 141         for k, v in aList:
 142             rc[k] = v
 143         return rc
 144
 145 class _FeedParserMixin:
 146     namespaces = {"http://backend.userland.com/rss": "",
 147                   "http://blogs.law.harvard.edu/tech/rss": "",
 148                   "http://purl.org/rss/1.0/": "",
 149                   "http://example.com/newformat#": "",
 150                   "http://example.com/necho": "",
 151                   "http://purl.org/echo/": "",
 152                   "uri/of/echo/namespace#": "",
 153                   "http://purl.org/pie/": "",
 154                   "http://purl.org/atom/ns#": "",
 155                   "http://purl.org/rss/1.0/modules/rss091#": "",
 156
 157                   "http://webns.net/mvcb/":                               "admin",
 158                   "http://purl.org/rss/1.0/modules/aggregation/":         "ag",
 159                   "http://purl.org/rss/1.0/modules/annotate/":            "annotate",
 160                   "http://media.tangent.org/rss/1.0/":                    "audio",
 161                   "http://backend.userland.com/blogChannelModule":        "blogChannel",
 162                   "http://web.resource.org/cc/":                          "cc",
 163                   "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
 164                   "http://purl.org/rss/1.0/modules/company":              "co",
 165                   "http://purl.org/rss/1.0/modules/content/":             "content",
 166                   "http://my.theinfo.org/changed/1.0/rss/":               "cp",
 167                   "http://purl.org/dc/elements/1.1/":                     "dc",
 168                   "http://purl.org/dc/terms/":                            "dcterms",
 169                   "http://purl.org/rss/1.0/modules/email/":               "email",
 170                   "http://purl.org/rss/1.0/modules/event/":               "ev",
 171                   "http://postneo.com/icbm/":                             "icbm",
 172                   "http://purl.org/rss/1.0/modules/image/":               "image",
 173                   "http://xmlns.com/foaf/0.1/":                           "foaf",
 174                   "http://freshmeat.net/rss/fm/":                         "fm",
 175                   "http://purl.org/rss/1.0/modules/link/":                "l",
 176                   "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
 177                   "http://prismstandard.org/namespaces/1.2/basic/":       "prism",
 178                   "http://www.w3.org/1999/02/22-rdf-syntax-ns#":          "rdf",
 179                   "http://www.w3.org/2000/01/rdf-schema#":                "rdfs",
 180                   "http://purl.org/rss/1.0/modules/reference/":           "ref",
 181                   "http://purl.org/rss/1.0/modules/richequiv/":           "reqv",
 182                   "http://purl.org/rss/1.0/modules/search/":              "search",
 183                   "http://purl.org/rss/1.0/modules/slash/":               "slash",
 184                   "http://purl.org/rss/1.0/modules/servicestatus/":       "ss",
 185                   "http://hacks.benhammersley.com/rss/streaming/":        "str",
 186                   "http://purl.org/rss/1.0/modules/subscription/":        "sub",
 187                   "http://purl.org/rss/1.0/modules/syndication/":         "sy",
 188                   "http://purl.org/rss/1.0/modules/taxonomy/":            "taxo",
 189                   "http://purl.org/rss/1.0/modules/threading/":           "thr",
 190                   "http://purl.org/rss/1.0/modules/textinput/":           "ti",
 191                   "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
 192                   "http://wellformedweb.org/CommentAPI/":                 "wfw",
 193                   "http://purl.org/rss/1.0/modules/wiki/":                "wiki",
 194                   "http://schemas.xmlsoap.org/soap/envelope/":            "soap",
 195                   "http://www.w3.org/1999/xhtml":                         "xhtml",
 196                   "http://www.w3.org/XML/1998/namespace":                 "xml"
 197 }
 198
 199     can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
 200     can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
 201     can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
 202     html_types = ['text/html', 'application/xhtml+xml']
 203
 204     def __init__(self, baseuri=None):
 205         if _debug: sys.stderr.write("initializing FeedParser\n")
 206         self.channel = {} # channel- or feed-level data
 207         self.items = [] # list of item- or entry-level data
 208         self.version = '' # feed type/version, see SUPPORTED_VERSIONS
 209
 210         # the following are used internally to track state;
 211         # some of this is kind of out of control and should
 212         # probably be refactored into a finite state machine
 213         self.inchannel = 0
 214         self.initem = 0
 215         self.incontent = 0
 216         self.intextinput = 0
 217         self.inimage = 0
 218         self.inauthor = 0
 219         self.incontributor = 0
 220         self.contentparams = {}
 221         self.namespacemap = {}
 222         self.elementstack = []
 223         self.basestack = []
 224         self.langstack = []
 225         self.baseuri = baseuri or ''
 226         self.lang = None
 227
 228     def unknown_starttag(self, tag, attrs):
 229         if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
 230         # normalize attrs
 231         attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
 232         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
 233
 234         # track xml:base and xml:lang
 235         attrsD = dict(attrs)
 236         baseuri = attrsD.get('xml:base')
 237         if baseuri:
 238             if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
 239             self.baseuri = baseuri
 240         lang = attrsD.get('xml:lang')
 241         if lang:
 242             self.lang = lang
 243         self.basestack.append(baseuri)
 244         self.langstack.append(lang)
 245
 246         # track namespaces
 247         for prefix, uri in attrs:
 248             if prefix.startswith('xmlns:'):
 249                 self.trackNamespace(prefix[6:], uri)
 250             elif prefix == 'xmlns':
 251                 self.trackNamespace(None, uri)
 252
 253         # track inline content
 254         if self.incontent and self.contentparams.get('mode') == 'escaped':
 255             # element declared itself as escaped markup, but it isn't really
 256             self.contentparams['mode'] = 'xml'
 257         if self.incontent and self.contentparams.get('mode') == 'xml':
 258             # Note: probably shouldn't simply recreate localname here, but
 259             # our namespace handling isn't actually 100% correct in cases where
 260             # the feed redefines the default namespace (which is actually
 261             # the usual case for inline content, thanks Sam), so here we
 262             # cheat and just reconstruct the element based on localname
 263             # because that compensates for the bugs in our namespace handling.
 264             # This will horribly munge inline content with non-empty qnames,
 265             # but nobody actually does that, so I'm not fixing it.
 266             tag = tag.split(':')[-1]
 267             return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
 268
 269         # match namespaces
 270         try:
 271             prefix, suffix = tag.split(':', 1)
 272         except ValueError:
 273             prefix, suffix = '', tag
 274         prefix = self.namespacemap.get(prefix, prefix)
 275         if prefix:
 276             prefix = prefix + '_'
 277
 278         # call special handler (if defined) or default handler
 279         methodname = '_start_' + prefix + suffix
 280         try:
 281             method = getattr(self, methodname)
 282             return method(attrsD)
 283         except AttributeError:
 284             return self.push(prefix + suffix, 1)
 285
 286     def unknown_endtag(self, tag):
 287         if _debug: sys.stderr.write('end %s\n' % tag)
 288         # match namespaces
 289         try:
 290             prefix, suffix = tag.split(':', 1)
 291         except ValueError:
 292             prefix, suffix = '', tag
 293         prefix = self.namespacemap.get(prefix, prefix)
 294         if prefix:
 295             prefix = prefix + '_'
 296
 297         # call special handler (if defined) or default handler
 298         methodname = '_end_' + prefix + suffix
 299         try:
 300             method = getattr(self, methodname)
 301             method()
 302         except AttributeError:
 303             self.pop(prefix + suffix)
 304
 305         # track inline content
 306         if self.incontent and self.contentparams.get('mode') == 'escaped':
 307             # element declared itself as escaped markup, but it isn't really
 308             self.contentparams['mode'] = 'xml'
 309         if self.incontent and self.contentparams.get('mode') == 'xml':
 310             tag = tag.split(':')[-1]
 311             self.handle_data("</%s>" % tag, escape=0)
 312
 313         # track xml:base and xml:lang going out of scope
 314         if self.basestack:
 315             self.basestack.pop()
 316             if self.basestack and self.basestack[-1]:
 317                 baseuri = self.basestack[-1]
 318                 if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
 319                 self.baseuri = baseuri
 320         if self.langstack:
 321             lang = self.langstack.pop()
 322             if lang:
 323                 self.lang = lang
 324
 325     def handle_charref(self, ref):
 326         # called for each character reference, e.g. for "&#160;", ref will be "160"
 327         # Reconstruct the original character reference.
 328         if not self.elementstack: return
 329         text = "&#%s;" % ref
 330         self.elementstack[-1][2].append(text)
 331
 332     def handle_entityref(self, ref):
 333         # called for each entity reference, e.g. for "&copy;", ref will be "copy"
 334         # Reconstruct the original entity reference.
 335         if not self.elementstack: return
 336         text = "&%s;" % ref
 337         self.elementstack[-1][2].append(text)
 338
 339     def handle_data(self, text, escape=1):
 340         # called for each block of plain text, i.e. outside of any tag and
 341         # not containing any character or entity references
 342         if not self.elementstack: return
 343         if escape and self.contentparams.get('mode') == 'xml':
 344             text = xmlescape(text)
 345         self.elementstack[-1][2].append(text)
 346
 347     def handle_comment(self, text):
 348         # called for each comment, e.g. <!-- insert message here -->
 349         pass
 350
 351     def handle_pi(self, text):
 352         # called for each processing instruction, e.g. <?instruction>
 353         pass
 354
 355     def handle_decl(self, text):
 356         # called for the DOCTYPE, if present, e.g.
 357         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 358         #     "http://www.w3.org/TR/html4/loose.dtd">
 359         if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'):
 360             self.version = 'rss091n'
 361
 362     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
 363     def _scan_name(self, i, declstartpos):
 364         rawdata = self.rawdata
 365         n = len(rawdata)
 366         if i == n:
 367             return None, -1
 368         m = self._new_declname_match(rawdata, i)
 369         if m:
 370             s = m.group()
 371             name = s.strip()
 372             if (i + len(s)) == n:
 373                 return None, -1  # end of buffer
 374             return name.lower(), m.end()
 375         else:
 376             self.updatepos(declstartpos, i)
 377             self.error("expected name token")
 378
 379     def parse_declaration(self, i):
 380         # override internal declaration handler to handle CDATA blocks
 381         if _debug: sys.stderr.write("entering parse_declaration\n")
 382         if re.search(r'^<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', self.rawdata[i:]):
 383             if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
 384             self.version = 'rss091n'
 385         if self.rawdata[i:i+9] == '<![CDATA[':
 386             k = self.rawdata.find(']]>', i)
 387             if k == -1: k = len(self.rawdata)
 388             self.handle_data(xmlescape(self.rawdata[i+9:k]), 0)
 389             return k+3
 390         else:
 391             k = self.rawdata.find('>', i)
 392             return k+1
 393
 394     def trackNamespace(self, prefix, uri):
 395         if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
 396             self.version = 'rss090'
 397         if not prefix: return
 398         if uri.find('backend.userland.com/rss') <> -1:
 399             # match any backend.userland.com namespace
 400             uri = 'http://backend.userland.com/rss'
 401         if self.namespaces.has_key(uri):
 402             self.namespacemap[prefix] = self.namespaces[uri]
 403
 404     def resolveURI(self, uri):
 405         return urlparse.urljoin(self.baseuri or '', uri)
 406
 407     def decodeEntities(self, element, data):
 408         if self.contentparams.get('mode') == 'escaped':
 409             data = data.replace('&lt;', '<')
 410             data = data.replace('&gt;', '>')
 411             data = data.replace('&amp;', '&')
 412             data = data.replace('&quot;', '"')
 413             data = data.replace('&apos;', "'")
 414         return data
 415
 416     def push(self, element, expectingText):
 417 #        print 'push', element, expectingText
 418 #        while self.elementstack and self.elementstack[-1][1]:
 419 #            self.pop(self.elementstack[-1][0])
 420         self.elementstack.append([element, expectingText, []])
 421
 422     def pop(self, element):
 423 #        print 'pop', element
 424         if not self.elementstack: return
 425 #        while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0])
 426         if self.elementstack[-1][0] != element: return
 427
 428         element, expectingText, pieces = self.elementstack.pop()
 429         output = "".join(pieces)
 430         output = output.strip()
 431         if not expectingText: return output
 432
 433         # decode base64 content
 434         if self.contentparams.get('mode') == 'base64' and base64:
 435             try:
 436                 output = base64.decodestring(output)
 437             except binascii.Error:
 438                 pass
 439             except binascii.Incomplete:
 440                 pass
 441
 442         # resolve relative URIs
 443         if (element in self.can_be_relative_uri) and output:
 444             output = self.resolveURI(output)
 445
 446         # decode entities within embedded markup
 447         output = self.decodeEntities(element, output)
 448
 449         # resolve relative URIs within embedded markup
 450         if element in self.can_contain_relative_uris:
 451             output = _resolveRelativeURIs(output, self.baseuri)
 452
 453         # sanitize embedded markup
 454         if element in self.can_contain_dangerous_markup:
 455             output = _sanitizeHTML(output)
 456
 457         # store output in appropriate place(s)
 458         if self.initem:
 459             if element == 'content':
 460                 self.items[-1].setdefault(element, [])
 461                 contentparams = copy.deepcopy(self.contentparams)
 462                 contentparams['value'] = output
 463                 self.items[-1][element].append(contentparams)
 464             elif element == 'category':
 465                 self.items[-1][element] = output
 466                 domain = self.items[-1]['categories'][-1][0]
 467                 self.items[-1]['categories'][-1] = (domain, output)
 468             elif element == 'source':
 469                 self.items[-1]['source']['value'] = output
 470             elif element == 'link':
 471                 self.items[-1][element] = output
 472                 if output:
 473                     self.items[-1]['links'][-1]['href'] = output
 474             else:
 475                 if self.incontent and element != 'description':
 476                     contentparams = copy.deepcopy(self.contentparams)
 477                     contentparams['value'] = output
 478                     self.items[-1][element + '_detail'] = contentparams
 479                 self.items[-1][element] = output
 480         elif self.inchannel and (not self.intextinput) and (not self.inimage):
 481             if element == 'category':
 482                 domain = self.channel['categories'][-1][0]
 483                 self.channel['categories'][-1] = (domain, output)
 484             elif element == 'link':
 485                 self.channel['links'][-1]['href'] = output
 486             else:
 487                 if self.incontent and element != 'description':
 488                     contentparams = copy.deepcopy(self.contentparams)
 489                     contentparams['value'] = output
 490                     self.channel[element + '_detail'] = contentparams
 491             self.channel[element] = output
 492         return output
 493
 494     def _mapToStandardPrefix(self, name):
 495         colonpos = name.find(':')
 496         if colonpos <> -1:
 497             prefix = name[:colonpos]
 498             suffix = name[colonpos+1:]
 499             prefix = self.namespacemap.get(prefix, prefix)
 500             name = prefix + ':' + suffix
 501         return name
 502
 503     def _getAttribute(self, attrsD, name):
 504         return attrsD.get(self._mapToStandardPrefix(name))
 505
 506     def _save(self, key, value):
 507         if value:
 508             if self.initem:
 509                 self.items[-1].setdefault(key, value)
 510             elif self.channel:
 511                 self.channel.setdefault(key, value)
 512
 513     def _start_rss(self, attrsD):
 514         versionmap = {'0.91': 'rss091u',
 515                       '0.92': 'rss092',
 516                       '0.93': 'rss093',
 517                       '0.94': 'rss094'}
 518         if not self.version:
 519             attr_version = attrsD.get('version', '')
 520             version = versionmap.get(attr_version)
 521             if version:
 522                 self.version = version
 523             elif attr_version.startswith('2.'):
 524                 self.version = 'rss20'
 525             else:
 526                 self.version = 'rss'
 527
 528     def _start_channel(self, attrsD):
 529         self.inchannel = 1
 530
 531     def _start_feed(self, attrsD):
 532         self.inchannel = 1
 533         versionmap = {'0.1': 'atom01',
 534                       '0.2': 'atom02',
 535                       '0.3': 'atom03'}
 536         if not self.version:
 537             attr_version = attrsD.get('version')
 538             version = versionmap.get(attr_version)
 539             if version:
 540                 self.version = version
 541             else:
 542                 self.version = 'atom'
 543
 544     def _end_channel(self):
 545         self.inchannel = 0
 546     _end_feed = _end_channel
 547
 548     def _start_image(self, attrsD):
 549         self.inimage = 1
 550
 551     def _end_image(self):
 552         self.inimage = 0
 553
 554     def _start_textinput(self, attrsD):
 555         self.intextinput = 1
 556     _start_textInput = _start_textinput
 557
 558     def _end_textinput(self):
 559         self.intextinput = 0
 560     _end_textInput = _end_textinput
 561
 562     def _start_author(self, attrsD):
 563         self.inauthor = 1
 564         self.push('author', 1)
 565     _start_managingeditor = _start_author
 566     _start_dc_author = _start_author
 567     _start_dc_creator = _start_author
 568
 569     def _end_author(self):
 570         self.pop('author')
 571         self.inauthor = 0
 572         self._sync_author_detail()
 573     _end_managingeditor = _end_author
 574     _end_dc_author = _end_author
 575     _end_dc_creator = _end_author
 576
 577     def _start_contributor(self, attrsD):
 578         self.incontributor = 1
 579         context = self._getContext()
 580         context.setdefault('contributors', [])
 581         context['contributors'].append({})
 582         self.push('contributor', 0)
 583
 584     def _end_contributor(self):
 585         self.pop('contributor')
 586         self.incontributor = 0
 587
 588     def _start_name(self, attrsD):
 589         self.push('name', 0)
 590
 591     def _end_name(self):
 592         value = self.pop('name')
 593         if self.inauthor:
 594             self._save_author('name', value)
 595         elif self.incontributor:
 596             self._save_contributor('name', value)
 597             pass
 598         elif self.intextinput:
 599             # TODO
 600             pass
 601
 602     def _start_url(self, attrsD):
 603         self.push('url', 0)
 604     _start_homepage = _start_url
 605     _start_uri = _start_url
 606
 607     def _end_url(self):
 608         value = self.pop('url')
 609         if self.inauthor:
 610             self._save_author('url', value)
 611         elif self.incontributor:
 612             self._save_contributor('url', value)
 613         elif self.inimage:
 614             # TODO
 615             pass
 616         elif self.intextinput:
 617             # TODO
 618             pass
 619     _end_homepage = _end_url
 620     _end_uri = _end_url
 621
 622     def _start_email(self, attrsD):
 623         self.push('email', 0)
 624
 625     def _end_email(self):
 626         value = self.pop('email')
 627         if self.inauthor:
 628             self._save_author('email', value)
 629         elif self.incontributor:
 630             self._save_contributor('email', value)
 631             pass
 632         elif self.inimage:
 633             # TODO
 634             pass
 635         elif self.intextinput:
 636             # TODO
 637             pass
 638
 639     def _getContext(self):
 640         if self.initem:
 641             context = self.items[-1]
 642         else:
 643             context = self.channel
 644         return context
 645
 646     def _save_author(self, key, value):
 647         context = self._getContext()
 648         context.setdefault('author_detail', {})
 649         context['author_detail'][key] = value
 650         self._sync_author_detail()
 651
 652     def _save_contributor(self, key, value):
 653         context = self._getContext()
 654         context.setdefault('contributors', [{}])
 655         context['contributors'][-1][key] = value
 656
 657     def _sync_author_detail(self):
 658         context = self._getContext()
 659         detail = context.get('author_detail')
 660         if detail:
 661             name = detail.get('name')
 662             email = detail.get('email')
 663             if name and email:
 664                 context['author'] = "%s (%s)" % (name, email)
 665             elif name:
 666                 context['author'] = name
 667             elif email:
 668                 context['author'] = email
 669         else:
 670             author = context.get('author')
 671             if not author: return
 672             emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
 673             if not emailmatch: return
 674             email = emailmatch.group(0)
 675             author = author.replace(email, '')
 676             author = author.replace('()', '')
 677             author = author.strip()
 678             context.setdefault('author_detail', {})
 679             context['author_detail']['name'] = author
 680             context['author_detail']['email'] = email
 681
 682     def _start_tagline(self, attrsD):
 683         self.incontent += 1
 684         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 685                               'type': attrsD.get('type', 'text/plain'),
 686                               'language': attrsD.get('xml:lang', self.lang),
 687                               'base': attrsD.get('xml:base', self.baseuri)}
 688         self.push('tagline', 1)
 689     _start_subtitle = _start_tagline
 690
 691     def _end_tagline(self):
 692         value = self.pop('tagline')
 693         self.incontent -= 1
 694         self.contentparams.clear()
 695         if self.inchannel:
 696             self.channel['description'] = value
 697     _end_subtitle = _end_tagline
 698
 699     def _start_copyright(self, attrsD):
 700         self.incontent += 1
 701         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 702                               'type': attrsD.get('type', 'text/plain'),
 703                               'language': attrsD.get('xml:lang', self.lang),
 704                               'base': attrsD.get('xml:base', self.baseuri)}
 705         self.push('copyright', 1)
 706     _start_dc_rights = _start_copyright
 707
 708     def _end_copyright(self):
 709         self.pop('copyright')
 710         self.incontent -= 1
 711         self.contentparams.clear()
 712     _end_dc_rights = _end_copyright
 713
 714     def _start_item(self, attrsD):
 715         self.items.append({})
 716         self.push('item', 0)
 717         self.initem = 1
 718     _start_entry = _start_item
 719
 720     def _end_item(self):
 721         self.pop('item')
 722         self.initem = 0
 723     _end_entry = _end_item
 724
 725     def _start_dc_language(self, attrsD):
 726         self.push('language', 1)
 727     _start_language = _start_dc_language
 728
 729     def _end_dc_language(self):
 730         self.lang = self.pop('language')
 731     _end_language = _end_dc_language
 732
 733     def _start_dc_publisher(self, attrsD):
 734         self.push('publisher', 1)
 735     _start_webmaster = _start_dc_publisher
 736
 737     def _end_dc_publisher(self):
 738         self.pop('publisher')
 739     _end_webmaster = _end_dc_publisher
 740
 741     def _start_dcterms_issued(self, attrsD):
 742         self.push('issued', 1)
 743     _start_issued = _start_dcterms_issued
 744
 745     def _end_dcterms_issued(self):
 746         value = self.pop('issued')
 747         self._save('issued_parsed', _parse_date(value))
 748     _end_issued = _end_dcterms_issued
 749
 750     def _start_dcterms_created(self, attrsD):
 751         self.push('created', 1)
 752     _start_created = _start_dcterms_created
 753
 754     def _end_dcterms_created(self):
 755         value = self.pop('created')
 756         self._save('created_parsed', _parse_date(value))
 757     _end_created = _end_dcterms_created
 758
 759     def _start_dcterms_modified(self, attrsD):
 760         self.push('modified', 1)
 761     _start_modified = _start_dcterms_modified
 762     _start_dc_date = _start_dcterms_modified
 763     _start_pubdate = _start_dcterms_modified
 764
 765     def _end_dcterms_modified(self):
 766         value = self.pop('modified')
 767         parsed_value = _parse_date(value)
 768         self._save('date', value)
 769         self._save('date_parsed', parsed_value)
 770         self._save('modified_parsed', parsed_value)
 771     _end_modified = _end_dcterms_modified
 772     _end_dc_date = _end_dcterms_modified
 773     _end_pubdate = _end_dcterms_modified
 774
 775     def _start_expirationdate(self, attrsD):
 776         self.push('expired', 1)
 777
 778     def _end_expirationdate(self):
 779         self._save('expired_parsed', _parse_date(self.pop('expired')))
 780
 781     def _start_cc_license(self, attrsD):
 782         self.push('license', 1)
 783         value = self._getAttribute(attrsD, 'rdf:resource')
 784         if value:
 785             self.elementstack[-1][2].append(value)
 786         self.pop('license')
 787
 788     def _start_creativecommons_license(self, attrsD):
 789         self.push('license', 1)
 790
 791     def _end_creativecommons_license(self):
 792         self.pop('license')
 793
 794     def _start_category(self, attrsD):
 795         self.push('category', 1)
 796         domain = self._getAttribute(attrsD, 'domain')
 797         cats = []
 798         if self.initem:
 799             cats = self.items[-1].setdefault('categories', [])
 800         elif self.inchannel:
 801             cats = self.channel.setdefault('categories', [])
 802         cats.append((domain, None))
 803     _start_dc_subject = _start_category
 804
 805     def _end_category(self):
 806         self.pop('category')
 807     _end_dc_subject = _end_category
 808
 809     def _start_cloud(self, attrsD):
 810         self.channel['cloud'] = attrsD
 811
 812     def _start_link(self, attrsD):
 813         attrsD.setdefault('rel', 'alternate')
 814         attrsD.setdefault('type', 'text/html')
 815         if attrsD.has_key('href'):
 816             attrsD['href'] = self.resolveURI(attrsD['href'])
 817         expectingText = self.inchannel or self.initem
 818         if self.initem:
 819             self.items[-1].setdefault('links', [])
 820             self.items[-1]['links'].append(attrsD)
 821         elif self.inchannel:
 822             self.channel.setdefault('links', [])
 823             self.channel['links'].append(attrsD)
 824         if attrsD.has_key('href'):
 825             expectingText = 0
 826             if attrsD.get('type', '') in self.html_types:
 827                 if self.initem:
 828                     self.items[-1]['link'] = attrsD['href']
 829                 elif self.inchannel:
 830                     self.channel['link'] = attrsD['href']
 831         else:
 832             self.push('link', expectingText)
 833
 834     def _start_guid(self, attrsD):
 835         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
 836         self.push('guid', 1)
 837
 838     def _end_guid(self):
 839         value = self.pop('guid')
 840         self._save('id', value)
 841         if self.guidislink:
 842             # guid acts as link, but only if "ispermalink" is not present or is "true",
 843             # and only if the item doesn't already have a link element
 844             self._save('link', value)
 845
 846     def _start_id(self, attrsD):
 847         self.push('id', 1)
 848
 849     def _end_id(self):
 850         value = self.pop('id')
 851         self._save('guid', value)
 852
 853     def _start_title(self, attrsD):
 854         self.incontent += 1
 855         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 856                               'type': attrsD.get('type', 'text/plain'),
 857                               'language': attrsD.get('xml:lang', self.lang),
 858                               'base': attrsD.get('xml:base', self.baseuri)}
 859         self.push('title', self.inchannel or self.initem)
 860     _start_dc_title = _start_title
 861
 862     def _end_title(self):
 863         self.pop('title')
 864         self.incontent -= 1
 865         self.contentparams.clear()
 866     _end_dc_title = _end_title
 867
 868     def _start_description(self, attrsD):
 869         self.incontent += 1
 870         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 871                               'type': attrsD.get('type', 'text/html'),
 872                               'language': attrsD.get('xml:lang', self.lang),
 873                               'base': attrsD.get('xml:base', self.baseuri)}
 874         self.push('description', self.inchannel or self.initem)
 875
 876     def _end_description(self):
 877         value = self.pop('description')
 878         if self.initem:
 879             self.items[-1]['summary'] = value
 880         elif self.inchannel:
 881             self.channel['tagline'] = value
 882         self.incontent -= 1
 883         self.contentparams.clear()
 884
 885     def _start_info(self, attrsD):
 886         self.incontent += 1
 887         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 888                               'type': attrsD.get('type', 'text/plain'),
 889                               'language': attrsD.get('xml:lang', self.lang),
 890                               'base': attrsD.get('xml:base', self.baseuri)}
 891         self.push('info', 1)
 892
 893     def _end_info(self):
 894         self.pop('info')
 895         self.incontent -= 1
 896         self.contentparams.clear()
 897
 898     def _start_generator(self, attrsD):
 899         if attrsD:
 900             self.channel['generator_detail'] = attrsD
 901         self.push('generator', 1)
 902
 903     def _end_generator(self):
 904         value = self.pop('generator')
 905         if self.channel.has_key('generator_detail'):
 906             self.channel['generator_detail']['name'] = value
 907
 908     def _start_admin_generatoragent(self, attrsD):
 909         self.push('generator', 1)
 910         value = self._getAttribute(attrsD, 'rdf:resource')
 911         if value:
 912             self.elementstack[-1][2].append(value)
 913         self.pop('generator')
 914
 915     def _start_admin_errorreportsto(self, attrsD):
 916         self.push('errorreportsto', 1)
 917         value = self._getAttribute(attrsD, 'rdf:resource')
 918         if value:
 919             self.elementstack[-1][2].append(value)
 920         self.pop('errorreportsto')
 921
 922     def _start_summary(self, attrsD):
 923         self.incontent += 1
 924         self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
 925                               'type': attrsD.get('type', 'text/plain'),
 926                               'language': attrsD.get('xml:lang', self.lang),
 927                               'base': attrsD.get('xml:base', self.baseuri)}
 928         self.push('summary', 1)
 929
 930     def _end_summary(self):
 931         value = self.pop('summary')
 932         if self.items:
 933             self.items[-1]['description'] = value
 934         self.incontent -= 1
 935         self.contentparams.clear()
 936
 937     def _start_enclosure(self, attrsD):
 938         if self.initem:
 939             self.items[-1].setdefault('enclosures', [])
 940             self.items[-1]['enclosures'].append(attrsD)
 941
 942     def _start_source(self, attrsD):
 943         if self.initem:
 944             self.items[-1]['source'] = attrsD
 945         self.push('source', 1)
 946
 947     def _end_source(self):
 948         self.pop('source')
 949
 950     def _start_content(self, attrsD):
 951         self.incontent += 1
 952         self.contentparams = {'mode': attrsD.get('mode', 'xml'),
 953                               'type': attrsD.get('type', 'text/plain'),
 954                               'language': attrsD.get('xml:lang', self.lang),
 955                               'base': attrsD.get('xml:base', self.baseuri)}
 956         self.push('content', 1)
 957
 958     def _start_body(self, attrsD):
 959         self.incontent += 1
 960         self.contentparams = {'mode': 'xml',
 961                               'type': 'application/xhtml+xml',
 962                               'language': attrsD.get('xml:lang', self.lang),
 963                               'base': attrsD.get('xml:base', self.baseuri)}
 964         self.push('content', 1)
 965     _start_xhtml_body = _start_body
 966
 967     def _start_content_encoded(self, attrsD):
 968         self.incontent += 1
 969         self.contentparams = {'mode': 'escaped',
 970                               'type': 'text/html',
 971                               'language': attrsD.get('xml:lang', self.lang),
 972                               'base': attrsD.get('xml:base', self.baseuri)}
 973         self.push('content', 1)
 974     _start_fullitem = _start_content_encoded
 975
 976     def _end_content(self):
 977         value = self.pop('content')
 978         if self.contentparams.get('type') in (['text/plain'] + self.html_types):
 979             self._save('description', value)
 980         self.incontent -= 1
 981         self.contentparams.clear()
 982     _end_body = _end_content
 983     _end_xhtml_body = _end_content
 984     _end_content_encoded = _end_content
 985     _end_fullitem = _end_content
 986
 987 if _XML_AVAILABLE:
 988     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler):
 989         def __init__(self, baseuri):
 990             if _debug: sys.stderr.write('trying StrictFeedParser\n')
 991             xml.sax.handler.ContentHandler.__init__(self)
 992             _FeedParserMixin.__init__(self, baseuri)
 993             self.bozo = 0
 994             self.exc = None
 995
 996         def startPrefixMapping(self, prefix, uri):
 997             self.trackNamespace(prefix, uri)
 998
 999         def startElementNS(self, name, qname, attrs):
1000             namespace, localname = name
1001             namespace = str(namespace)
1002             prefix = self.namespaces.get(namespace, '')
1003             if prefix:
1004                 localname = prefix + ':' + localname
1005             localname = str(localname).lower()
1006
1007             # qname implementation is horribly broken in Python 2.1 (it
1008             # doesn't report any), and slightly broken in Python 2.2 (it
1009             # doesn't report the xml: namespace). So we match up namespaces
1010             # with a known list first, and then possibly override them with
1011             # the qnames the SAX parser gives us (if indeed it gives us any
1012             # at all).  Thanks to MatejC for helping me test this and
1013             # tirelessly telling me that it didn't work yet.
1014             attrsD = {}
1015             for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1016                 prefix = self.namespaces.get(namespace, '')
1017                 if prefix:
1018                     attrlocalname = prefix + ":" + attrlocalname
1019                 attrsD[str(attrlocalname).lower()] = attrvalue
1020             for qname in attrs.getQNames():
1021                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1022             self.unknown_starttag(localname, attrsD.items())
1023
1024         def resolveEntity(self, publicId, systemId):
1025             return _StringIO()
1026
1027         def characters(self, text):
1028             self.handle_data(text)
1029
1030         def endElementNS(self, name, qname):
1031             namespace, localname = name
1032             namespace = str(namespace)
1033             prefix = self.namespaces.get(namespace, '')
1034             if prefix:
1035                 localname = prefix + ':' + localname
1036             localname = str(localname).lower()
1037             self.unknown_endtag(localname)
1038
1039         def fatalError(self, exc):
1040             self.bozo = 1
1041             self.exc = exc
1042         error = fatalError
1043
1044 class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
1045     def __init__(self, baseuri):
1046         sgmllib.SGMLParser.__init__(self)
1047         _FeedParserMixin.__init__(self, baseuri)
1048
1049 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1050     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1051       'img', 'input', 'isindex', 'link', 'meta', 'param']
1052
1053     def __init__(self):
1054         sgmllib.SGMLParser.__init__(self)
1055
1056     def reset(self):
1057         # extend (called by sgmllib.SGMLParser.__init__)
1058         self.pieces = []
1059         sgmllib.SGMLParser.reset(self)
1060
1061     def normalize_attrs(self, attrs):
1062         # utility method to be called by descendants
1063         attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
1064         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1065         return attrs
1066
1067     def unknown_starttag(self, tag, attrs):
1068         # called for each start tag
1069         # attrs is a list of (attr, value) tuples
1070         # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
1071         strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
1072         if tag in self.elements_no_end_tag:
1073             self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
1074         else:
1075             self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
1076
1077     def unknown_endtag(self, tag):
1078         # called for each end tag, e.g. for </pre>, tag will be "pre"
1079         # Reconstruct the original end tag.
1080         if tag not in self.elements_no_end_tag:
1081             self.pieces.append("</%(tag)s>" % locals())
1082
1083     def handle_charref(self, ref):
1084         # called for each character reference, e.g. for "&#160;", ref will be "160"
1085         # Reconstruct the original character reference.
1086         self.pieces.append("&#%(ref)s;" % locals())
1087
1088     def handle_entityref(self, ref):
1089         # called for each entity reference, e.g. for "&copy;", ref will be "copy"
1090         # Reconstruct the original entity reference.
1091         self.pieces.append("&%(ref)s;" % locals())
1092
1093     def handle_data(self, text):
1094         # called for each block of plain text, i.e. outside of any tag and
1095         # not containing any character or entity references
1096         # Store the original text verbatim.
1097         self.pieces.append(text)
1098
1099     def handle_comment(self, text):
1100         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1101         # Reconstruct the original comment.
1102         self.pieces.append("<!--%(text)s-->" % locals())
1103
1104     def handle_pi(self, text):
1105         # called for each processing instruction, e.g. <?instruction>
1106         # Reconstruct original processing instruction.
1107         self.pieces.append("<?%(text)s>" % locals())
1108
1109     def handle_decl(self, text):
1110         # called for the DOCTYPE, if present, e.g.
1111         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1112         #     "http://www.w3.org/TR/html4/loose.dtd">
1113         # Reconstruct original DOCTYPE
1114         self.pieces.append("<!%(text)s>" % locals())
1115
1116     def output(self):
1117         """Return processed HTML as a single string"""
1118         return "".join(self.pieces)
1119
1120 class _RelativeURIResolver(_BaseHTMLProcessor):
1121     relative_uris = [('a', 'href'),
1122                      ('applet', 'codebase'),
1123                      ('area', 'href'),
1124                      ('blockquote', 'cite'),
1125                      ('body', 'background'),
1126                      ('del', 'cite'),
1127                      ('form', 'action'),
1128                      ('frame', 'longdesc'),
1129                      ('frame', 'src'),
1130                      ('iframe', 'longdesc'),
1131                      ('iframe', 'src'),
1132                      ('head', 'profile'),
1133                      ('img', 'longdesc'),
1134                      ('img', 'src'),
1135                      ('img', 'usemap'),
1136                      ('input', 'src'),
1137                      ('input', 'usemap'),
1138                      ('ins', 'cite'),
1139                      ('link', 'href'),
1140                      ('object', 'classid'),
1141                      ('object', 'codebase'),
1142                      ('object', 'data'),
1143                      ('object', 'usemap'),
1144                      ('q', 'cite'),
1145                      ('script', 'src')]
1146
1147     def __init__(self, baseuri):
1148         _BaseHTMLProcessor.__init__(self)
1149         self.baseuri = baseuri
1150
1151     def resolveURI(self, uri):
1152         return urlparse.urljoin(self.baseuri, uri)
1153
1154     def unknown_starttag(self, tag, attrs):
1155         attrs = self.normalize_attrs(attrs)
1156         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
1157         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1158
1159 def _resolveRelativeURIs(htmlSource, baseURI):
1160     p = _RelativeURIResolver(baseURI)
1161     p.feed(htmlSource)
1162     return p.output()
1163
1164 class _HTMLSanitizer(_BaseHTMLProcessor):
1165     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1166       'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1167       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1168       'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1169       'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1170       'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1171       'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1172       'thead', 'tr', 'tt', 'u', 'ul', 'var']
1173
1174     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1175       'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1176       'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1177       'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1178       'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1179       'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1180       'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1181       'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1182       'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1183       'usemap', 'valign', 'value', 'vspace', 'width']
1184
1185     unacceptable_elements_with_end_tag = ['script', 'applet']
1186
1187     def reset(self):
1188         _BaseHTMLProcessor.reset(self)
1189         self.unacceptablestack = 0
1190
1191     def unknown_starttag(self, tag, attrs):
1192         if not tag in self.acceptable_elements:
1193             if tag in self.unacceptable_elements_with_end_tag:
1194                 self.unacceptablestack += 1
1195             return
1196         attrs = self.normalize_attrs(attrs)
1197         attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1198         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1199
1200     def unknown_endtag(self, tag):
1201         if not tag in self.acceptable_elements:
1202             if tag in self.unacceptable_elements_with_end_tag:
1203                 self.unacceptablestack -= 1
1204             return
1205         _BaseHTMLProcessor.unknown_endtag(self, tag)
1206
1207     def handle_pi(self, text):
1208         pass
1209
1210     def handle_decl(self, text):
1211         pass
1212
1213     def handle_data(self, text):
1214         if not self.unacceptablestack:
1215             _BaseHTMLProcessor.handle_data(self, text)
1216
1217 def _sanitizeHTML(htmlSource):
1218     p = _HTMLSanitizer()
1219     p.feed(htmlSource)
1220     data = p.output()
1221     if _mxtidy:
1222         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
1223         if data.count('<body'):
1224             data = data.split('<body', 1)[1]
1225             if data.count('>'):
1226                 data = data.split('>', 1)[1]
1227         if data.count('</body'):
1228             data = data.split('</body', 1)[0]
1229     data = data.strip().replace('\r\n', '\n')
1230     return data
1231
1232 class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
1233     def http_error_default(self, req, fp, code, msg, headers):
1234         if ((code / 100) == 3) and (code != 304):
1235             return self.http_error_302(req, fp, code, msg, headers)
1236         from urllib import addinfourl
1237         infourl = addinfourl(fp, headers, req.get_full_url())
1238         infourl.status = code
1239         return infourl
1240
1241     def http_error_302(self, req, fp, code, msg, headers):
1242         infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
1243         infourl.status = code
1244         return infourl
1245
1246     def http_error_301(self, req, fp, code, msg, headers):
1247         infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
1248         infourl.status = code
1249         return infourl
1250
1251     http_error_300 = http_error_302
1252     http_error_307 = http_error_302
1253
1254 def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
1255     """URL, filename, or string --> stream
1256
1257     This function lets you define parsers that take any input source
1258     (URL, pathname to local or network file, or actual data as a string)
1259     and deal with it in a uniform manner.  Returned object is guaranteed
1260     to have all the basic stdio read methods (read, readline, readlines).
1261     Just .close() the object when you're done with it.
1262
1263     If the etag argument is supplied, it will be used as the value of an
1264     If-None-Match request header.
1265
1266     If the modified argument is supplied, it must be a tuple of 9 integers
1267     as returned by gmtime() in the standard Python time module. This MUST
1268     be in GMT (Greenwich Mean Time). The formatted date/time will be used
1269     as the value of an If-Modified-Since request header.
1270
1271     If the agent argument is supplied, it will be used as the value of a
1272     User-Agent request header.
1273
1274     If the referrer argument is supplied, it will be used as the value of a
1275     Referer[sic] request header.
1276     """
1277
1278     if hasattr(url_file_stream_or_string, "read"):
1279         return url_file_stream_or_string
1280
1281     if url_file_stream_or_string == "-":
1282         return sys.stdin
1283
1284     if not agent:
1285         agent = USER_AGENT
1286
1287     # try to open with urllib2 (to use optional headers)
1288     request = urllib2.Request(url_file_stream_or_string)
1289     if etag:
1290         request.add_header("If-None-Match", etag)
1291     if modified:
1292         # format into an RFC 1123-compliant timestamp. We can't use
1293         # time.strftime() since the %a and %b directives can be affected
1294         # by the current locale, but RFC 2616 states that dates must be
1295         # in English.
1296         short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
1297         months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
1298         request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1299     request.add_header("User-Agent", agent)
1300     if referrer:
1301         request.add_header("Referer", referrer)
1302     if gzip:
1303         request.add_header("Accept-encoding", "gzip")
1304     opener = urllib2.build_opener(_FeedURLHandler())
1305     opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1306     try:
1307         try:
1308             return opener.open(request)
1309         except:
1310             # url_file_stream_or_string is not a valid URL, but it might be a valid filename
1311             pass
1312     finally:
1313         opener.close() # JohnD
1314
1315     # try to open with native open function (if url_file_stream_or_string is a filename)
1316     try:
1317         return open(url_file_stream_or_string)
1318     except:
1319         pass
1320
1321     # treat url_file_stream_or_string as string
1322     return _StringIO(str(url_file_stream_or_string))
1323
1324 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
1325 # Drake and licensed under the Python license.  Removed all range checking
1326 # for month, day, hour, minute, and second, since mktime will normalize
1327 # these later
1328 def _w3dtf_parse(s):
1329     def __extract_date(m):
1330         year = int(m.group("year"))
1331         if year < 100:
1332             year = 100 * int(time.gmtime()[0] / 100) + int(year)
1333         if year < 1000:
1334             return 0, 0, 0
1335         julian = m.group("julian")
1336         if julian:
1337             julian = int(julian)
1338             month = julian / 30 + 1
1339             day = julian % 30 + 1
1340             jday = None
1341             while jday != julian:
1342                 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
1343                 jday = time.gmtime(t)[-2]
1344                 diff = abs(jday - julian)
1345                 if jday > julian:
1346                     if diff < day:
1347                         day = day - diff
1348                     else:
1349                         month = month - 1
1350                         day = 31
1351                 elif jday < julian:
1352                     if day + diff < 28:
1353                        day = day + diff
1354                     else:
1355                         month = month + 1
1356             return year, month, day
1357         month = m.group("month")
1358         day = 1
1359         if month is None:
1360             month = 1
1361         else:
1362             month = int(month)
1363             day = m.group("day")
1364             if day:
1365                 day = int(day)
1366             else:
1367                 day = 1
1368         return year, month, day
1369
1370     def __extract_time(m):
1371         if not m:
1372             return 0, 0, 0
1373         hours = m.group("hours")
1374         if not hours:
1375             return 0, 0, 0
1376         hours = int(hours)
1377         minutes = int(m.group("minutes"))
1378         seconds = m.group("seconds")
1379         if seconds:
1380             seconds = int(seconds)
1381         else:
1382             seconds = 0
1383         return hours, minutes, seconds
1384
1385     def __extract_tzd(m):
1386         """Return the Time Zone Designator as an offset in seconds from UTC."""
1387         if not m:
1388             return 0
1389         tzd = m.group("tzd")
1390         if not tzd:
1391             return 0
1392         if tzd == "Z":
1393             return 0
1394         hours = int(m.group("tzdhours"))
1395         minutes = m.group("tzdminutes")
1396         if minutes:
1397             minutes = int(minutes)
1398         else:
1399             minutes = 0
1400         offset = (hours*60 + minutes) * 60
1401         if tzd[0] == "+":
1402             return -offset
1403         return offset
1404
1405     __date_re = ("(?P<year>\d\d\d\d)"
1406                  "(?:(?P<dsep>-|)"
1407                  "(?:(?P<julian>\d\d\d)"
1408                  "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
1409     __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
1410     __tzd_rx = re.compile(__tzd_re)
1411     __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
1412                  "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
1413                  + __tzd_re)
1414     __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
1415     __datetime_rx = re.compile(__datetime_re)
1416     m = __datetime_rx.match(s)
1417     if m is None or m.group() != s:
1418         return None
1419     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
1420     if gmt[0] == 0: return
1421     return time.mktime(gmt) + __extract_tzd(m) - time.timezone
1422
1423 # Additional ISO-8601 date parsing routines written by Fazal Majid
1424 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1425 # parser is beyond the scope of feedparser and would be a worthwhile addition
1426 # to the Python library
1427 # A single regular expression cannot parse ISO 8601 date formats into groups
1428 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1429 # 0301-04-01), so we use templates instead
1430 # Please note the order in templates is significant because we need a
1431 # greedy match
1432 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1433                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1434                 '-YY-?MM', '-OOO', '-YY',
1435                 '--MM-?DD', '--MM',
1436                 '---DD',
1437                 'CC', '']
1438 _iso8601_re = [
1439     tmpl.replace(
1440     'YYYY', r'(?P<year>\d{4})').replace(
1441     'YY', r'(?P<year>\d\d)').replace(
1442     'MM', r'(?P<month>[01]\d)').replace(
1443     'DD', r'(?P<day>[0123]\d)').replace(
1444     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1445     'CC', r'(?P<century>\d\d$)')
1446     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1447     + r'(:(?P<second>\d{2}))?'
1448     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1449     for tmpl in _iso8601_tmpl]
1450 del tmpl
1451
1452 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1453 del regex
1454
1455 # rfc822.py defines several time zones, but we define some extra ones.
1456 # "ET" is equivalent to "EST", etc.
1457 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
1458 rfc822._timezones.update(_additional_timezones)
1459
1460 def _parse_date(date):
1461     """Parses a variety of date formats into a tuple of 9 integers"""
1462     date = str(date)
1463     try:
1464         # try the standard rfc822 library, which handles
1465         # RFC822, RFC1123, RFC2822, and asctime
1466         tm = rfc822.parsedate_tz(date)
1467         if tm:
1468             return time.gmtime(rfc822.mktime_tz(tm))
1469         # not a RFC2822 date, try W3DTF profile of ISO-8601
1470         try:
1471             tm = _w3dtf_parse(date)
1472         except ValueError:
1473             tm = None
1474         if tm:
1475             return time.gmtime(tm)
1476         # try various non-W3DTF ISO-8601-compatible formats like 20040105
1477         m = None
1478         for _iso8601_match in _iso8601_matches:
1479             m = _iso8601_match(date)
1480             if m: break
1481         if not m: return
1482         # catch truly malformed strings
1483         if m.span() == (0, 0): return
1484         params = m.groupdict()
1485         ordinal = params.get("ordinal", 0)
1486         if ordinal:
1487             ordinal = int(ordinal)
1488         else:
1489             ordinal = 0
1490         year = params.get("year", "--")
1491         if not year or year == "--":
1492             year = time.gmtime()[0]
1493         elif len(year) == 2:
1494             # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1495             year = 100 * int(time.gmtime()[0] / 100) + int(year)
1496         else:
1497             year = int(year)
1498         month = params.get("month", "-")
1499         if not month or month == "-":
1500             # ordinals are NOT normalized by mktime, we simulate them
1501             # by setting month=1, day=ordinal
1502             if ordinal:
1503                 month = 1
1504             else:
1505                 month = time.gmtime()[1]
1506         month = int(month)
1507         day = params.get("day", 0)
1508         if not day:
1509             # see above
1510             if ordinal:
1511                 day = ordinal
1512             elif params.get("century", 0) or \
1513                      params.get("year", 0) or params.get("month", 0):
1514                 day = 1
1515             else:
1516                 day = time.gmtime()[2]
1517         else:
1518             day = int(day)
1519         # special case of the century - is the first year of the 21st century
1520         # 2000 or 2001 ? The debate goes on...
1521         if "century" in params.keys():
1522             year = (int(params["century"]) - 1) * 100 + 1
1523         # in ISO 8601 most fields are optional
1524         for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
1525             if not params.get(field, None):
1526                 params[field] = 0
1527         hour = int(params.get("hour", 0))
1528         minute = int(params.get("minute", 0))
1529         second = int(params.get("second", 0))
1530         # weekday is normalized by mktime(), we can ignore it
1531         weekday = 0
1532         # daylight savings is complex, but not needed for feedparser's purposes
1533         # as time zones, if specified, include mention of whether it is active
1534         # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
1535         # and most implementations have DST bugs
1536         daylight_savings_flag = 0
1537         tm = [year, month, day, hour, minute, second, weekday,
1538               ordinal, daylight_savings_flag]
1539         # ISO 8601 time zone adjustments
1540         tz = params.get("tz")
1541         if tz and tz != "Z":
1542             if tz[0] == "-":
1543                 tm[3] += int(params.get("tzhour", 0))
1544                 tm[4] += int(params.get("tzmin", 0))
1545             elif tz[0] == "+":
1546                 tm[3] -= int(params.get("tzhour", 0))
1547                 tm[4] -= int(params.get("tzmin", 0))
1548             else:
1549                 return None
1550         # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1551         # which is guaranteed to normalize d/m/y/h/m/s
1552         # many implementations have bugs, but we'll pretend they don't
1553         return time.localtime(time.mktime(tm))
1554     except:
1555         return None
1556
1557 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
1558     """Parse a feed from a URL, file, stream, or string"""
1559     result = {}
1560     f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
1561     data = f.read()
1562     if hasattr(f, "headers"):
1563         if gzip and f.headers.get('content-encoding', '') == 'gzip':
1564             try:
1565                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
1566             except:
1567                 # some feeds claim to be gzipped but they're not, so we get garbage
1568                 data = ''
1569     if hasattr(f, "info"):
1570         info = f.info()
1571         result["etag"] = info.getheader("ETag")
1572         last_modified = info.getheader("Last-Modified")
1573         if last_modified:
1574             result["modified"] = _parse_date(last_modified)
1575     if hasattr(f, "url"):
1576         result["url"] = f.url
1577         result["status"] = 200 # default, may be overridden later
1578     if hasattr(f, "status"):
1579         result["status"] = f.status
1580     if hasattr(f, "headers"):
1581         result["headers"] = f.headers.dict
1582     # get the xml encoding
1583     xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
1584     match = xmlheaderRe.match(data)
1585     if match:
1586         result["encoding"] = match.groups()[0].lower()
1587     f.close()
1588     result['channel'] = {}
1589     result['items'] = {}
1590     baseuri = result.get('headers', {}).get('content-location', result.get('url'))
1591     # try true XML parser first
1592     if _XML_AVAILABLE:
1593         if _debug: sys.stderr.write('using xml library\n')
1594         result['bozo'] = 0
1595         feedparser = _StrictFeedParser(baseuri)
1596         if re.search(r'<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', data):
1597             feedparser.version = 'rss091n'
1598         source = xml.sax.xmlreader.InputSource()
1599         source.setByteStream(_StringIO(data))
1600         saxparser = xml.sax.make_parser()#["drv_libxml2"])
1601         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
1602         saxparser.setContentHandler(feedparser)
1603         saxparser.setErrorHandler(feedparser)
1604         try:
1605             saxparser.setDTDHandler(feedparser)
1606             saxparser.setEntityResolver(feedparser)
1607         except xml.sax.SAXNotSupportedException:
1608             if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n')
1609             # libxml2 driver does not currently support DTDHandler or EntityResolver
1610             pass
1611         if hasattr(saxparser, '_ns_stack'):
1612             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
1613             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
1614             saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
1615         try:
1616             saxparser.parse(source)
1617         except Exception, e:
1618             # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
1619             # which captures them.  For some reason, some Unicode-related errors go
1620             # uncaught on some combination of platform, XML library, Python version,
1621             # and phase of the moon.
1622             feedparser.bozo = 1
1623             feedparser.bozo_exception = e
1624         if feedparser.bozo:
1625             # feed is not well-formed XML, fall back on regex-based parser
1626             if _debug: sys.stderr.write('xml parsing failed, using regexes.  now you have two problems...\n')
1627             result['bozo'] = 1
1628             result['bozo_exception'] = feedparser.exc
1629             # munge short tags, e.g. <description/> becomes <description></description>
1630             data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
1631             feedparser = _LooseFeedParser(baseuri)
1632             feedparser.feed(data)
1633     else:
1634         if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
1635         data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
1636         feedparser = _LooseFeedParser(baseuri)
1637         feedparser.feed(data)
1638     result['channel'] = feedparser.channel
1639     result['items'] = feedparser.items
1640     result['version'] = feedparser.version
1641     return result
1642
1643 _TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
1644               'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
1645               'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
1646               'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
1647               'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
1648               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
1649               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
1650               'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
1651
1652 if __name__ == '__main__':
1653     if sys.argv[1:]:
1654         urls = sys.argv[1:]
1655     else:
1656         urls = _TEST_SUITE
1657     from pprint import pprint
1658     for url in urls:
1659         print url
1660         print
1661         result = parse(url)
1662         pprint(result)
1663         print
1664
1665 #TODO
1666 #- image
1667 #- textinput/textInput
1668 #- comments
1669 #
1670 #encoding notes:
1671 #- RFC 3023
1672 #- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
1673 #- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
1674 #- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
1675 #
1676 #REVISION HISTORY
1677 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
1678 #  added Simon Fell's test suite
1679 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
1680 #2.0 - 10/19/2002
1681 #  JD - use inchannel to watch out for image and textinput elements which can
1682 #  also contain title, link, and description elements
1683 #  JD - check for isPermaLink="false" attribute on guid elements
1684 #  JD - replaced openAnything with open_resource supporting ETag and
1685 #  If-Modified-Since request headers
1686 #  JD - parse now accepts etag, modified, agent, and referrer optional
1687 #  arguments
1688 #  JD - modified parse to return a dictionary instead of a tuple so that any
1689 #  etag or modified information can be returned and cached by the caller
1690 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
1691 #  because of etag/modified, return the old etag/modified to the caller to
1692 #  indicate why nothing is being returned
1693 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
1694 #  useless.  Fixes the problem JD was addressing by adding it.
1695 #2.1 - 11/14/2002 - MAP - added gzip support
1696 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
1697 #  start_admingeneratoragent is an example of how to handle elements with
1698 #  only attributes, no content.
1699 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
1700 #  also, make sure we send the User-Agent even if urllib2 isn't available.
1701 #  Match any variation of backend.userland.com/rss namespace.
1702 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
1703 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
1704 #  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
1705 #  project name
1706 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
1707 #  removed unnecessary urllib code -- urllib2 should always be available anyway;
1708 #  return actual url, status, and full HTTP headers (as result['url'],
1709 #  result['status'], and result['headers']) if parsing a remote feed over HTTP --
1710 #  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
1711 #  added the latest namespace-of-the-week for RSS 2.0
1712 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
1713 #  User-Agent (otherwise urllib2 sends two, which confuses some servers)
1714 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
1715 #  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
1716 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
1717 #  textInput, and also to return the character encoding (if specified)
1718 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
1719 #  nested divs within content (JohnD); fixed missing sys import (JohanS);
1720 #  fixed regular expression to capture XML character encoding (Andrei);
1721 #  added support for Atom 0.3-style links; fixed bug with textInput tracking;
1722 #  added support for cloud (MartijnP); added support for multiple
1723 #  category/dc:subject (MartijnP); normalize content model: "description" gets
1724 #  description (which can come from description, summary, or full content if no
1725 #  description), "content" gets dict of base/language/type/value (which can come
1726 #  from content:encoded, xhtml:body, content, or fullitem);
1727 #  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
1728 #  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
1729 #  <content> element is not in default namespace (like Pocketsoap feed);
1730 #  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
1731 #  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
1732 #  description, xhtml:body, content, content:encoded, title, subtitle,
1733 #  summary, info, tagline, and copyright; added support for pingback and
1734 #  trackback namespaces
1735 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
1736 #  namespaces, as opposed to 2.6 when I said I did but didn't really;
1737 #  sanitize HTML markup within some elements; added mxTidy support (if
1738 #  installed) to tidy HTML markup within some elements; fixed indentation
1739 #  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
1740 #  (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
1741 #  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
1742 #  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
1743 #  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
1744 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
1745 #  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
1746 #  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
1747 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
1748 #  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
1749 #  fixed relative URI processing for guid (skadz); added ICBM support; added
1750 #  base64 support
1751 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
1752 #  blogspot.com sites); added _debug variable
1753 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
1754 #3.0 - MAP - parse entire feed with real XML parser (if available); added several
1755 #  new supported namespaces; fixed bug tracking naked markup in description;
1756 #  added support for enclosure; added support for source; re-added support for
1757 #  cloud which got dropped somehow; added support for expirationDate; fixed
1758 #  xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
1759 #  documents that don't define one explicitly and one for documents that define
1760 #  an outer and an inner xml:base that goes out of scope before the end of the
1761 #  document; fixed bug parsing multiple links at feed level; added feed type and
1762 #  version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
1763 #  or empty string if unrecognized; added support for creativeCommons:license and
1764 #  cc:license; added support for full Atom content model in title, tagline, info,
1765 #  copyright, summary; fixed bug with gzip encoding (not always telling server
1766 #  we support it when we do); support Atom-style author element in author_detail
1767 #  (dictionary of "name", "url", "email"); map author to author_detail if author
1768 #  contains name + email address; better handling of empty HTML tags (br, hr, img,
1769 #  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
1770 #  fixed CDATA handling in non-wellformed feeds under Python 2.1