Visit http://diveintomark.org/projects/feed_parser/ for the latest version
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
-
-Things it handles that choke other parsers:
-- bastard combinations of RSS 0.9x and RSS 1.0
-- illegal 8-bit XML characters
-- naked and/or invalid HTML in description
-- content:encoded, xhtml:body, fullitem
-- guid
-- elements in non-standard namespaces or non-default namespaces
-- multiple content items per entry (Atom)
-- multiple links per entry (Atom)
-
-Other features:
-- resolves relative URIs in some elements
- - uses xml:base to define base URI
- - uses URI of feed if no xml:base is given
- - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri
-- resolves relative URIs within embedded markup
- - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris
-- sanitizes embedded markup in some elements
- - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements
- - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes
- - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup
- - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = []
-- optionally tidies embedded markup
- - fixes malformed HTML
- - converts to XHTML
- - converts character entities to numeric entities
- - requires mxTidy <http://www.lemburg.com/files/python/mxTidy.html>
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
Required: Python 2.1 or later
Recommended: Python 2.3 or later
+Recommended: libxml2 <http://xmlsoft.org/python.html>
"""
-__version__ = "3.0-beta-14"
+__version__ = "3.0-beta-22"
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
"Fazal Majid <http://www.majid.info/mylos/weblog/>"]
__license__ = "Python"
_debug = 0
+_debug_never_use_libxml2 = 0
# if you are embedding feedparser in a larger application, you should change this to your application name and URL
USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1.
+# This is off by default because of reports of crashing on some platforms. If it crashes
+# for you, please submit a bug report with your OS platform, Python version, and the URL
+# of the feed you were attempting to parse.
+# Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+TIDY_MARKUP = 0
+
# ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types
try:
from cStringIO import StringIO as _StringIO
except:
# timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
# Python 2.3 now has this functionality available in the standard socket library, so under
-# 2.3 you don't need to install anything.
-import socket
-if hasattr(socket, 'setdefaulttimeout'):
- socket.setdefaulttimeout(10)
-else:
- try:
- import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
- timeoutsocket.setDefaultSocketTimeout(10)
- except ImportError:
- pass
+# 2.3 you don't need to install anything. But you probably should anyway, because the socket
+# module is buggy and timeoutsocket is better.
+try:
+ import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
+ timeoutsocket.setDefaultSocketTimeout(10)
+except ImportError:
+ import socket
+ if hasattr(socket, 'setdefaulttimeout'):
+ socket.setdefaulttimeout(10)
import urllib2
-# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
-# this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
-try:
- from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
-except:
- _mxtidy = None
+_mxtidy = None
+if TIDY_MARKUP:
+ try:
+ from mx.Tidy import Tidy as _mxtidy
+ except:
+ pass
# If a real XML parser is available, feedparser will attempt to use it. feedparser works
# with both the built-in SAX parser and PyXML SAX parser. On platforms where the Python
# using one.
try:
import xml.sax
- from xml.sax.saxutils import escape as xmlescape
+ from xml.sax.saxutils import escape as _xmlescape
+ class CharacterEncodingOverride(xml.sax.SAXException): pass
_XML_AVAILABLE = 1
except:
_XML_AVAILABLE = 0
- def xmlescape(data):
+ def _xmlescape(data):
data = data.replace("&", "&")
data = data.replace(">", ">")
data = data.replace("<", "<")
'atom01': 'Atom 0.1',
'atom02': 'Atom 0.2',
'atom03': 'Atom 0.3',
- 'atom': 'Atom (unknown version)'
+ 'atom': 'Atom (unknown version)',
+ 'cdf': 'CDF',
+ 'hotrss': 'Hot RSS'
}
try:
rc[k] = v
return rc
+from UserDict import UserDict
+class FeedParserDict(UserDict):
+ def __getitem__(self, key):
+ if key == 'channel': key = 'feed'
+ if key == 'items': key = 'entries'
+ return UserDict.__getitem__(self, key)
+
+ def __getattr__(self, key):
+ try:
+ return self.__dict__[key]
+ except KeyError:
+ pass
+ try:
+ return self.__getitem__(key)
+ except:
+ raise AttributeError, "object has no attribute '%s'" % key
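+    # Usage sketch (follows from __getitem__/__getattr__ above):
+    #   d = FeedParserDict()
+    #   d['feed'] = FeedParserDict({'title': u'Example'})
+    #   d['channel'] is d['feed']   # 'channel' is an alias for 'feed'
+    #   d.feed.title == u'Example'  # attribute access falls through to items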
+
class _FeedParserMixin:
- namespaces = {"http://backend.userland.com/rss": "",
+ namespaces = {"": "",
+ "http://backend.userland.com/rss": "",
"http://blogs.law.harvard.edu/tech/rss": "",
"http://purl.org/rss/1.0/": "",
+ "http://my.netscape.com/rdf/simple/0.9/": "",
"http://example.com/newformat#": "",
"http://example.com/necho": "",
"http://purl.org/echo/": "",
"http://www.w3.org/XML/1998/namespace": "xml"
}
- can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
+ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments']
can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
html_types = ['text/html', 'application/xhtml+xml']
- def __init__(self, baseuri=None):
+ def __init__(self, baseuri=None, encoding='utf-8'):
if _debug: sys.stderr.write("initializing FeedParser\n")
- self.channel = {} # channel- or feed-level data
- self.items = [] # list of item- or entry-level data
+ self.feeddata = FeedParserDict() # feed-level data
+ self.encoding = encoding # character encoding
+ self.entries = [] # list of entry-level data
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
# the following are used internally to track state;
# some of this is kind of out of control and should
# probably be refactored into a finite state machine
- self.inchannel = 0
- self.initem = 0
+ self.infeed = 0
+ self.inentry = 0
self.incontent = 0
self.intextinput = 0
self.inimage = 0
self.inauthor = 0
self.incontributor = 0
- self.contentparams = {}
+ self.contentparams = FeedParserDict()
self.namespacemap = {}
self.elementstack = []
self.basestack = []
# track xml:base and xml:lang
attrsD = dict(attrs)
- baseuri = attrsD.get('xml:base')
+ baseuri = attrsD.get('xml:base', attrsD.get('base'))
if baseuri:
if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
self.baseuri = baseuri
- lang = attrsD.get('xml:lang')
+ lang = attrsD.get('xml:lang', attrsD.get('lang'))
if lang:
self.lang = lang
self.basestack.append(baseuri)
return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
# match namespaces
- try:
+ if tag.find(':') <> -1:
prefix, suffix = tag.split(':', 1)
- except ValueError:
+ else:
prefix, suffix = '', tag
prefix = self.namespacemap.get(prefix, prefix)
if prefix:
def unknown_endtag(self, tag):
if _debug: sys.stderr.write('end %s\n' % tag)
# match namespaces
- try:
+ if tag.find(':') <> -1:
prefix, suffix = tag.split(':', 1)
- except ValueError:
+ else:
prefix, suffix = '', tag
prefix = self.namespacemap.get(prefix, prefix)
if prefix:
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
if not self.elementstack: return
+# if _debug: sys.stderr.write(text)
if escape and self.contentparams.get('mode') == 'xml':
- text = xmlescape(text)
+ text = _xmlescape(text)
self.elementstack[-1][2].append(text)
def handle_comment(self, text):
pass
def handle_decl(self, text):
- # called for the DOCTYPE, if present, e.g.
- # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
- # "http://www.w3.org/TR/html4/loose.dtd">
- if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'):
- self.version = 'rss091n'
-
- _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
- def _scan_name(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- if i == n:
- return None, -1
- m = self._new_declname_match(rawdata, i)
- if m:
- s = m.group()
- name = s.strip()
- if (i + len(s)) == n:
- return None, -1 # end of buffer
- return name.lower(), m.end()
- else:
- self.updatepos(declstartpos, i)
- self.error("expected name token")
+ pass
def parse_declaration(self, i):
# override internal declaration handler to handle CDATA blocks
if _debug: sys.stderr.write("entering parse_declaration\n")
- if re.search(r'^<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', self.rawdata[i:]):
- if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
- self.version = 'rss091n'
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1: k = len(self.rawdata)
- self.handle_data(xmlescape(self.rawdata[i+9:k]), 0)
+ self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
return k+3
else:
k = self.rawdata.find('>', i)
def trackNamespace(self, prefix, uri):
if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
self.version = 'rss090'
+ if (prefix, uri) == (None, 'http://purl.org/rss/1.0/') and not self.version:
+ self.version = 'rss10'
if not prefix: return
if uri.find('backend.userland.com/rss') <> -1:
# match any backend.userland.com namespace
return data
def push(self, element, expectingText):
-# print 'push', element, expectingText
# while self.elementstack and self.elementstack[-1][1]:
# self.pop(self.elementstack[-1][0])
self.elementstack.append([element, expectingText, []])
def pop(self, element):
-# print 'pop', element
if not self.elementstack: return
# while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0])
if self.elementstack[-1][0] != element: return
# resolve relative URIs within embedded markup
if element in self.can_contain_relative_uris:
- output = _resolveRelativeURIs(output, self.baseuri)
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
# sanitize embedded markup
if element in self.can_contain_dangerous_markup:
- output = _sanitizeHTML(output)
+ output = _sanitizeHTML(output, self.encoding)
+
+ if type(output) == types.StringType:
+ try:
+ output = unicode(output, self.encoding)
+ except:
+ pass
# store output in appropriate place(s)
- if self.initem:
+ if self.inentry:
if element == 'content':
- self.items[-1].setdefault(element, [])
+ self.entries[-1].setdefault(element, [])
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.items[-1][element].append(contentparams)
+ self.entries[-1][element].append(contentparams)
elif element == 'category':
- self.items[-1][element] = output
- domain = self.items[-1]['categories'][-1][0]
- self.items[-1]['categories'][-1] = (domain, output)
+ self.entries[-1][element] = output
+ domain = self.entries[-1]['categories'][-1][0]
+ self.entries[-1]['categories'][-1] = (domain, output)
elif element == 'source':
- self.items[-1]['source']['value'] = output
+ self.entries[-1]['source']['value'] = output
elif element == 'link':
- self.items[-1][element] = output
+ self.entries[-1][element] = output
if output:
- self.items[-1]['links'][-1]['href'] = output
+ self.entries[-1]['links'][-1]['href'] = output
else:
- if self.incontent and element != 'description':
+ self.entries[-1][element] = output
+ if self.incontent:
+ if element == 'description':
+ element = 'summary'
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.items[-1][element + '_detail'] = contentparams
- self.items[-1][element] = output
- elif self.inchannel and (not self.intextinput) and (not self.inimage):
+ self.entries[-1][element + '_detail'] = contentparams
+ elif self.infeed and (not self.intextinput) and (not self.inimage):
+ self.feeddata[element] = output
if element == 'category':
- domain = self.channel['categories'][-1][0]
- self.channel['categories'][-1] = (domain, output)
+ domain = self.feeddata['categories'][-1][0]
+ self.feeddata['categories'][-1] = (domain, output)
elif element == 'link':
- self.channel['links'][-1]['href'] = output
- else:
- if self.incontent and element != 'description':
- contentparams = copy.deepcopy(self.contentparams)
- contentparams['value'] = output
- self.channel[element + '_detail'] = contentparams
- self.channel[element] = output
+ self.feeddata['links'][-1]['href'] = output
+ elif self.incontent:
+ if element == 'description':
+ element = 'tagline'
+ contentparams = copy.deepcopy(self.contentparams)
+ contentparams['value'] = output
+ self.feeddata[element + '_detail'] = contentparams
return output
def _mapToStandardPrefix(self, name):
def _save(self, key, value):
if value:
- if self.initem:
- self.items[-1].setdefault(key, value)
- elif self.channel:
- self.channel.setdefault(key, value)
+ if self.inentry:
+ self.entries[-1].setdefault(key, value)
+ elif self.feeddata:
+ self.feeddata.setdefault(key, value)
def _start_rss(self, attrsD):
versionmap = {'0.91': 'rss091u',
self.version = 'rss20'
else:
self.version = 'rss'
+
+ def _start_dlhottitles(self, attrsD):
+ self.version = 'hotrss'
def _start_channel(self, attrsD):
- self.inchannel = 1
-
+ self.infeed = 1
+ self._cdf_common(attrsD)
+ _start_feedinfo = _start_channel
+
+ def _cdf_common(self, attrsD):
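+        # CDF carries modification date and link as attributes, e.g.
+        # <channel href="http://example.com/" lastmod="2004-04-19">;
+        # replay them through the regular _start_/_end_ handlers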
+ if attrsD.has_key('lastmod'):
+ if _debug: sys.stderr.write(attrsD['lastmod'] + '\n')
+ self._start_modified({})
+ self.elementstack[-1][-1] = attrsD['lastmod']
+ self._end_modified()
+ if attrsD.has_key('href'):
+ self._start_link({})
+ self.elementstack[-1][-1] = attrsD['href']
+ self._end_link()
+
def _start_feed(self, attrsD):
- self.inchannel = 1
+ self.infeed = 1
versionmap = {'0.1': 'atom01',
'0.2': 'atom02',
'0.3': 'atom03'}
self.version = 'atom'
def _end_channel(self):
- self.inchannel = 0
+ self.infeed = 0
_end_feed = _end_channel
def _start_image(self, attrsD):
def _start_textinput(self, attrsD):
self.intextinput = 1
+ self.push('textinput', 0)
+ context = self._getContext()
+ context.setdefault('textinput', FeedParserDict())
_start_textInput = _start_textinput
def _end_textinput(self):
+ self.pop('textinput')
self.intextinput = 0
_end_textInput = _end_textinput
self.incontributor = 1
context = self._getContext()
context.setdefault('contributors', [])
- context['contributors'].append({})
+ context['contributors'].append(FeedParserDict())
self.push('contributor', 0)
def _end_contributor(self):
self._save_author('name', value)
elif self.incontributor:
self._save_contributor('name', value)
- pass
elif self.intextinput:
- # TODO
- pass
+ context = self._getContext()
+ context['textinput']['name'] = value
def _start_url(self, attrsD):
- self.push('url', 0)
+ self.push('url', 1)
_start_homepage = _start_url
_start_uri = _start_url
# TODO
pass
elif self.intextinput:
- # TODO
+ # TODO (map to link)
pass
_end_homepage = _end_url
_end_uri = _end_url
elif self.incontributor:
self._save_contributor('email', value)
pass
- elif self.inimage:
- # TODO
- pass
- elif self.intextinput:
- # TODO
- pass
def _getContext(self):
- if self.initem:
- context = self.items[-1]
+ if self.inentry:
+ context = self.entries[-1]
else:
- context = self.channel
+ context = self.feeddata
return context
def _save_author(self, key, value):
context = self._getContext()
- context.setdefault('author_detail', {})
+ context.setdefault('author_detail', FeedParserDict())
context['author_detail'][key] = value
self._sync_author_detail()
def _save_contributor(self, key, value):
context = self._getContext()
- context.setdefault('contributors', [{}])
+ context.setdefault('contributors', [FeedParserDict()])
context['contributors'][-1][key] = value
def _sync_author_detail(self):
emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
if not emailmatch: return
email = emailmatch.group(0)
+ # probably a better way to do the following, but it passes all the tests
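+        # e.g. author 'mark@example.com (Mark Pilgrim)' comes out as
+        # name 'Mark Pilgrim', email 'mark@example.com'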
author = author.replace(email, '')
author = author.replace('()', '')
author = author.strip()
- context.setdefault('author_detail', {})
+ if author and (author[0] == '('):
+ author = author[1:]
+ if author and (author[-1] == ')'):
+ author = author[:-1]
+ author = author.strip()
+ context.setdefault('author_detail', FeedParserDict())
context['author_detail']['name'] = author
context['author_detail']['email'] = email
def _start_tagline(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('tagline', 1)
_start_subtitle = _start_tagline
value = self.pop('tagline')
self.incontent -= 1
self.contentparams.clear()
- if self.inchannel:
- self.channel['description'] = value
+ if self.infeed:
+ self.feeddata['description'] = value
_end_subtitle = _end_tagline
def _start_copyright(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('copyright', 1)
_start_dc_rights = _start_copyright
_end_dc_rights = _end_copyright
def _start_item(self, attrsD):
- self.items.append({})
+ self.entries.append(FeedParserDict())
self.push('item', 0)
- self.initem = 1
+ self.inentry = 1
+ self._cdf_common(attrsD)
_start_entry = _start_item
+ _start_product = _start_item
def _end_item(self):
self.pop('item')
- self.initem = 0
+ self.inentry = 0
_end_entry = _end_item
def _start_dc_language(self, attrsD):
def _end_dcterms_modified(self):
value = self.pop('modified')
+ if _debug: sys.stderr.write('_end_dcterms_modified, value=' + value + '\n')
parsed_value = _parse_date(value)
self._save('date', value)
self._save('date_parsed', parsed_value)
self.push('category', 1)
domain = self._getAttribute(attrsD, 'domain')
cats = []
- if self.initem:
- cats = self.items[-1].setdefault('categories', [])
- elif self.inchannel:
- cats = self.channel.setdefault('categories', [])
+ if self.inentry:
+ cats = self.entries[-1].setdefault('categories', [])
+ elif self.infeed:
+ cats = self.feeddata.setdefault('categories', [])
cats.append((domain, None))
_start_dc_subject = _start_category
+ _start_keywords = _start_category
def _end_category(self):
self.pop('category')
_end_dc_subject = _end_category
+ _end_keywords = _end_category
def _start_cloud(self, attrsD):
- self.channel['cloud'] = attrsD
+ self.feeddata['cloud'] = attrsD
def _start_link(self, attrsD):
attrsD.setdefault('rel', 'alternate')
attrsD.setdefault('type', 'text/html')
if attrsD.has_key('href'):
attrsD['href'] = self.resolveURI(attrsD['href'])
- expectingText = self.inchannel or self.initem
- if self.initem:
- self.items[-1].setdefault('links', [])
- self.items[-1]['links'].append(attrsD)
- elif self.inchannel:
- self.channel.setdefault('links', [])
- self.channel['links'].append(attrsD)
+ expectingText = self.infeed or self.inentry
+ if self.inentry:
+ self.entries[-1].setdefault('links', [])
+ self.entries[-1]['links'].append(attrsD)
+ elif self.infeed:
+ self.feeddata.setdefault('links', [])
+ self.feeddata['links'].append(attrsD)
if attrsD.has_key('href'):
expectingText = 0
if attrsD.get('type', '') in self.html_types:
- if self.initem:
- self.items[-1]['link'] = attrsD['href']
- elif self.inchannel:
- self.channel['link'] = attrsD['href']
+ if self.inentry:
+ self.entries[-1]['link'] = attrsD['href']
+ elif self.infeed:
+ self.feeddata['link'] = attrsD['href']
else:
self.push('link', expectingText)
+ _start_producturl = _start_link
+
+ def _end_link(self):
+ value = self.pop('link')
+ if self.intextinput:
+ context = self._getContext()
+ context['textinput']['link'] = value
+ _end_producturl = _end_link
def _start_guid(self, attrsD):
self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
def _start_title(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
- self.push('title', self.inchannel or self.initem)
+ 'base': attrsD.get('xml:base', self.baseuri)})
+ self.push('title', self.infeed or self.inentry)
_start_dc_title = _start_title
def _end_title(self):
- self.pop('title')
+ value = self.pop('title')
self.incontent -= 1
self.contentparams.clear()
+ if self.intextinput:
+ context = self._getContext()
+ context['textinput']['title'] = value
_end_dc_title = _end_title
- def _start_description(self, attrsD):
+ def _start_description(self, attrsD, default_content_type='text/html'):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/html'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
+ 'type': attrsD.get('type', default_content_type),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
- self.push('description', self.inchannel or self.initem)
+ 'base': attrsD.get('xml:base', self.baseuri)})
+ self.push('description', self.infeed or self.inentry)
+
+ def _start_abstract(self, attrsD):
+ return self._start_description(attrsD, 'text/plain')
def _end_description(self):
value = self.pop('description')
- if self.initem:
- self.items[-1]['summary'] = value
- elif self.inchannel:
- self.channel['tagline'] = value
self.incontent -= 1
self.contentparams.clear()
-
+ context = self._getContext()
+ if self.intextinput:
+ context['textinput']['description'] = value
+ elif self.inentry:
+ context['summary'] = value
+ elif self.infeed:
+ context['tagline'] = value
+ _end_abstract = _end_description
+
def _start_info(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('info', 1)
def _end_info(self):
def _start_generator(self, attrsD):
if attrsD:
- self.channel['generator_detail'] = attrsD
+ if attrsD.has_key('url'):
+ attrsD['url'] = self.resolveURI(attrsD['url'])
+ self.feeddata['generator_detail'] = attrsD
self.push('generator', 1)
def _end_generator(self):
value = self.pop('generator')
- if self.channel.has_key('generator_detail'):
- self.channel['generator_detail']['name'] = value
+ if self.feeddata.has_key('generator_detail'):
+ self.feeddata['generator_detail']['name'] = value
def _start_admin_generatoragent(self, attrsD):
self.push('generator', 1)
def _start_summary(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('summary', 1)
def _end_summary(self):
value = self.pop('summary')
- if self.items:
- self.items[-1]['description'] = value
+ if self.entries:
+ self.entries[-1]['description'] = value
self.incontent -= 1
self.contentparams.clear()
def _start_enclosure(self, attrsD):
- if self.initem:
- self.items[-1].setdefault('enclosures', [])
- self.items[-1]['enclosures'].append(attrsD)
+ if self.inentry:
+ self.entries[-1].setdefault('enclosures', [])
+ self.entries[-1]['enclosures'].append(attrsD)
def _start_source(self, attrsD):
- if self.initem:
- self.items[-1]['source'] = attrsD
+ if self.inentry:
+ self.entries[-1]['source'] = attrsD
self.push('source', 1)
def _end_source(self):
def _start_content(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
'type': attrsD.get('type', 'text/plain'),
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
+ self.push('content', 1)
+
+ def _start_prodlink(self, attrsD):
+ self.incontent += 1
+ self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
+ 'type': attrsD.get('type', 'text/html'),
+ 'language': attrsD.get('xml:lang', self.lang),
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
def _start_body(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': 'xml',
+ self.contentparams = FeedParserDict({'mode': 'xml',
'type': 'application/xhtml+xml',
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
_start_xhtml_body = _start_body
def _start_content_encoded(self, attrsD):
self.incontent += 1
- self.contentparams = {'mode': 'escaped',
+ self.contentparams = FeedParserDict({'mode': 'escaped',
'type': 'text/html',
'language': attrsD.get('xml:lang', self.lang),
- 'base': attrsD.get('xml:base', self.baseuri)}
+ 'base': attrsD.get('xml:base', self.baseuri)})
self.push('content', 1)
_start_fullitem = _start_content_encoded
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
+ _end_prodlink = _end_content
if _XML_AVAILABLE:
- class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler):
- def __init__(self, baseuri):
+ class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler):
+ def __init__(self, baseuri, encoding):
if _debug: sys.stderr.write('trying StrictFeedParser\n')
xml.sax.handler.ContentHandler.__init__(self)
- _FeedParserMixin.__init__(self, baseuri)
+ _FeedParserMixin.__init__(self, baseuri, encoding)
self.bozo = 0
self.exc = None
def startElementNS(self, name, qname, attrs):
namespace, localname = name
- namespace = str(namespace)
- prefix = self.namespaces.get(namespace, '')
+ namespace = str(namespace or '')
+ if namespace.find('backend.userland.com/rss') <> -1:
+ # match any backend.userland.com namespace
+ namespace = 'http://backend.userland.com/rss'
+ prefix = self.namespaces.get(namespace, 'unknown')
if prefix:
localname = prefix + ':' + localname
localname = str(localname).lower()
localname = str(localname).lower()
self.unknown_endtag(localname)
- def fatalError(self, exc):
+ def error(self, exc):
self.bozo = 1
self.exc = exc
- error = fatalError
-
-class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
- def __init__(self, baseuri):
- sgmllib.SGMLParser.__init__(self)
- _FeedParserMixin.__init__(self, baseuri)
+
+ def fatalError(self, exc):
+ self.error(exc)
+ raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param']
- def __init__(self):
+ def __init__(self, encoding):
+ self.encoding = encoding
sgmllib.SGMLParser.__init__(self)
def reset(self):
- # extend (called by sgmllib.SGMLParser.__init__)
self.pieces = []
sgmllib.SGMLParser.reset(self)
+ def feed(self, data):
+        # escape <! sequences that are not DOCTYPE, comment, or CDATA markers
+        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
+        # munge short tags, e.g. <description/> becomes <description></description>
+        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
+        data = data.replace('&#39;', "'")
+        data = data.replace('&#34;', '"')
+ if type(data) == types.UnicodeType:
+ data = data.encode(self.encoding)
+ sgmllib.SGMLParser.feed(self, data)
+
def normalize_attrs(self, attrs):
# utility method to be called by descendants
attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
# called for each start tag
# attrs is a list of (attr, value) tuples
# e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
if tag in self.elements_no_end_tag:
self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
self.pieces.append(text)
def handle_comment(self, text):
# Reconstruct original DOCTYPE
self.pieces.append("<!%(text)s>" % locals())
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+ def _scan_name(self, i, declstartpos):
+ rawdata = self.rawdata
+ if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata))
+ n = len(rawdata)
+ if i == n:
+ return None, -1
+ m = self._new_declname_match(rawdata, i)
+ if m:
+ s = m.group()
+ name = s.strip()
+ if (i + len(s)) == n:
+ return None, -1 # end of buffer
+ return name.lower(), m.end()
+ else:
+ self.handle_data(rawdata)
+# self.updatepos(declstartpos, i)
+ return None, -1
+
def output(self):
"""Return processed HTML as a single string"""
- return "".join(self.pieces)
+ if _debug:
+ for p in self.pieces:
+ sys.stderr.write(p)
+ sys.stderr.write('\n')
+ return "".join([str(p) for p in self.pieces])
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+ def __init__(self, baseuri, encoding):
+ sgmllib.SGMLParser.__init__(self)
+ _FeedParserMixin.__init__(self, baseuri, encoding)
class _RelativeURIResolver(_BaseHTMLProcessor):
relative_uris = [('a', 'href'),
('q', 'cite'),
('script', 'src')]
- def __init__(self, baseuri):
- _BaseHTMLProcessor.__init__(self)
+ def __init__(self, baseuri, encoding):
+ _BaseHTMLProcessor.__init__(self, encoding)
self.baseuri = baseuri
def resolveURI(self, uri):
attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-def _resolveRelativeURIs(htmlSource, baseURI):
- p = _RelativeURIResolver(baseURI)
+def _resolveRelativeURIs(htmlSource, baseURI, encoding):
+ if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
+ p = _RelativeURIResolver(baseURI, encoding)
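+    # e.g. with baseURI 'http://example.org/feed/', embedded markup like
+    # <a href="entry1"> should come back as
+    # <a href="http://example.org/feed/entry1">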
+ if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n')
p.feed(htmlSource)
return p.output()
if not self.unacceptablestack:
_BaseHTMLProcessor.handle_data(self, text)
-def _sanitizeHTML(htmlSource):
- p = _HTMLSanitizer()
+def _sanitizeHTML(htmlSource, encoding):
+ p = _HTMLSanitizer(encoding)
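+    # disallowed elements and their contents are dropped, e.g.
+    # '<script>alert(1)</script><p>safe</p>' should come back as '<p>safe</p>'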
p.feed(htmlSource)
data = p.output()
- if _mxtidy:
+ if _mxtidy and TIDY_MARKUP:
nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
if data.count('<body'):
data = data.split('<body', 1)[1]
if url_file_stream_or_string == "-":
return sys.stdin
- if not agent:
- agent = USER_AGENT
+ if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
+ if not agent:
+ agent = USER_AGENT
- # try to open with urllib2 (to use optional headers)
- request = urllib2.Request(url_file_stream_or_string)
- if etag:
- request.add_header("If-None-Match", etag)
- if modified:
- # format into an RFC 1123-compliant timestamp. We can't use
- # time.strftime() since the %a and %b directives can be affected
- # by the current locale, but RFC 2616 states that dates must be
- # in English.
- short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
- months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
- request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
- request.add_header("User-Agent", agent)
- if referrer:
- request.add_header("Referer", referrer)
- if gzip:
- request.add_header("Accept-encoding", "gzip")
- opener = urllib2.build_opener(_FeedURLHandler())
- opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
- try:
+ # try to open with urllib2 (to use optional headers)
+ request = urllib2.Request(url_file_stream_or_string)
+ request.add_header("User-Agent", agent)
+ if etag:
+ request.add_header("If-None-Match", etag)
+ if modified:
+ # format into an RFC 1123-compliant timestamp. We can't use
+ # time.strftime() since the %a and %b directives can be affected
+ # by the current locale, but RFC 2616 states that dates must be
+ # in English.
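+            # e.g. "Sat, 07 Sep 2002 00:00:01 GMT"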
+ short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+ months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+ request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+ if referrer:
+ request.add_header("Referer", referrer)
+ if gzip:
+ request.add_header("Accept-encoding", "gzip")
+ opener = urllib2.build_opener(_FeedURLHandler())
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
try:
- return opener.open(request)
- except:
- # url_file_stream_or_string is not a valid URL, but it might be a valid filename
- pass
- finally:
- opener.close() # JohnD
+ try:
+ return opener.open(request)
+# except ValueError:
+# # not a valid URL, but might be a valid filename
+# pass
+# except AssertionError:
+# # under Python 2.1, non-URLs will fail with an AssertionError;
+# # still might be a valid filename, so fall through
+# pass
+ except:
+ return _StringIO('')
+ finally:
+ opener.close() # JohnD
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
except:
return None
+def _getCharacterEncoding(http_headers, xml_data):
+ """Get the character encoding of the XML document
+
+ http_headers is a dictionary
+ xml_data is a raw string (not Unicode)
+
+ This is so much trickier than it sounds,
+ it's not even funny. According to RFC 3023 ("XML Media Types"), if
+ the HTTP Content-Type is application/xml, application/*+xml,
+ application/xml-external-parsed-entity, or application/xml-dtd,
+ the encoding given in the charset parameter of the HTTP Content-Type
+ takes precedence over the encoding given in the XML prefix within the
+ document, and defaults to "utf-8" if neither are specified. But, if
+ the HTTP Content-Type is text/xml, text/*+xml, or
+ text/xml-external-parsed-entity, the encoding given in the XML prefix
+ within the document is ALWAYS IGNORED and only the encoding given in
+ the charset parameter of the HTTP Content-Type header should be
+ respected, and it defaults to "us-ascii" if not specified. If
+ Content-Type is unspecified (input was local file or non-HTTP source)
+ or unrecognized (server just got it totally wrong), then go by the
+ encoding given in the XML prefix of the document and default to
+ "utf-8" as per the XML specification.
+ """
+
+ def _parseHTTPContentType(content_type):
+ """takes HTTP Content-Type header and returns (content type, charset)
+
+ If no charset is specified, returns (content type, '')
+ If no content type is specified, returns ('', '')
+ Both return parameters are guaranteed to be lowercase strings
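+
+        e.g. _parseHTTPContentType('text/xml; charset="utf-8"')
+        returns ('text/xml', 'utf-8')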
+ """
+ if not content_type:
+ return '', ''
+ content_type = content_type.strip()
+ paramstr = content_type.split(';')[1:]
+ if not paramstr:
+ return content_type, ''
+ content_type = content_type.split(';', 1)[0].strip().lower()
+ if not paramstr[0]:
+ # declaration like "text/xml;" (note ending semicolon)
+ # dunno if this is malformed but it sure was hard to track down
+ return content_type, ''
+ import string
+ params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
+ charset = params.get('charset')
+ if not charset:
+ return content_type, ''
+ if charset[0] in ('"', "'"):
+ charset = charset[1:]
+ if charset and charset[-1] in ('"', "'"):
+ charset = charset[:-1]
+ charset = charset.strip()
+ return content_type, charset
+
+ true_encoding = None
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
+ xml_encoding_match = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+ xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or ''
+ if (http_content_type == 'application/xml') or \
+ (http_content_type == 'application/xml-dtd') or \
+ (http_content_type == 'application/xml-external-parsed-entity') or \
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+ if http_encoding:
+ true_encoding = http_encoding
+ elif xml_encoding:
+ true_encoding = xml_encoding
+ else:
+ true_encoding = 'utf-8'
+ elif (http_content_type == 'text/xml') or \
+ (http_content_type == 'text/xml-external-parsed-entity') or \
+ (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
+ if http_encoding:
+ true_encoding = http_encoding
+ else:
+ true_encoding = 'us-ascii'
+ else:
+ true_encoding = xml_encoding or 'utf-8'
+ return true_encoding, http_encoding, xml_encoding
+
+def _changeEncodingDeclaration(data, encoding):
+ """Changes an XML data stream on the fly to specify a new encoding
+
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
+ encoding is a string recognized by encodings.aliases
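+
+    e.g. _changeEncodingDeclaration("<?xml version='1.0'?><feed/>", 'iso-8859-1')
+    returns "<?xml version='1.0' encoding='iso-8859-1'?><feed/>"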
+ """
+ if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n')
+ if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding)
+ #import cjkcodecs.aliases
+ #import japanese
+ data = unicode(data, encoding)
+ declmatch = re.compile(u'^<\?xml[^>]*?>')
+ newdecl = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
+ if declmatch.search(data):
+ data = declmatch.sub(newdecl, data)
+ else:
+ data = newdecl + u'\n' + data
+ return data.encode(encoding)
+
+def _stripDoctype(data):
+ """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+
+ rss_version may be "rss091n" or None
+ stripped_data is the same XML document, minus the DOCTYPE
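+
+    e.g. '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
+    "http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss>...</rss>'
+    comes back as ('rss091n', '<rss>...</rss>')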
+ """
+ doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+ doctype_results = doctype_pattern.findall(data)
+ doctype = doctype_results and doctype_results[0] or ''
+ if doctype.lower().count('netscape'):
+ version = 'rss091n'
+ else:
+ version = None
+ data = doctype_pattern.sub('', data)
+ return version, data
+
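+# Usage sketch: parse() accepts a URL, filename, stream, or string, e.g.
+#   result = parse('http://example.org/index.xml')
+#   print result['feed'].get('title'), len(result['entries'])
+# result is a FeedParserDict, so result.feed and result.entries also work.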
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
"""Parse a feed from a URL, file, stream, or string"""
- result = {}
+ result = FeedParserDict()
f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
data = f.read()
if hasattr(f, "headers"):
result["status"] = f.status
if hasattr(f, "headers"):
result["headers"] = f.headers.dict
- # get the xml encoding
- xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
- match = xmlheaderRe.match(data)
- if match:
- result["encoding"] = match.groups()[0].lower()
f.close()
- result['channel'] = {}
- result['items'] = {}
+ if result.get("status", 0) == 304:
+ result['feed'] = FeedParserDict()
+ result['entries'] = []
+ result['debug_message'] = "The feed has not changed since you last checked, so the server sent no data. This is a feature, not a bug!"
+ return result
+ result['encoding'], http_encoding, xml_encoding = _getCharacterEncoding(result.get("headers", {}), data)
+ result['version'], data = _stripDoctype(data)
baseuri = result.get('headers', {}).get('content-location', result.get('url'))
# try true XML parser first
- if _XML_AVAILABLE:
+ if not _XML_AVAILABLE:
+ if _debug: sys.stderr.write('no xml libraries available\n')
+ use_strict_parser = _XML_AVAILABLE
+ if use_strict_parser:
if _debug: sys.stderr.write('using xml library\n')
result['bozo'] = 0
- feedparser = _StrictFeedParser(baseuri)
- if re.search(r'<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', data):
- feedparser.version = 'rss091n'
- source = xml.sax.xmlreader.InputSource()
- source.setByteStream(_StringIO(data))
- saxparser = xml.sax.make_parser()#["drv_libxml2"])
+ feedparser = _StrictFeedParser(baseuri, result['encoding'])
+ if _debug and _debug_never_use_libxml2:
+ sys.stderr.write('not using libxml2 (even if available)\n')
+ additional_parsers = []
+ else:
+ additional_parsers = ["drv_libxml2"]
+ saxparser = xml.sax.make_parser(additional_parsers)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
saxparser.setContentHandler(feedparser)
saxparser.setErrorHandler(feedparser)
try:
saxparser.setDTDHandler(feedparser)
+ except xml.sax.SAXNotSupportedException:
+ # libxml2 driver does not support DTDHandler
+ if _debug: sys.stderr.write('using an xml library that does not support DTDHandler (not a big deal)\n')
+ try:
saxparser.setEntityResolver(feedparser)
except xml.sax.SAXNotSupportedException:
- if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n')
- # libxml2 driver does not currently support DTDHandler or EntityResolver
- pass
+ # libxml2 driver does not support EntityResolver
+ if _debug: sys.stderr.write('using an xml library that does not support EntityResolver (not a big deal)\n')
+ encoding_set = (result['encoding'] == xml_encoding)
+ if not encoding_set:
+ bozo_exception = None
+ proposed_encodings = [result['encoding'], xml_encoding, 'utf-8', 'iso-8859-1', 'windows-1252']
+ tried_encodings = []
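+        # try each proposed encoding in turn; the first one that can actually
+        # decode the raw data (see _changeEncodingDeclaration) wins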
+ for proposed_encoding in proposed_encodings:
+            if proposed_encoding in tried_encodings: continue
+ tried_encodings.append(proposed_encoding)
+ try:
+ data = _changeEncodingDeclaration(data, proposed_encoding)
+ except Exception, bozo_exception:
+ if _debug: sys.stderr.write('character encoding is wrong\n')
+ else:
+ if proposed_encoding != result['encoding']:
+ try:
+ raise CharacterEncodingOverride, "document declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding)
+ except CharacterEncodingOverride, bozo_exception:
+ result['bozo'] = 1
+ result['bozo_exception'] = bozo_exception
+ result['encoding'] = proposed_encoding
+ encoding_set = 1
+ break
+ if not encoding_set:
+ result['bozo'] = 1
+ result['bozo_exception'] = bozo_exception
+ use_strict_parser = 0
+ if use_strict_parser:
+ source = xml.sax.xmlreader.InputSource()
+ source.setByteStream(_StringIO(data))
if hasattr(saxparser, '_ns_stack'):
# work around bug in built-in SAX parser (doesn't recognize xml: namespace)
# PyXML doesn't have this problem, and it doesn't have _ns_stack either
try:
saxparser.parse(source)
except Exception, e:
- # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
- # which captures them. For some reason, some Unicode-related errors go
- # uncaught on some combination of platform, XML library, Python version,
- # and phase of the moon.
+ if _debug: sys.stderr.write('xml parsing failed\n')
feedparser.bozo = 1
- feedparser.bozo_exception = e
+ feedparser.bozo_exception = feedparser.exc or e
if feedparser.bozo:
# feed is not well-formed XML, fall back on regex-based parser
- if _debug: sys.stderr.write('xml parsing failed, using regexes. now you have two problems...\n')
result['bozo'] = 1
- result['bozo_exception'] = feedparser.exc
- # munge short tags, e.g. <description/> becomes <description></description>
- data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
- feedparser = _LooseFeedParser(baseuri)
- feedparser.feed(data)
- else:
- if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
- data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
- feedparser = _LooseFeedParser(baseuri)
+ result['bozo_exception'] = feedparser.bozo_exception
+ use_strict_parser = 0
+ if not use_strict_parser:
+ if _debug: sys.stderr.write('using regexes, now you have two problems\n')
+ feedparser = _LooseFeedParser(baseuri, result['encoding'])
feedparser.feed(data)
- result['channel'] = feedparser.channel
- result['items'] = feedparser.items
- result['version'] = feedparser.version
+ result['feed'] = feedparser.feeddata
+ result['entries'] = feedparser.entries
+ result['version'] = result['version'] or feedparser.version
return result
-_TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
- 'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
- 'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
- 'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
- 'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
- 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
- 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
- 'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
-
if __name__ == '__main__':
- if sys.argv[1:]:
- urls = sys.argv[1:]
+ if not sys.argv[1:]:
+ print __doc__
+ sys.exit(0)
else:
- urls = _TEST_SUITE
+ urls = sys.argv[1:]
from pprint import pprint
for url in urls:
print url
#TODO
#- image
-#- textinput/textInput
-#- comments
-#
-#encoding notes:
-#- RFC 3023
-#- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
-#- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
-#- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
#
#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
# blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0 - MAP - parse entire feed with real XML parser (if available); added several
-# new supported namespaces; fixed bug tracking naked markup in description;
-# added support for enclosure; added support for source; re-added support for
-# cloud which got dropped somehow; added support for expirationDate; fixed
-# xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
-# documents that don't define one explicitly and one for documents that define
-# an outer and an inner xml:base that goes out of scope before the end of the
-# document; fixed bug parsing multiple links at feed level; added feed type and
-# version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
-# or empty string if unrecognized; added support for creativeCommons:license and
-# cc:license; added support for full Atom content model in title, tagline, info,
-# copyright, summary; fixed bug with gzip encoding (not always telling server
-# we support it when we do); support Atom-style author element in author_detail
+#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+# added several new supported namespaces; fixed bug tracking naked markup in
+# description; added support for enclosure; added support for source; re-added
+# support for cloud which got dropped somehow; added support for expirationDate
+#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+# xml:base URI, one for documents that don't define one explicitly and one for
+# documents that define an outer and an inner xml:base that goes out of scope
+# before the end of the document
+#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
+# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
+# added support for creativeCommons:license and cc:license; added support for
+# full Atom content model in title, tagline, info, copyright, summary; fixed bug
+# with gzip encoding (not always telling server we support it when we do)
+#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
# (dictionary of "name", "url", "email"); map author to author_detail if author
-# contains name + email address; better handling of empty HTML tags (br, hr, img,
-# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
-# fixed CDATA handling in non-wellformed feeds under Python 2.1
+# contains name + email address
+#3.0b8 - 1/28/2004 - MAP - added support for contributor
+#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+# support for summary
+#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+# xml.util.iso8601
+#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+# dangerous markup; fiddled with decodeEntities (not right); liberalized
+# date parsing even further
+#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+# added support for Atom 0.2 subtitle; added support for Atom content model
+# in copyright; better sanitizing of dangerous HTML elements with end tags
+# (script, frameset)
+#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
+#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+# Python 2.1
+#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+# fixed bug capturing author and contributor URL; fixed bug resolving relative
+# links in author and contributor URL; fixed bug resolving relative links in
+# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
+# namespace tests, and included them permanently in the test suite with his
+# permission; fixed namespace handling under Python 2.1
+#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+# use libxml2 (if available)
+#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+# name was in parentheses; removed ultra-problematic mxTidy support; patch to
+# workaround crash in PyXML/expat when encountering invalid entities
+# (MarkMoraes); support for textinput/textInput
+#3.0b20 - 4/7/2004 - MAP - added CDF support
+#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'items' to 'entries' in
+# results dict; changed results dict to allow getting values with results.key
+# as well as results[key]; work around embedded illformed HTML with half
+# a DOCTYPE; work around malformed Content-Type header; if character encoding
+# is wrong, try several common ones before falling back to regexes (if this
+# works, bozo_exception is set to CharacterEncodingOverride); fixed character
+# encoding issues in BaseHTMLProcessor by tracking encoding and converting
+# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
+# convert each value in results to Unicode (if possible), even if using
+# regex-based parsing