Fixes for dodgy blogs, new feedparser
author     davyd <davyd>  Mon, 3 May 2004 16:15:31 +0000 (16:15 +0000)
committer  davyd <davyd>  Mon, 3 May 2004 16:15:31 +0000 (16:15 +0000)
Changelog
XMLParse2.py
extra/feedparser.py
feedlist
update-planet

index bf8e029..ebcf42f 100644
--- a/Changelog
+++ b/Changelog
@@ -1,3 +1,12 @@
+2004-04-28
+==========
+ * update-planet
+   Cacheability hacks for Adrian Woodley's blog.
+ * XMLParse2.py
+   Logging tweaks.
+ * extra/feedparser.py
+   Upgraded to new version of feedparser.
+
 2004-03-22
 ==========
  * CacheHandler.py
index 861b333..d1ca814 100644
--- a/XMLParse2.py
+++ b/XMLParse2.py
@@ -42,19 +42,20 @@ class XMLParse:
                "Return a single Blog object"
                item            = Blog()
                if self.blogObject and self.blogObject.cache:
-                       sys.stdout.write('Downloading feed %s...' % self.feedURL)
+                       sys.stdout.write('Downloading feed %s... ' % self.feedURL)
                        try:
                                data    = feedparser.parse(self.feedURL, self.blogObject.cache.etag, self.blogObject.cache.date)
-                               sys.stdout.write('done.\n')
+                               # check to see what we got returned
+                               if data['items'] == [] and data['channel'] == {}:
+                                       sys.stdout.write('cached.\n')
+                                       return self.blogObject
+                               else:
+                                       sys.stdout.write('done.\n')
                        except:
                                sys.stdout.write('failed.\n')
                                return None
-                       # check to see what we got returned
-                       if data['items'] == [] and data['channel'] == {}:
-                               sys.stdout.write('Feed %s is upto date.\n' % self.feedURL)
-                               return self.blogObject
                else:
-                       sys.stdout.write('Downloading feed from %s (no cache)...' % self.feedURL)
+                       sys.stdout.write('Downloading feed (no cache) %s... ' % self.feedURL)
                        try:
                                data    = feedparser.parse(self.feedURL)
                                sys.stdout.write('done.\n')
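
With the new feedparser, an up-to-date cached feed comes back with an empty 'items' list and an empty 'channel' dict, which is what the reworked logic above now checks inside the try block. A minimal sketch of the same pattern (feed_url, cached_etag, cached_date, cached_blog and build_blog are hypothetical stand-ins for the Blog plumbing above):

    data = feedparser.parse(feed_url, cached_etag, cached_date)
    if data['items'] == [] and data['channel'] == {}:
        blog = cached_blog        # nothing new on the wire; reuse the cached Blog
    else:
        blog = build_blog(data)   # hypothetical helper: rebuild from fresh data
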
index 024194e..4c4afd9 100644
--- a/extra/feedparser.py
+++ b/extra/feedparser.py
@@ -3,41 +3,14 @@
 
 Visit http://diveintomark.org/projects/feed_parser/ for the latest version
 
-Handles RSS 0.9x, RSS 1.0, RSS 2.0, Atom feeds
-
-Things it handles that choke other parsers:
-- bastard combinations of RSS 0.9x and RSS 1.0
-- illegal 8-bit XML characters
-- naked and/or invalid HTML in description
-- content:encoded, xhtml:body, fullitem
-- guid
-- elements in non-standard namespaces or non-default namespaces
-- multiple content items per entry (Atom)
-- multiple links per entry (Atom)
-
-Other features:
-- resolves relative URIs in some elements
-  - uses xml:base to define base URI
-  - uses URI of feed if no xml:base is given
-  - to control which elements are resolved, set _FeedParserMixin.can_be_relative_uri
-- resolves relative URIs within embedded markup
-  - to control which elements are resolved, set _FeedParserMixin.can_contain_relative_uris
-- sanitizes embedded markup in some elements
-  - to allow/disallow HTML elements, set _HTMLSanitizer.acceptable_elements
-  - to allow/disallow HTML attributes, set _HTMLSanitizer.acceptable_attributes
-  - to control which feed elements are sanitized, set _FeedParserMixin.can_contain_dangerous_markup
-  - to disable entirely (NOT RECOMMENDED), set _FeedParserMixin.can_contain_dangerous_markup = []
-- optionally tidies embedded markup
-  - fixes malformed HTML
-  - converts to XHTML
-  - converts character entities to numeric entities
-  - requires mxTidy <http://www.lemburg.com/files/python/mxTidy.html>
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
 
 Required: Python 2.1 or later
 Recommended: Python 2.3 or later
+Recommended: libxml2 <http://xmlsoft.org/python.html>
 """
 
-__version__ = "3.0-beta-14"
+__version__ = "3.0-beta-22"
 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
 __copyright__ = "Copyright 2002-4, Mark Pilgrim"
 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
@@ -45,12 +18,20 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                     "Fazal Majid <http://www.majid.info/mylos/weblog/>"]
 __license__ = "Python"
 _debug = 0
+_debug_never_use_libxml2 = 0
 
 # if you are embedding feedparser in a larger application, you should change this to your application name and URL
 USER_AGENT = "UniversalFeedParser/%s%s +http://diveintomark.org/projects/feed_parser/" % (__version__, _debug and "-debug" or "")
 
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set this to 1.
+# This is off by default because of reports of crashing on some platforms.  If it crashes
+# for you, please submit a bug report with your OS platform, Python version, and the URL
+# of the feed you were attempting to parse.
+# Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+TIDY_MARKUP = 0
+
 # ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types
 try:
     from cStringIO import StringIO as _StringIO
 except:
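
The new TIDY_MARKUP switch above is deliberately off; turning it on is a one-line edit near the top of extra/feedparser.py, and it only takes effect when mxTidy is importable (both conditions are checked again in _sanitizeHTML further down):

    # in extra/feedparser.py -- opt in to mxTidy post-processing of embedded HTML
    TIDY_MARKUP = 1
    # later, _sanitizeHTML only tidies when both are true:
    #     if _mxtidy and TIDY_MARKUP: ...
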
@@ -66,24 +47,23 @@ except:
     
 # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
 # Python 2.3 now has this functionality available in the standard socket library, so under
-# 2.3 you don't need to install anything.
-import socket
-if hasattr(socket, 'setdefaulttimeout'):
-    socket.setdefaulttimeout(10)
-else:
-    try:
-        import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
-        timeoutsocket.setDefaultSocketTimeout(10)
-    except ImportError:
-        pass
+# 2.3 you don't need to install anything.  But you probably should anyway, because the socket
+# module is buggy and timeoutsocket is better.
+try:
+    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
+    timeoutsocket.setDefaultSocketTimeout(10)
+except ImportError:
+    import socket
+    if hasattr(socket, 'setdefaulttimeout'):
+        socket.setdefaulttimeout(10)
 import urllib2
 
-# mxtidy allows feedparser to tidy malformed embedded HTML markup in description, content, etc.
-# this does not affect HTML sanitizing, which is self-contained in the _HTMLSanitizer class
-try:
-    from mx.Tidy import Tidy as _mxtidy # http://www.lemburg.com/files/python/mxTidy.html
-except:
-    _mxtidy = None
+_mxtidy = None
+if TIDY_MARKUP:
+    try:
+        from mx.Tidy import Tidy as _mxtidy
+    except:
+        pass
 
 # If a real XML parser is available, feedparser will attempt to use it.  feedparser works
 # with both the built-in SAX parser and PyXML SAX parser.  On platforms where the Python
@@ -96,11 +76,12 @@ except:
 # using one.
 try:
     import xml.sax
-    from xml.sax.saxutils import escape as xmlescape
+    from xml.sax.saxutils import escape as _xmlescape
+    class CharacterEncodingOverride(xml.sax.SAXException): pass
     _XML_AVAILABLE = 1
 except:
     _XML_AVAILABLE = 0
-    def xmlescape(data):
+    def _xmlescape(data):
         data = data.replace("&", "&amp;")
         data = data.replace(">", "&gt;")
         data = data.replace("<", "&lt;")
@@ -129,7 +110,9 @@ SUPPORTED_VERSIONS = {'': 'unknown',
                       'atom01': 'Atom 0.1',
                       'atom02': 'Atom 0.2',
                       'atom03': 'Atom 0.3',
-                      'atom': 'Atom (unknown version)'
+                      'atom': 'Atom (unknown version)',
+                      'cdf': 'CDF',
+                      'hotrss': 'Hot RSS'
                       }
 
 try:
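
With 'cdf' and 'hotrss' added to SUPPORTED_VERSIONS, the detected feed type can be reported by name. A small sketch (URL hypothetical, and assuming the handler's detected version is copied into the result as in the released 3.0 betas):

    d = feedparser.parse('http://example.com/desktop.cdf')
    print SUPPORTED_VERSIONS.get(d.get('version') or '', 'unknown')   # e.g. 'CDF'
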
@@ -142,10 +125,29 @@ except NameError:
             rc[k] = v
         return rc
 
+from UserDict import UserDict
+class FeedParserDict(UserDict):
+    def __getitem__(self, key):
+        if key == 'channel': key = 'feed'
+        if key == 'items': key = 'entries'
+        return UserDict.__getitem__(self, key)
+
+    def __getattr__(self, key):
+        try:
+            return self.__dict__[key]
+        except KeyError:
+            pass
+        try:
+            return self.__getitem__(key)
+        except:
+            raise AttributeError, "object has no attribute '%s'" % key
+
 class _FeedParserMixin:
-    namespaces = {"http://backend.userland.com/rss": "",
+    namespaces = {"": "",
+                  "http://backend.userland.com/rss": "",
                   "http://blogs.law.harvard.edu/tech/rss": "",
                   "http://purl.org/rss/1.0/": "",
+                  "http://my.netscape.com/rdf/simple/0.9/": "",
                   "http://example.com/newformat#": "",
                   "http://example.com/necho": "",
                   "http://purl.org/echo/": "",
@@ -196,28 +198,29 @@ class _FeedParserMixin:
                   "http://www.w3.org/XML/1998/namespace":                 "xml"
 }
 
-    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentRSS', 'docs', 'url', 'comments']
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments']
     can_contain_relative_uris = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
     can_contain_dangerous_markup = ['content', 'description', 'title', 'summary', 'info', 'tagline', 'copyright']
     html_types = ['text/html', 'application/xhtml+xml']
     
-    def __init__(self, baseuri=None):
+    def __init__(self, baseuri=None, encoding='utf-8'):
         if _debug: sys.stderr.write("initializing FeedParser\n")
-        self.channel = {} # channel- or feed-level data
-        self.items = [] # list of item- or entry-level data
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
         self.version = '' # feed type/version, see SUPPORTED_VERSIONS
 
         # the following are used internally to track state;
         # some of this is kind of out of control and should
         # probably be refactored into a finite state machine
-        self.inchannel = 0
-        self.initem = 0
+        self.infeed = 0
+        self.inentry = 0
         self.incontent = 0
         self.intextinput = 0
         self.inimage = 0
         self.inauthor = 0
         self.incontributor = 0
-        self.contentparams = {}
+        self.contentparams = FeedParserDict()
         self.namespacemap = {}
         self.elementstack = []
         self.basestack = []
@@ -233,11 +236,11 @@ class _FeedParserMixin:
         
         # track xml:base and xml:lang
         attrsD = dict(attrs)
-        baseuri = attrsD.get('xml:base')
+        baseuri = attrsD.get('xml:base', attrsD.get('base'))
         if baseuri:
             if _debug: sys.stderr.write('self.baseuri=%s\n' % baseuri)
             self.baseuri = baseuri
-        lang = attrsD.get('xml:lang')
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
         if lang:
             self.lang = lang
         self.basestack.append(baseuri)
@@ -267,9 +270,9 @@ class _FeedParserMixin:
             return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
 
         # match namespaces
-        try:
+        if tag.find(':') <> -1:
             prefix, suffix = tag.split(':', 1)
-        except ValueError:
+        else:
             prefix, suffix = '', tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
@@ -286,9 +289,9 @@ class _FeedParserMixin:
     def unknown_endtag(self, tag):
         if _debug: sys.stderr.write('end %s\n' % tag)
         # match namespaces
-        try:
+        if tag.find(':') <> -1:
             prefix, suffix = tag.split(':', 1)
-        except ValueError:
+        else:
             prefix, suffix = '', tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
@@ -340,8 +343,9 @@ class _FeedParserMixin:
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         if not self.elementstack: return
+#        if _debug: sys.stderr.write(text)
         if escape and self.contentparams.get('mode') == 'xml':
-            text = xmlescape(text)
+            text = _xmlescape(text)
         self.elementstack[-1][2].append(text)
 
     def handle_comment(self, text):
@@ -353,39 +357,15 @@ class _FeedParserMixin:
         pass
 
     def handle_decl(self, text):
-        # called for the DOCTYPE, if present, e.g.
-        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
-        #     "http://www.w3.org/TR/html4/loose.dtd">
-        if text.count('http://my.netscape.com/publish/formats/rss-0.91.dtd'):
-            self.version = 'rss091n'
-
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
-    def _scan_name(self, i, declstartpos):
-        rawdata = self.rawdata
-        n = len(rawdata)
-        if i == n:
-            return None, -1
-        m = self._new_declname_match(rawdata, i)
-        if m:
-            s = m.group()
-            name = s.strip()
-            if (i + len(s)) == n:
-                return None, -1  # end of buffer
-            return name.lower(), m.end()
-        else:
-            self.updatepos(declstartpos, i)
-            self.error("expected name token")
+        pass
 
     def parse_declaration(self, i):
         # override internal declaration handler to handle CDATA blocks
         if _debug: sys.stderr.write("entering parse_declaration\n")
-        if re.search(r'^<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', self.rawdata[i:]):
-            if _debug: sys.stderr.write("found Netscape DOCTYPE\n")
-            self.version = 'rss091n'
         if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1: k = len(self.rawdata)
-            self.handle_data(xmlescape(self.rawdata[i+9:k]), 0)
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
             return k+3
         else:
             k = self.rawdata.find('>', i)
@@ -394,6 +374,8 @@ class _FeedParserMixin:
     def trackNamespace(self, prefix, uri):
         if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
             self.version = 'rss090'
+        if (prefix, uri) == (None, 'http://purl.org/rss/1.0/') and not self.version:
+            self.version = 'rss10'
         if not prefix: return
         if uri.find('backend.userland.com/rss') <> -1:
             # match any backend.userland.com namespace
@@ -414,13 +396,11 @@ class _FeedParserMixin:
         return data
         
     def push(self, element, expectingText):
-#        print 'push', element, expectingText
 #        while self.elementstack and self.elementstack[-1][1]:
 #            self.pop(self.elementstack[-1][0])
         self.elementstack.append([element, expectingText, []])
 
     def pop(self, element):
-#        print 'pop', element
         if not self.elementstack: return
 #        while self.elementstack[-1][0] != element: self.pop(self.elementstack[-1][0])
         if self.elementstack[-1][0] != element: return
@@ -448,47 +428,56 @@ class _FeedParserMixin:
 
         # resolve relative URIs within embedded markup
         if element in self.can_contain_relative_uris:
-            output = _resolveRelativeURIs(output, self.baseuri)
+            output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
         
         # sanitize embedded markup
         if element in self.can_contain_dangerous_markup:
-            output = _sanitizeHTML(output)
+            output = _sanitizeHTML(output, self.encoding)
+
+        if type(output) == types.StringType:
+            try:
+                output = unicode(output, self.encoding)
+            except:
+                pass
             
         # store output in appropriate place(s)
-        if self.initem:
+        if self.inentry:
             if element == 'content':
-                self.items[-1].setdefault(element, [])
+                self.entries[-1].setdefault(element, [])
                 contentparams = copy.deepcopy(self.contentparams)
                 contentparams['value'] = output
-                self.items[-1][element].append(contentparams)
+                self.entries[-1][element].append(contentparams)
             elif element == 'category':
-                self.items[-1][element] = output
-                domain = self.items[-1]['categories'][-1][0]
-                self.items[-1]['categories'][-1] = (domain, output)
+                self.entries[-1][element] = output
+                domain = self.entries[-1]['categories'][-1][0]
+                self.entries[-1]['categories'][-1] = (domain, output)
             elif element == 'source':
-                self.items[-1]['source']['value'] = output
+                self.entries[-1]['source']['value'] = output
             elif element == 'link':
-                self.items[-1][element] = output
+                self.entries[-1][element] = output
                 if output:
-                    self.items[-1]['links'][-1]['href'] = output
+                    self.entries[-1]['links'][-1]['href'] = output
             else:
-                if self.incontent and element != 'description':
+                self.entries[-1][element] = output
+                if self.incontent:
+                    if element == 'description':
+                        element = 'summary'
                     contentparams = copy.deepcopy(self.contentparams)
                     contentparams['value'] = output
-                    self.items[-1][element + '_detail'] = contentparams
-                self.items[-1][element] = output
-        elif self.inchannel and (not self.intextinput) and (not self.inimage):
+                    self.entries[-1][element + '_detail'] = contentparams
+        elif self.infeed and (not self.intextinput) and (not self.inimage):
+            self.feeddata[element] = output
             if element == 'category':
-                domain = self.channel['categories'][-1][0]
-                self.channel['categories'][-1] = (domain, output)
+                domain = self.feeddata['categories'][-1][0]
+                self.feeddata['categories'][-1] = (domain, output)
             elif element == 'link':
-                self.channel['links'][-1]['href'] = output
-            else:
-                if self.incontent and element != 'description':
-                    contentparams = copy.deepcopy(self.contentparams)
-                    contentparams['value'] = output
-                    self.channel[element + '_detail'] = contentparams
-            self.channel[element] = output
+                self.feeddata['links'][-1]['href'] = output
+            elif self.incontent:
+                if element == 'description':
+                    element = 'tagline'
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.feeddata[element + '_detail'] = contentparams
         return output
 
     def _mapToStandardPrefix(self, name):
@@ -505,10 +494,10 @@ class _FeedParserMixin:
 
     def _save(self, key, value):
         if value:
-            if self.initem:
-                self.items[-1].setdefault(key, value)
-            elif self.channel:
-                self.channel.setdefault(key, value)
+            if self.inentry:
+                self.entries[-1].setdefault(key, value)
+            elif self.feeddata:
+                self.feeddata.setdefault(key, value)
 
     def _start_rss(self, attrsD):
         versionmap = {'0.91': 'rss091u',
@@ -524,12 +513,28 @@ class _FeedParserMixin:
                 self.version = 'rss20'
             else:
                 self.version = 'rss'
+    
+    def _start_dlhottitles(self, attrsD):
+        self.version = 'hotrss'
 
     def _start_channel(self, attrsD):
-        self.inchannel = 1
-
+        self.infeed = 1
+        self._cdf_common(attrsD)
+    _start_feedinfo = _start_channel
+
+    def _cdf_common(self, attrsD):
+        if attrsD.has_key('lastmod'):
+            if _debug: sys.stderr.write(attrsD['lastmod'] + '\n')
+            self._start_modified({})
+            self.elementstack[-1][-1] = attrsD['lastmod']
+            self._end_modified()
+        if attrsD.has_key('href'):
+            self._start_link({})
+            self.elementstack[-1][-1] = attrsD['href']
+            self._end_link()
+    
     def _start_feed(self, attrsD):
-        self.inchannel = 1
+        self.infeed = 1
         versionmap = {'0.1': 'atom01',
                       '0.2': 'atom02',
                       '0.3': 'atom03'}
@@ -542,7 +547,7 @@ class _FeedParserMixin:
                 self.version = 'atom'
 
     def _end_channel(self):
-        self.inchannel = 0
+        self.infeed = 0
     _end_feed = _end_channel
     
     def _start_image(self, attrsD):
@@ -553,9 +558,13 @@ class _FeedParserMixin:
                 
     def _start_textinput(self, attrsD):
         self.intextinput = 1
+        self.push('textinput', 0)
+        context = self._getContext()
+        context.setdefault('textinput', FeedParserDict())
     _start_textInput = _start_textinput
     
     def _end_textinput(self):
+        self.pop('textinput')
         self.intextinput = 0
     _end_textInput = _end_textinput
 
@@ -578,7 +587,7 @@ class _FeedParserMixin:
         self.incontributor = 1
         context = self._getContext()
         context.setdefault('contributors', [])
-        context['contributors'].append({})
+        context['contributors'].append(FeedParserDict())
         self.push('contributor', 0)
 
     def _end_contributor(self):
@@ -594,13 +603,12 @@ class _FeedParserMixin:
             self._save_author('name', value)
         elif self.incontributor:
             self._save_contributor('name', value)
-            pass
         elif self.intextinput:
-            # TODO
-            pass
+            context = self._getContext()
+            context['textinput']['name'] = value
 
     def _start_url(self, attrsD):
-        self.push('url', 0)
+        self.push('url', 1)
     _start_homepage = _start_url
     _start_uri = _start_url
 
@@ -614,7 +622,7 @@ class _FeedParserMixin:
             # TODO
             pass
         elif self.intextinput:
-            # TODO
+            # TODO (map to link)
             pass
     _end_homepage = _end_url
     _end_uri = _end_url
@@ -629,29 +637,23 @@ class _FeedParserMixin:
         elif self.incontributor:
             self._save_contributor('email', value)
             pass
-        elif self.inimage:
-            # TODO
-            pass
-        elif self.intextinput:
-            # TODO
-            pass
 
     def _getContext(self):
-        if self.initem:
-            context = self.items[-1]
+        if self.inentry:
+            context = self.entries[-1]
         else:
-            context = self.channel
+            context = self.feeddata
         return context
 
     def _save_author(self, key, value):
         context = self._getContext()
-        context.setdefault('author_detail', {})
+        context.setdefault('author_detail', FeedParserDict())
         context['author_detail'][key] = value
         self._sync_author_detail()
 
     def _save_contributor(self, key, value):
         context = self._getContext()
-        context.setdefault('contributors', [{}])
+        context.setdefault('contributors', [FeedParserDict()])
         context['contributors'][-1][key] = value
 
     def _sync_author_detail(self):
@@ -672,19 +674,25 @@ class _FeedParserMixin:
             emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
             if not emailmatch: return
             email = emailmatch.group(0)
+            # probably a better way to do the following, but it passes all the tests
             author = author.replace(email, '')
             author = author.replace('()', '')
             author = author.strip()
-            context.setdefault('author_detail', {})
+            if author and (author[0] == '('):
+                author = author[1:]
+            if author and (author[-1] == ')'):
+                author = author[:-1]
+            author = author.strip()
+            context.setdefault('author_detail', FeedParserDict())
             context['author_detail']['name'] = author
             context['author_detail']['email'] = email
             
     def _start_tagline(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('tagline', 1)
     _start_subtitle = _start_tagline
 
@@ -692,16 +700,16 @@ class _FeedParserMixin:
         value = self.pop('tagline')
         self.incontent -= 1
         self.contentparams.clear()
-        if self.inchannel:
-            self.channel['description'] = value
+        if self.infeed:
+            self.feeddata['description'] = value
     _end_subtitle = _end_tagline
             
     def _start_copyright(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('copyright', 1)
     _start_dc_rights = _start_copyright
 
@@ -712,14 +720,16 @@ class _FeedParserMixin:
     _end_dc_rights = _end_copyright
 
     def _start_item(self, attrsD):
-        self.items.append({})
+        self.entries.append(FeedParserDict())
         self.push('item', 0)
-        self.initem = 1
+        self.inentry = 1
+        self._cdf_common(attrsD)
     _start_entry = _start_item
+    _start_product = _start_item
 
     def _end_item(self):
         self.pop('item')
-        self.initem = 0
+        self.inentry = 0
     _end_entry = _end_item
 
     def _start_dc_language(self, attrsD):
@@ -764,6 +774,7 @@ class _FeedParserMixin:
 
     def _end_dcterms_modified(self):
         value = self.pop('modified')
+        if _debug: sys.stderr.write('_end_dcterms_modified, value=' + value + '\n')
         parsed_value = _parse_date(value)
         self._save('date', value)
         self._save('date_parsed', parsed_value)
@@ -795,41 +806,51 @@ class _FeedParserMixin:
         self.push('category', 1)
         domain = self._getAttribute(attrsD, 'domain')
         cats = []
-        if self.initem:
-            cats = self.items[-1].setdefault('categories', [])
-        elif self.inchannel:
-            cats = self.channel.setdefault('categories', [])
+        if self.inentry:
+            cats = self.entries[-1].setdefault('categories', [])
+        elif self.infeed:
+            cats = self.feeddata.setdefault('categories', [])
         cats.append((domain, None))
     _start_dc_subject = _start_category
+    _start_keywords = _start_category
         
     def _end_category(self):
         self.pop('category')
     _end_dc_subject = _end_category
+    _end_keywords = _end_category
         
     def _start_cloud(self, attrsD):
-        self.channel['cloud'] = attrsD
+        self.feeddata['cloud'] = attrsD
         
     def _start_link(self, attrsD):
         attrsD.setdefault('rel', 'alternate')
         attrsD.setdefault('type', 'text/html')
         if attrsD.has_key('href'):
             attrsD['href'] = self.resolveURI(attrsD['href'])
-        expectingText = self.inchannel or self.initem
-        if self.initem:
-            self.items[-1].setdefault('links', [])
-            self.items[-1]['links'].append(attrsD)
-        elif self.inchannel:
-            self.channel.setdefault('links', [])
-            self.channel['links'].append(attrsD)
+        expectingText = self.infeed or self.inentry
+        if self.inentry:
+            self.entries[-1].setdefault('links', [])
+            self.entries[-1]['links'].append(attrsD)
+        elif self.infeed:
+            self.feeddata.setdefault('links', [])
+            self.feeddata['links'].append(attrsD)
         if attrsD.has_key('href'):
             expectingText = 0
             if attrsD.get('type', '') in self.html_types:
-                if self.initem:
-                    self.items[-1]['link'] = attrsD['href']
-                elif self.inchannel:
-                    self.channel['link'] = attrsD['href']
+                if self.inentry:
+                    self.entries[-1]['link'] = attrsD['href']
+                elif self.infeed:
+                    self.feeddata['link'] = attrsD['href']
         else:
             self.push('link', expectingText)
+    _start_producturl = _start_link
+
+    def _end_link(self):
+        value = self.pop('link')
+        if self.intextinput:
+            context = self._getContext()
+            context['textinput']['link'] = value
+    _end_producturl = _end_link
 
     def _start_guid(self, attrsD):
         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
@@ -852,42 +873,52 @@ class _FeedParserMixin:
             
     def _start_title(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
-        self.push('title', self.inchannel or self.initem)
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('title', self.infeed or self.inentry)
     _start_dc_title = _start_title
 
     def _end_title(self):
-        self.pop('title')
+        value = self.pop('title')
         self.incontent -= 1
         self.contentparams.clear()
+        if self.intextinput:
+            context = self._getContext()
+            context['textinput']['title'] = value
     _end_dc_title = _end_title
 
-    def _start_description(self, attrsD):
+    def _start_description(self, attrsD, default_content_type='text/html'):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
-                              'type': attrsD.get('type', 'text/html'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
+                              'type': attrsD.get('type', default_content_type),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
-        self.push('description', self.inchannel or self.initem)
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('description', self.infeed or self.inentry)
+
+    def _start_abstract(self, attrsD):
+        return self._start_description(attrsD, 'text/plain')
 
     def _end_description(self):
         value = self.pop('description')
-        if self.initem:
-            self.items[-1]['summary'] = value
-        elif self.inchannel:
-            self.channel['tagline'] = value
         self.incontent -= 1
         self.contentparams.clear()
-        
+        context = self._getContext()
+        if self.intextinput:
+            context['textinput']['description'] = value
+        elif self.inentry:
+            context['summary'] = value
+        elif self.infeed:
+            context['tagline'] = value
+    _end_abstract = _end_description
+
     def _start_info(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('info', 1)
 
     def _end_info(self):
@@ -897,13 +928,15 @@ class _FeedParserMixin:
 
     def _start_generator(self, attrsD):
         if attrsD:
-            self.channel['generator_detail'] = attrsD
+            if attrsD.has_key('url'):
+                attrsD['url'] = self.resolveURI(attrsD['url'])
+            self.feeddata['generator_detail'] = attrsD
         self.push('generator', 1)
 
     def _end_generator(self):
         value = self.pop('generator')
-        if self.channel.has_key('generator_detail'):
-            self.channel['generator_detail']['name'] = value
+        if self.feeddata.has_key('generator_detail'):
+            self.feeddata['generator_detail']['name'] = value
             
     def _start_admin_generatoragent(self, attrsD):
         self.push('generator', 1)
@@ -921,27 +954,27 @@ class _FeedParserMixin:
         
     def _start_summary(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'escaped'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('summary', 1)
 
     def _end_summary(self):
         value = self.pop('summary')
-        if self.items:
-            self.items[-1]['description'] = value
+        if self.entries:
+            self.entries[-1]['description'] = value
         self.incontent -= 1
         self.contentparams.clear()
         
     def _start_enclosure(self, attrsD):
-        if self.initem:
-            self.items[-1].setdefault('enclosures', [])
-            self.items[-1]['enclosures'].append(attrsD)
+        if self.inentry:
+            self.entries[-1].setdefault('enclosures', [])
+            self.entries[-1]['enclosures'].append(attrsD)
             
     def _start_source(self, attrsD):
-        if self.initem:
-            self.items[-1]['source'] = attrsD
+        if self.inentry:
+            self.entries[-1]['source'] = attrsD
         self.push('source', 1)
 
     def _end_source(self):
@@ -949,27 +982,35 @@ class _FeedParserMixin:
 
     def _start_content(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': attrsD.get('mode', 'xml'),
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
                               'type': attrsD.get('type', 'text/plain'),
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
+        self.push('content', 1)
+
+    def _start_prodlink(self, attrsD):
+        self.incontent += 1
+        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
+                              'type': attrsD.get('type', 'text/html'),
+                              'language': attrsD.get('xml:lang', self.lang),
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
 
     def _start_body(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': 'xml',
+        self.contentparams = FeedParserDict({'mode': 'xml',
                               'type': 'application/xhtml+xml',
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
     _start_xhtml_body = _start_body
 
     def _start_content_encoded(self, attrsD):
         self.incontent += 1
-        self.contentparams = {'mode': 'escaped',
+        self.contentparams = FeedParserDict({'mode': 'escaped',
                               'type': 'text/html',
                               'language': attrsD.get('xml:lang', self.lang),
-                              'base': attrsD.get('xml:base', self.baseuri)}
+                              'base': attrsD.get('xml:base', self.baseuri)})
         self.push('content', 1)
     _start_fullitem = _start_content_encoded
 
@@ -983,13 +1024,14 @@ class _FeedParserMixin:
     _end_xhtml_body = _end_content
     _end_content_encoded = _end_content
     _end_fullitem = _end_content
+    _end_prodlink = _end_content
 
 if _XML_AVAILABLE:
-    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):#, xml.sax.handler.DTDHandler):
-        def __init__(self, baseuri):
+    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler, xml.sax.handler.EntityResolver):#, xml.sax.handler.DTDHandler):
+        def __init__(self, baseuri, encoding):
             if _debug: sys.stderr.write('trying StrictFeedParser\n')
             xml.sax.handler.ContentHandler.__init__(self)
-            _FeedParserMixin.__init__(self, baseuri)
+            _FeedParserMixin.__init__(self, baseuri, encoding)
             self.bozo = 0
             self.exc = None
         
@@ -998,8 +1040,11 @@ if _XML_AVAILABLE:
         
         def startElementNS(self, name, qname, attrs):
             namespace, localname = name
-            namespace = str(namespace)
-            prefix = self.namespaces.get(namespace, '')
+            namespace = str(namespace or '')
+            if namespace.find('backend.userland.com/rss') <> -1:
+                # match any backend.userland.com namespace
+                namespace = 'http://backend.userland.com/rss'
+            prefix = self.namespaces.get(namespace, 'unknown')
             if prefix:
                 localname = prefix + ':' + localname
             localname = str(localname).lower()
@@ -1036,28 +1081,35 @@ if _XML_AVAILABLE:
             localname = str(localname).lower()
             self.unknown_endtag(localname)
 
-        def fatalError(self, exc):
+        def error(self, exc):
             self.bozo = 1
             self.exc = exc
-        error = fatalError
-
-class _LooseFeedParser(_FeedParserMixin, sgmllib.SGMLParser):
-    def __init__(self, baseuri):
-        sgmllib.SGMLParser.__init__(self)
-        _FeedParserMixin.__init__(self, baseuri)
+            
+        def fatalError(self, exc):
+            self.error(exc)
+            raise exc
 
 class _BaseHTMLProcessor(sgmllib.SGMLParser):
     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
       'img', 'input', 'isindex', 'link', 'meta', 'param']
     
-    def __init__(self):
+    def __init__(self, encoding):
+        self.encoding = encoding
         sgmllib.SGMLParser.__init__(self)
         
     def reset(self):
-        # extend (called by sgmllib.SGMLParser.__init__)
         self.pieces = []
         sgmllib.SGMLParser.reset(self)
 
+    def feed(self, data):
+        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
+        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
+        data = data.replace('&#39;', "'")
+        data = data.replace('&#34;', '"')
+        if type(data) == types.UnicodeType:
+            data = data.encode(self.encoding)
+        sgmllib.SGMLParser.feed(self, data)
+
     def normalize_attrs(self, attrs):
         # utility method to be called by descendants
         attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
@@ -1068,6 +1120,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each start tag
         # attrs is a list of (attr, value) tuples
         # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
         strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
         if tag in self.elements_no_end_tag:
             self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
@@ -1094,6 +1147,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
         self.pieces.append(text)
         
     def handle_comment(self, text):
@@ -1113,9 +1167,37 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         # Reconstruct original DOCTYPE
         self.pieces.append("<!%(text)s>" % locals())
         
+    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+    def _scan_name(self, i, declstartpos):
+        rawdata = self.rawdata
+        if _debug: sys.stderr.write("i=%s, declstartpos=%s, rawdata=%s\n" % (i, declstartpos, rawdata))
+        n = len(rawdata)
+        if i == n:
+            return None, -1
+        m = self._new_declname_match(rawdata, i)
+        if m:
+            s = m.group()
+            name = s.strip()
+            if (i + len(s)) == n:
+                return None, -1  # end of buffer
+            return name.lower(), m.end()
+        else:
+            self.handle_data(rawdata)
+#            self.updatepos(declstartpos, i)
+            return None, -1
+
     def output(self):
         """Return processed HTML as a single string"""
-        return "".join(self.pieces)
+        if _debug:
+            for p in self.pieces:
+                sys.stderr.write(p)
+            sys.stderr.write('\n')
+        return "".join([str(p) for p in self.pieces])
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+    def __init__(self, baseuri, encoding):
+        sgmllib.SGMLParser.__init__(self)
+        _FeedParserMixin.__init__(self, baseuri, encoding)
 
 class _RelativeURIResolver(_BaseHTMLProcessor):
     relative_uris = [('a', 'href'),
@@ -1144,8 +1226,8 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
                      ('q', 'cite'),
                      ('script', 'src')]
 
-    def __init__(self, baseuri):
-        _BaseHTMLProcessor.__init__(self)
+    def __init__(self, baseuri, encoding):
+        _BaseHTMLProcessor.__init__(self, encoding)
         self.baseuri = baseuri
 
     def resolveURI(self, uri):
@@ -1156,8 +1238,10 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
         
-def _resolveRelativeURIs(htmlSource, baseURI):
-    p = _RelativeURIResolver(baseURI)
+def _resolveRelativeURIs(htmlSource, baseURI, encoding):
+    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
+    p = _RelativeURIResolver(baseURI, encoding)
+    if _debug: sys.stderr.write(repr(type(htmlSource)) + '\n')
     p.feed(htmlSource)
     return p.output()
 
@@ -1214,11 +1298,11 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
         if not self.unacceptablestack:
             _BaseHTMLProcessor.handle_data(self, text)
 
-def _sanitizeHTML(htmlSource):
-    p = _HTMLSanitizer()
+def _sanitizeHTML(htmlSource, encoding):
+    p = _HTMLSanitizer(encoding)
     p.feed(htmlSource)
     data = p.output()
-    if _mxtidy:
+    if _mxtidy and TIDY_MARKUP:
         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
         if data.count('<body'):
             data = data.split('<body', 1)[1]
@@ -1281,36 +1365,43 @@ def _open_resource(url_file_stream_or_string, etag=None, modified=None, agent=No
     if url_file_stream_or_string == "-":
         return sys.stdin
 
-    if not agent:
-        agent = USER_AGENT
+    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
+        if not agent:
+            agent = USER_AGENT
         
-    # try to open with urllib2 (to use optional headers)
-    request = urllib2.Request(url_file_stream_or_string)
-    if etag:
-        request.add_header("If-None-Match", etag)
-    if modified:
-        # format into an RFC 1123-compliant timestamp. We can't use
-        # time.strftime() since the %a and %b directives can be affected
-        # by the current locale, but RFC 2616 states that dates must be
-        # in English.
-        short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
-        months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
-        request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
-    request.add_header("User-Agent", agent)
-    if referrer:
-        request.add_header("Referer", referrer)
-    if gzip:
-        request.add_header("Accept-encoding", "gzip")
-    opener = urllib2.build_opener(_FeedURLHandler())
-    opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
-    try:
+        # try to open with urllib2 (to use optional headers)
+        request = urllib2.Request(url_file_stream_or_string)
+        request.add_header("User-Agent", agent)
+        if etag:
+            request.add_header("If-None-Match", etag)
+        if modified:
+            # format into an RFC 1123-compliant timestamp. We can't use
+            # time.strftime() since the %a and %b directives can be affected
+            # by the current locale, but RFC 2616 states that dates must be
+            # in English.
+            short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+            months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+            request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+        if referrer:
+            request.add_header("Referer", referrer)
+        if gzip:
+            request.add_header("Accept-encoding", "gzip")
+        opener = urllib2.build_opener(_FeedURLHandler())
+        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
         try:
-            return opener.open(request)
-        except:
-            # url_file_stream_or_string is not a valid URL, but it might be a valid filename
-            pass
-    finally:
-        opener.close() # JohnD
+            try:
+                return opener.open(request)
+#            except ValueError:
+#                # not a valid URL, but might be a valid filename
+#                pass
+#            except AssertionError:
+#                # under Python 2.1, non-URLs will fail with an AssertionError;
+#                # still might be a valid filename, so fall through
+#                pass
+            except:
+                return _StringIO('')
+        finally:
+            opener.close() # JohnD
     
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
@@ -1554,9 +1645,123 @@ def _parse_date(date):
     except:
         return None
 
+def _getCharacterEncoding(http_headers, xml_data):
+    """Get the character encoding of the XML document
+
+    http_headers is a dictionary
+    xml_data is a raw string (not Unicode)
+    
+    This is so much trickier than it sounds,
+    it's not even funny.  According to RFC 3023 ("XML Media Types"), if
+    the HTTP Content-Type is application/xml, application/*+xml,
+    application/xml-external-parsed-entity, or application/xml-dtd,
+    the encoding given in the charset parameter of the HTTP Content-Type
+    takes precedence over the encoding given in the XML prefix within the
+    document, and defaults to "utf-8" if neither are specified.  But, if
+    the HTTP Content-Type is text/xml, text/*+xml, or
+    text/xml-external-parsed-entity, the encoding given in the XML prefix
+    within the document is ALWAYS IGNORED and only the encoding given in
+    the charset parameter of the HTTP Content-Type header should be
+    respected, and it defaults to "us-ascii" if not specified.  If
+    Content-Type is unspecified (input was local file or non-HTTP source)
+    or unrecognized (server just got it totally wrong), then go by the
+    encoding given in the XML prefix of the document and default to
+    "utf-8" as per the XML specification.
+    """
+
+    def _parseHTTPContentType(content_type):
+        """takes HTTP Content-Type header and returns (content type, charset)
+
+        If no charset is specified, returns (content type, '')
+        If no content type is specified, returns ('', '')
+        Both return parameters are guaranteed to be lowercase strings
+        """
+        if not content_type:
+            return '', ''
+        content_type = content_type.strip()
+        paramstr = content_type.split(';')[1:]
+        if not paramstr:
+            return content_type, ''
+        content_type = content_type.split(';', 1)[0].strip().lower()
+        if not paramstr[0]:
+            # declaration like "text/xml;" (note ending semicolon)
+            # dunno if this is malformed but it sure was hard to track down
+            return content_type, ''
+        import string
+        params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
+        charset = params.get('charset')
+        if not charset:
+            return content_type, ''
+        if charset[0] in ('"', "'"):
+            charset = charset[1:]
+        if charset and charset[-1] in ('"', "'"):
+            charset = charset[:-1]
+        charset = charset.strip()
+        return content_type, charset
+
+    true_encoding = None
+    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
+    xml_encoding_match = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+    xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or ''
+    if (http_content_type == 'application/xml') or \
+       (http_content_type == 'application/xml-dtd') or \
+       (http_content_type == 'application/xml-external-parsed-entity') or \
+       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+        if http_encoding:
+            true_encoding = http_encoding
+        elif xml_encoding:
+            true_encoding = xml_encoding
+        else:
+            true_encoding = 'utf-8'
+    elif (http_content_type == 'text/xml') or \
+         (http_content_type == 'text/xml-external-parsed-entity') or \
+         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
+        if http_encoding:
+            true_encoding = http_encoding
+        else:
+            true_encoding = 'us-ascii'
+    else:
+        true_encoding = xml_encoding or 'utf-8'
+    return true_encoding, http_encoding, xml_encoding
+    
+def _changeEncodingDeclaration(data, encoding):
+    """Changes an XML data stream on the fly to specify a new encoding
+
+    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
+    encoding is a string recognized by encodings.aliases
+    """
+    if _debug: sys.stderr.write('entering _changeEncodingDeclaration\n')
+    if _debug: sys.stderr.write('proposed encoding: %s\n' % encoding)
+    #import cjkcodecs.aliases
+    #import japanese
+    data = unicode(data, encoding)
+    declmatch = re.compile(u'^<\?xml[^>]*?>')
+    newdecl = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
+    if declmatch.search(data):
+        data = declmatch.sub(newdecl, data)
+    else:
+        data = newdecl + u'\n' + data
+    return data.encode(encoding)
+
+def _stripDoctype(data):
+    """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+
+    rss_version may be "rss091n" or None
+    stripped_data is the same XML document, minus the DOCTYPE
+    """
+    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+    doctype_results = doctype_pattern.findall(data)
+    doctype = doctype_results and doctype_results[0] or ''
+    if doctype.lower().count('netscape'):
+        version = 'rss091n'
+    else:
+        version = None
+    data = doctype_pattern.sub('', data)
+    return version, data
+    
 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None):
     """Parse a feed from a URL, file, stream, or string"""
-    result = {}
+    result = FeedParserDict()
     f = _open_resource(url_file_stream_or_string, etag=etag, modified=modified, agent=agent, referrer=referrer)
     data = f.read()
     if hasattr(f, "headers"):
@@ -1579,35 +1784,71 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         result["status"] = f.status
     if hasattr(f, "headers"):
         result["headers"] = f.headers.dict
-    # get the xml encoding
-    xmlheaderRe = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>') # Andrei's version
-    match = xmlheaderRe.match(data)
-    if match:
-        result["encoding"] = match.groups()[0].lower()
     f.close()
-    result['channel'] = {}
-    result['items'] = {}
+    if result.get("status", 0) == 304:
+        result['feed'] = FeedParserDict()
+        result['entries'] = []
+        result['debug_message'] = "The feed has not changed since you last checked, so the server sent no data.  This is a feature, not a bug!"
+        return result
+    result['encoding'], http_encoding, xml_encoding = _getCharacterEncoding(result.get("headers", {}), data)
+    result['version'], data = _stripDoctype(data)
     baseuri = result.get('headers', {}).get('content-location', result.get('url'))
     # try true XML parser first
-    if _XML_AVAILABLE:
+    if not _XML_AVAILABLE:
+        if _debug: sys.stderr.write('no xml libraries available\n')
+    use_strict_parser = _XML_AVAILABLE
+    if use_strict_parser:
         if _debug: sys.stderr.write('using xml library\n')
         result['bozo'] = 0
-        feedparser = _StrictFeedParser(baseuri)
-        if re.search(r'<!DOCTYPE\s+?rss\s+?PUBLIC\s+?"-//Netscape Communications//DTD RSS 0.91//EN"\s+?"http://my.netscape.com/publish/formats/rss-0.91.dtd">', data):
-            feedparser.version = 'rss091n'
-        source = xml.sax.xmlreader.InputSource()
-        source.setByteStream(_StringIO(data))
-        saxparser = xml.sax.make_parser()#["drv_libxml2"])
+        feedparser = _StrictFeedParser(baseuri, result['encoding'])
+        if _debug and _debug_never_use_libxml2:
+            sys.stderr.write('not using libxml2 (even if available)\n')
+            additional_parsers = []
+        else:
+            additional_parsers = ["drv_libxml2"]
+        saxparser = xml.sax.make_parser(additional_parsers)
         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
         saxparser.setContentHandler(feedparser)
         saxparser.setErrorHandler(feedparser)
         try:
             saxparser.setDTDHandler(feedparser)
+        except xml.sax.SAXNotSupportedException:
+            # libxml2 driver does not support DTDHandler
+            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler (not a big deal)\n')
+        try:
             saxparser.setEntityResolver(feedparser)
         except xml.sax.SAXNotSupportedException:
-            if _debug: sys.stderr.write('using an xml library that does not support DTDHandler and EntityResolver (this is not a problem)\n')
-            # libxml2 driver does not currently support DTDHandler or EntityResolver
-            pass
+            # libxml2 driver does not support EntityResolver
+            if _debug: sys.stderr.write('using an xml library that does not support EntityResolver (not a big deal)\n')
+        encoding_set = (result['encoding'] == xml_encoding)
+        if not encoding_set:
+            bozo_exception = None
+            proposed_encodings = [result['encoding'], xml_encoding, 'utf-8', 'iso-8859-1', 'windows-1252']
+            tried_encodings = []
+            for proposed_encoding in proposed_encodings:
+                if proposed_encoding in tried_encodings: continue
+                tried_encodings.append(proposed_encoding)
+                try:
+                    data = _changeEncodingDeclaration(data, proposed_encoding)
+                except Exception, bozo_exception:
+                    if _debug: sys.stderr.write('character encoding is wrong\n')
+                else:
+                    if proposed_encoding != result['encoding']:
+                        try:
+                            raise CharacterEncodingOverride, "document declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding)
+                        except CharacterEncodingOverride, bozo_exception:
+                            result['bozo'] = 1
+                            result['bozo_exception'] = bozo_exception
+                    result['encoding'] = proposed_encoding
+                    encoding_set = 1
+                    break
+        if not encoding_set:
+            result['bozo'] = 1
+            result['bozo_exception'] = bozo_exception
+            use_strict_parser = 0
+    if use_strict_parser:
+        source = xml.sax.xmlreader.InputSource()
+        source.setByteStream(_StringIO(data))
         if hasattr(saxparser, '_ns_stack'):
             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
@@ -1615,45 +1856,29 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         try:
             saxparser.parse(source)
         except Exception, e:
-            # SAX parser is supposed to catch all of these and call feedparser.fatal_error,
-            # which captures them.  For some reason, some Unicode-related errors go
-            # uncaught on some combination of platform, XML library, Python version,
-            # and phase of the moon.
+            if _debug: sys.stderr.write('xml parsing failed\n')
             feedparser.bozo = 1
-            feedparser.bozo_exception = e
+            feedparser.bozo_exception = feedparser.exc or e
         if feedparser.bozo:
             # feed is not well-formed XML, fall back on regex-based parser
-            if _debug: sys.stderr.write('xml parsing failed, using regexes.  now you have two problems...\n')
             result['bozo'] = 1
-            result['bozo_exception'] = feedparser.exc
-            # munge short tags, e.g. <description/> becomes <description></description>
-            data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
-            feedparser = _LooseFeedParser(baseuri)
-            feedparser.feed(data)
-    else:
-        if _debug: sys.stderr.write('no xml libraries available, using regexes\n')
-        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
-        feedparser = _LooseFeedParser(baseuri)
+            result['bozo_exception'] = feedparser.bozo_exception
+            use_strict_parser = 0
+    if not use_strict_parser:
+        if _debug: sys.stderr.write('using regexes, now you have two problems\n')
+        feedparser = _LooseFeedParser(baseuri, result['encoding'])
         feedparser.feed(data)
-    result['channel'] = feedparser.channel
-    result['items'] = feedparser.items
-    result['version'] = feedparser.version
+    result['feed'] = feedparser.feeddata
+    result['entries'] = feedparser.entries
+    result['version'] = result['version'] or feedparser.version
     return result
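Taken together with the 304 branch above, a caller now drives parse() like
this (the URL and etag are placeholders; only keys visible in this diff are
used):

    import feedparser

    result = feedparser.parse('http://www.example.org/index.rss',
                              etag='"abc123"', modified=None)
    if result.get('status', 0) == 304:
        # Conditional GET hit: the server sent no data at all.
        print(result['debug_message'])
    else:
        print(result['version'])       # feed format, e.g. 'rss091n'
        print(result['encoding'])      # character encoding per RFC 3023
        print(len(result['entries']))  # formerly result['items']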
 
-_TEST_SUITE = ('http://www.pocketsoap.com/rssTests/rss1.0withModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml',
-              'http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml',
-              'http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml')
-
 if __name__ == '__main__':
-    if sys.argv[1:]:
-        urls = sys.argv[1:]
+    if not sys.argv[1:]:
+        print __doc__
+        sys.exit(0)
     else:
-        urls = _TEST_SUITE
+        urls = sys.argv[1:]
     from pprint import pprint
     for url in urls:
         print url
@@ -1664,14 +1889,6 @@ if __name__ == '__main__':
 
 #TODO
 #- image
-#- textinput/textInput
-#- comments
-#
-#encoding notes:
-#- RFC 3023
-#- content-type.startswith('text/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else "us-ascii"
-#- content-type.startswith('application/') and content-type.endswith('xml') --> look for charset="(.*?)" in HTTP content-type header, else look for encoding="(.*?)" in document, else "utf-8"
-#- parsing encoding: http://www.w3.org/TR/REC-xml#NT-EncodingDecl
 #
 #REVISION HISTORY
 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
@@ -1751,20 +1968,62 @@ if __name__ == '__main__':
 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
 #  blogspot.com sites); added _debug variable
 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0 - MAP - parse entire feed with real XML parser (if available); added several
-#  new supported namespaces; fixed bug tracking naked markup in description;
-#  added support for enclosure; added support for source; re-added support for
-#  cloud which got dropped somehow; added support for expirationDate; fixed
-#  xml:lang inheritance; fixed multiple bugs tracking xml:base URI, one for
-#  documents that don't define one explicitly and one for documents that define
-#  an outer and an inner xml:base that goes out of scope before the end of the
-#  document; fixed bug parsing multiple links at feed level; added feed type and
-#  version detection, results["version"] will be one of SUPPORTED_VERSIONS.keys()
-#  or empty string if unrecognized; added support for creativeCommons:license and
-#  cc:license; added support for full Atom content model in title, tagline, info,
-#  copyright, summary; fixed bug with gzip encoding (not always telling server
-#  we support it when we do); support Atom-style author element in author_detail
+#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+#  added several new supported namespaces; fixed bug tracking naked markup in
+#  description; added support for enclosure; added support for source; re-added
+#  support for cloud which got dropped somehow; added support for expirationDate
+#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+#  xml:base URI, one for documents that don't define one explicitly and one for
+#  documents that define an outer and an inner xml:base that goes out of scope
+#  before the end of the document
+#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result["version"]
+#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
+#  added support for creativeCommons:license and cc:license; added support for
+#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
+#  with gzip encoding (not always telling server we support it when we do)
+#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
 #  (dictionary of "name", "url", "email"); map author to author_detail if author
-#  contains name + email address; better handling of empty HTML tags (br, hr, img,
-#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />);
-#  fixed CDATA handling in non-wellformed feeds under Python 2.1
+#  contains name + email address
+#3.0b8 - 1/28/2004 - MAP - added support for contributor
+#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+#  support for summary
+#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+#  xml.util.iso8601
+#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+#  dangerous markup; fiddled with decodeEntities (not right); liberalized
+#  date parsing even further
+#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+#  added support to Atom 0.2 subtitle; added support for Atom content model
+#  in copyright; better sanitizing of dangerous HTML elements with end tags
+#  (script, frameset)
+#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
+#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+#  Python 2.1
+#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+#  fixed bug capturing author and contributor URL; fixed bug resolving relative
+#  links in author and contributor URL; fixed bug resolving relative links in
+#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
+#  namespace tests, and included them permanently in the test suite with his
+#  permission; fixed namespace handling under Python 2.1
+#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+#  use libxml2 (if available)
+#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
+#  workaround crash in PyXML/expat when encountering invalid entities
+#  (MarkMoraes); support for textinput/textInput
+#3.0b20 - 4/7/2004 - MAP - added CDF support
+#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'items' to 'entries' in
+#  results dict; changed results dict to allow getting values with results.key
+#  as well as results[key]; work around embedded illformed HTML with half
+#  a DOCTYPE; work around malformed Content-Type header; if character encoding
+#  is wrong, try several common ones before falling back to regexes (if this
+#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
+#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
+#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
+#  convert each value in results to Unicode (if possible), even if using
+#  regex-based parsing
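The 3.0b22 entry describes the API surface this commit now tracks: renamed
result keys plus attribute-style access. A minimal sketch of the behaviour
claimed there (placeholder URL; the equivalences are as stated in the
changelog entry, not tested here):

    import feedparser

    d = feedparser.parse('http://www.example.org/index.rss')
    # 'channel' became 'feed' and 'items' became 'entries' in 3.0b22.
    # The results dict also allows d.key as a synonym for d['key'].
    assert d.feed == d['feed']
    assert d.entries == d['entries']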
index edd69a5..a7e3373 100644 (file)
--- a/feedlist
+++ b/feedlist
@@ -4,10 +4,8 @@
 # name                 url
 #
 Davyd Madeley          http://www.livejournal.com/users/davyd/data/rss
-# This feed is broken for caching, also slow
 Ian McKellar           http://www.livejournal.com/users/loic/data/rss
-# Ian McKellar         http://ian.mckellar.org/wp-rss2.php
-# Grahame Bowland      http://www.livejournal.com/users/grahame/data/rss
+Grahame Bowland                http://www.advogato.org/person/gbowland/rss.xml
 Adam Wright            http://www.livejournal.com/users/hipikat/data/rss
 Adrian Chadd           http://blog.cacheboy.net/blogs/cacheboy/index.rdf
 Trent Lloyd            http://www.livejournal.com/users/lathiat/data/rss
@@ -25,3 +23,19 @@ Aaron Alderman               http://www.livejournal.com/users/palaceboy/data/rss
 Brad Wake              http://www.livejournal.com/users/thebmw/data/rss
 Paul Marinceu          http://www.advogato.org/person/elixxir/rss.xml
 David Thackaberry      http://www.livejournal.com/users/tryce/data/rss
+Rhys Bevilaqua         http://www.livejournal.com/users/norp/data/rss
+Colm Kiely             http://www.livejournal.com/users/col_ki/data/rss
+Ben Murrihy            http://www.livejournal.com/users/benmurrihy/data/rss
+Davis Griffin          http://www.livejournal.com/users/c_avdas/data/rss
+Ewan MacLeod           http://www.livejournal.com/users/drayke_/data/rss
+Rob Slaughter          http://www.livejournal.com/users/robthesilent/data/rss
+Alex Dawson            http://www.livejournal.com/users/theducks/data/rss
+Tracey Brown           http://www.livejournal.com/users/tazaria/data/rss
+Lionel Pryce           http://www.livejournal.com/users/jetblackvalias/data/rss
+Carlo Andreacchio      http://www.livejournal.com/users/the_icon_of_sin/data/rss
+Rohan Joyce            http://www.livejournal.com/users/booto/data/rss
+Greg Cresp             http://www.livejournal.com/users/the_riviera_kid/data/rss
+Adrian Woodley         http://www.diskworld.com.au/blog/adrian/index.rss
+Chris Harris           http://www.diskworld.com.au/blog/chris/index.rss
+Chris Grubb            http://www.livejournal.com/users/maelstrm/data/rss
+Michael Grubb          http://www.livejournal.com/users/grubbmr/data/rss
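The feedlist format is unchanged: '#' comment lines, then a human name and a
feed URL separated by whitespace. A reader sketch (the split rule, name =
everything before the last field, is an assumption, since names contain
spaces):

    def read_feedlist(path):
        feeds = []
        for line in open(path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # blank lines and comments
            parts = line.split()
            # Assumed rule: the URL is the final field, the name is the rest.
            feeds.append((' '.join(parts[:-1]), parts[-1]))
        return feeds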
index 4e23e5b..0de1c28 100755 (executable)
--- a/update-planet
+++ b/update-planet
@@ -38,8 +38,10 @@ for feed in feeds:
                blog.feedURL    = feed[1]
                blogs.append(blog)
                # check the old copy of the cache, vs the new copy
-               if not feed[2] or not feed[2].cache or not blog or not blog.cache or feed[2].cache != blog.cache:
+               if not feed[2] or not feed[2].cache or not blog.cache or feed[2].cache != blog.cache:
                        tainted = True
+               elif len(blog.items) > 0 and len(feed[2].items) > 0 and (blog.items[0].itemTitle != feed[2].items[0].itemTitle or blog.items[0].contents != feed[2].items[0].contents):
+                       tainted = True
                # write the cache back down to disk
                cache.storeBlog(blog)
        else:
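Restated outside the loop, the sharpened taint test reads like this (a
sketch only: old is feed[2], the cached copy; new is the freshly parsed
blog; field names are as used in the hunk):

    def is_tainted(old, new):
        # A missing or mismatched cache always forces regeneration.
        if not old or not old.cache or not new.cache or old.cache != new.cache:
            return True
        # Even with matching caches, regenerate if the newest item's
        # title or contents changed.
        if len(new.items) > 0 and len(old.items) > 0 and (
                new.items[0].itemTitle != old.items[0].itemTitle or
                new.items[0].contents != old.items[0].contents):
            return True
        return False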
