+def _getCharacterEncoding(http_headers, xml_data):
+ """Get the character encoding of the XML document
+
+ http_headers is a dictionary
+ xml_data is a raw string (not Unicode)
+
+ This is so much trickier than it sounds,
+ it's not even funny. According to RFC 3023 ("XML Media Types"), if
+ the HTTP Content-Type is application/xml, application/*+xml,
+ application/xml-external-parsed-entity, or application/xml-dtd,
+ the encoding given in the charset parameter of the HTTP Content-Type
+ takes precedence over the encoding given in the XML prefix within the
+ document, and defaults to "utf-8" if neither are specified. But, if
+ the HTTP Content-Type is text/xml, text/*+xml, or
+ text/xml-external-parsed-entity, the encoding given in the XML prefix
+ within the document is ALWAYS IGNORED and only the encoding given in
+ the charset parameter of the HTTP Content-Type header should be
+ respected, and it defaults to "us-ascii" if not specified. If
+ Content-Type is unspecified (input was local file or non-HTTP source)
+ or unrecognized (server just got it totally wrong), then go by the
+ encoding given in the XML prefix of the document and default to
+ "utf-8" as per the XML specification.
+ """
+
+ def _parseHTTPContentType(content_type):
+ """takes HTTP Content-Type header and returns (content type, charset)
+
+ If no charset is specified, returns (content type, '')
+ If no content type is specified, returns ('', '')
+ Both return parameters are guaranteed to be lowercase strings
+ """
+ if not content_type:
+ return '', ''
+ content_type = content_type.strip()
+ paramstr = content_type.split(';')[1:]
+ if not paramstr:
+ return content_type, ''
+ content_type = content_type.split(';', 1)[0].strip().lower()
+ if not paramstr[0]:
+ # declaration like "text/xml;" (note ending semicolon)
+ # dunno if this is malformed but it sure was hard to track down
+ return content_type, ''
+ import string
+ params = dict([map(string.lower, map(string.strip, p.strip().split('=', 1))) for p in paramstr])
+ charset = params.get('charset')
+ if not charset:
+ return content_type, ''
+ if charset[0] in ('"', "'"):
+ charset = charset[1:]
+ if charset and charset[-1] in ('"', "'"):
+ charset = charset[:-1]
+ charset = charset.strip()
+ return content_type, charset
+
+ true_encoding = None
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
+ xml_encoding_match = re.compile('<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+ xml_encoding = xml_encoding_match and xml_encoding_match.groups()[0].lower() or ''
+ if (http_content_type == 'application/xml') or \
+ (http_content_type == 'application/xml-dtd') or \
+ (http_content_type == 'application/xml-external-parsed-entity') or \
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+ if http_encoding:
+ true_encoding = http_encoding
+ elif xml_encoding:
+ true_encoding = xml_encoding
+ else:
+ true_encoding = 'utf-8'
+ elif (http_content_type == 'text/xml') or \
+ (http_content_type == 'text/xml-external-parsed-entity') or \
+ (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
+ if http_encoding:
+ true_encoding = http_encoding
+ else:
+ true_encoding = 'us-ascii'
+ else:
+ true_encoding = xml_encoding or 'utf-8'
+ return true_encoding, http_encoding, xml_encoding
+
def _changeEncodingDeclaration(data, encoding):
    """Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases

    Returns the re-encoded byte stream with its XML declaration rewritten
    (or prepended, if the document had none) to name %encoding.
    """
    if _debug:
        sys.stderr.write('entering _changeEncodingDeclaration\n')
        sys.stderr.write('proposed encoding: %s\n' % encoding)
    # work in Unicode so the declaration rewrite is encoding-agnostic
    decoded = unicode(data, encoding)
    declaration_re = re.compile(u'^<\?xml[^>]*?>')
    replacement = unicode("""<?xml version='1.0' encoding='%s'?>""" % encoding, encoding)
    if declaration_re.search(decoded):
        # replace the existing declaration at the start of the document
        decoded = declaration_re.sub(replacement, decoded)
    else:
        # no declaration present: prepend one
        decoded = replacement + u'\n' + decoded
    return decoded.encode(encoding)
+
+def _stripDoctype(data):
+ """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+
+ rss_version may be "rss091n" or None
+ stripped_data is the same XML document, minus the DOCTYPE
+ """
+ doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+ doctype_results = doctype_pattern.findall(data)
+ doctype = doctype_results and doctype_results[0] or ''
+ if doctype.lower().count('netscape'):
+ version = 'rss091n'
+ else:
+ version = None
+ data = doctype_pattern.sub('', data)
+ return version, data
+