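# NOTE(review): the original capture began with the text
# "Spaces: Runtime error / Runtime error". It appears to be web-page
# residue rather than part of the module, so it is kept only as a comment.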
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by the MIT license.
__license__ = "MIT" | |
from html.entities import codepoint2name | |
from collections import defaultdict | |
import codecs | |
import re | |
import logging | |
import string | |
# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
#
# The first one that imports successfully wins; if none is installed,
# chardet_module stays None and autodetection is simply skipped.
chardet_module = None
try:
    # PyPI package: cchardet
    import cchardet as chardet_module
except ImportError:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet as chardet_module
    except ImportError:
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer as chardet_module
        except ImportError:
            # No chardet available.
            chardet_module = None


def chardet_dammit(s):
    """Guess the encoding of a bytestring using the detection library.

    :param s: The data to inspect.
    :return: The value stored under the 'encoding' key of the
        library's detect() result, or None when `s` is already a
        Unicode string or no detection library could be imported.
    """
    if not chardet_module or isinstance(s, str):
        return None
    return chardet_module.detect(s)['encoding']
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# Group 1 of each pattern captures the declared encoding name.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# Keyed first by the type of the markup being searched (bytes or str),
# then by the kind of document ('html' or 'xml').
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
from html.entities import html5 | |
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function is called once, while the class body executes.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                    and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity.  As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not start
    # something shaped like a decimal, hexadecimal or named entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(self, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with each match of
            CHARACTER_TO_HTML_ENTITY_RE replaced by "&name;".
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.
    """

    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.
        """
        # Copy the caller's list so that appending override_encodings
        # below never mutates it.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        # Exclusions are compared case-insensitively (see _usable).
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is not None, not excluded and
            not yet in `tried`; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks come first, so they must refuse data whose
        # next two bytes are NUL: FF FE 00 00 is the UTF-32LE mark, not
        # a UTF-16LE mark followed by U+0000. The comparison has to be
        # against a bytes literal; bytes never compare equal to str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the pattern set that matches the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for first, even in HTML; the
        # <meta> search only runs for HTML when that finds nothing.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Smart-quote substitution is only attempted when the proposed
    # codec is one of these (see _convert_from).
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, known_definite_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None
    ):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector treats None the same as an empty list for
        # both encoding arguments, so the None defaults pass through.
        self.detector = EncodingDetector(
            markup, known_definite_encodings, is_html, exclude_encodings,
            user_encodings, override_encodings
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regular expression match whose group 1 is a
            single byte that is a key of MS_CHARS.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, self.markup is replaced by the decoded string and
        self.original_encoding is set to the codec name.

        :param proposed: The name of a character encoding.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded markup, or None if the encoding is
            unknown, was already tried with these `errors`, or the
            conversion failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means "this
            # encoding didn't work", and the caller tries the next one.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: The error-handling scheme passed to the decoder.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        :param charset: The name of a character set.
        :return: The name of a codec.
        """
        # Try, in order: the alias table (or the name itself), the name
        # without hyphens, the name with hyphens turned into
        # underscores, and finally the bare name even if Python does
        # not recognise it.
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hexadecimal code point); plain
    # string values are substituted as-is.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty
                # code point here would produce the invalid "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # Every value in this table is a plain string; this entry used
        # to be the tuple ("'", 'acute').
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (non-breaking space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        0xff : b'\xc3\xbf',     # ÿ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding name is not one
            of the accepted spellings.
        """
        # NOTE(review): the message below mentions ISO-8859-1, but only
        # spellings of "windows-1252" pass this check.
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end. The bytes that follow are not inspected.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)