xref: /OK3568_Linux_fs/yocto/poky/bitbake/lib/bs4/dammit.py (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun# -*- coding: utf-8 -*-
2*4882a593Smuzhiyun"""Beautiful Soup bonus library: Unicode, Dammit
3*4882a593Smuzhiyun
4*4882a593SmuzhiyunThis library converts a bytestream to Unicode through any means
5*4882a593Smuzhiyunnecessary. It is heavily based on code from Mark Pilgrim's Universal
6*4882a593SmuzhiyunFeed Parser. It works best on XML and HTML, but it does not rewrite the
7*4882a593SmuzhiyunXML or HTML to reflect a new encoding; that's the tree builder's job.
8*4882a593Smuzhiyun"""
9*4882a593Smuzhiyun__license__ = "MIT"
10*4882a593Smuzhiyun
11*4882a593Smuzhiyunimport codecs
12*4882a593Smuzhiyunfrom html.entities import codepoint2name
13*4882a593Smuzhiyunimport re
14*4882a593Smuzhiyunimport logging
15*4882a593Smuzhiyun
# Pick a character-encoding autodetection backend. Whichever branch runs,
# the module ends up with a chardet_dammit(s) function that returns the
# guessed encoding name, or None when no guess is available.
chardet_type = None
try:
    # Preferred: the fast C implementation (PyPI package: cchardet).
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation
        # (PyPI package: chardet, Debian package: python-chardet).
        import chardet
    except ImportError:
        # Neither library is installed, so there is nothing to guess with.
        def chardet_dammit(s):
            """Stand-in used when no detection library is available."""
            return None
    else:
        def chardet_dammit(s):
            """Return the encoding chardet guesses for the bytestring s."""
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Return the encoding cchardet guesses for the bytestring s."""
        return cchardet.detect(s)['encoding']
38*4882a593Smuzhiyun
# Both patterns are bytes patterns because they are run against markup
# that has not been decoded yet. Group 1 is the declared encoding name.

# Encoding named in an XML declaration at the very start of the document.
xml_encoding_re = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.I)

# Charset named inside an HTML <meta> tag.
html_meta_re = re.compile(
    rb'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.I)
43*4882a593Smuzhiyun
class EntitySubstitution(object):
    """Replace special characters with XML or HTML entity references."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables from codepoint2name.

        Called once while the class body executes. Returns a 3-tuple of
        (character -> entity name, entity name -> character, compiled
        regex matching any single character that has a named entity).
        """
        # Decoding direction: every named entity maps back to its
        # character, &quot; included.
        name_to_character = {
            name: chr(codepoint)
            for codepoint, name in codepoint2name.items()}
        # Encoding direction: the quotation mark (codepoint 34) is left
        # out. It only needs escaping inside an attribute value, and
        # that case is handled by quoted_attribute_value().
        character_to_name = {
            chr(codepoint): name
            for codepoint, name in codepoint2name.items()
            if codepoint != 34}
        matcher = re.compile("[%s]" % "".join(character_to_name))
        return character_to_name, name_to_character, matcher

    (CHARACTER_TO_HTML_ENTITY,
     HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not begin something
    # shaped like a decimal, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    # Any angle bracket or ampersand at all.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: turn the matched character into a named HTML
        entity reference."""
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: turn the matched XML special character into
        its predefined entity reference."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a string in quotes so it can be used as an attribute value.

        The quote character is chosen to avoid escaping where possible:

          Bob's Bar              -> "Bob's Bar"
          Welcome to "my bar"    -> 'Welcome to "my bar"'
          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        Only the last case (both kinds of quote present) modifies the
        text. The double quotes are the ones escaped because &quot; is
        valid in both HTML and XML, whereas the single quote has no
        entity name that is valid in both.
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Double quotes only: single quotes can delimit the value.
            return "'" + value + "'"
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Escape every angle bracket and ampersand in a string.

        :param value: The string to escape. < becomes &lt;, > becomes
          &gt; and every & becomes &amp;, even one that already starts
          an entity reference. Use
          substitute_xml_containing_entities() to leave existing
          references alone.

        :param make_quoted_attribute: If True, the escaped string is
          also quoted with quoted_attribute_value().
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Escape angle brackets and bare ampersands in a string.

        :param value: The string to escape. < becomes &lt;, > becomes
          &gt;, and an & becomes &amp; unless it begins something that
          looks like an entity reference.

        :param make_quoted_attribute: If True, the escaped string is
          also quoted with quoted_attribute_value().
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace characters that have a named HTML entity with that entity.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), this is not
        error recovery. A LATIN SMALL LETTER E WITH ACUTE is perfectly
        valid in a UTF-8 string, but writing it as "&eacute;" makes the
        output easier to read on an ASCII-only display.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """

    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Record the detection settings and strip any byte-order mark.

        :param markup: The document, as bytes (a str is passed through
          untouched by the byte-order-mark check).
        :param override_encodings: Encoding names to suggest before any
          other.
        :param is_html: If True, a <meta> charset declaration is also
          looked for when searching for a declared encoding.
        :param exclude_encodings: Encoding names that must never be
          suggested; compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        self.exclude_encodings = {
            name.lower() for name in (exclude_encodings or [])}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` should be suggested now.

        An encoding is usable when it is not None, not excluded, and not
        already in `tried`. A usable encoding is added to `tried` (in
        lower case) as a side effect, so it is only suggested once.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the
        encoding it implies.

        :param data: The document, as bytes or str.
        :return: A (data, encoding) tuple. `encoding` is None when no
          byte-order mark was found, in which case `data` is unchanged.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark (ff fe 00 00) begins with the UTF-16LE mark
        # (ff fe), so a two-byte mark only counts as UTF-16 when the next
        # two bytes are not both zero. The comparison must be against
        # bytes: a bytes object never equals a str, so comparing with
        # '\x00\x00' would always report UTF-16.
        if (len(data) >= 4 and data[:2] == b'\xfe\xff'
                and data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4 and data[:2] == b'\xff\xfe'
                and data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False,
                               search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as bytes.
        :param is_html: If True and no XML declaration is found, look
          for a <meta> charset declaration as well.
        :param search_entire_document: If False, only the start of the
          document is searched: 1024 bytes for the XML declaration, and
          the larger of 2048 bytes or 5% of the document for <meta>.
        :return: The declared encoding in lower case, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(
            markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(
                markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
315*4882a593Smuzhiyun
316*4882a593Smuzhiyunclass UnicodeDammit:
317*4882a593Smuzhiyun    """A class for detecting the encoding of a *ML document and
318*4882a593Smuzhiyun    converting it to a Unicode string. If the source encoding is
319*4882a593Smuzhiyun    windows-1252, can replace MS smart quotes with their HTML or XML
320*4882a593Smuzhiyun    equivalents."""
321*4882a593Smuzhiyun
    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Codec names for which _convert_from rewrites bytes 0x80-0x9f
    # (via _sub_ms_char) before decoding, when smart_quotes_to is set.
    # The names are compared against the lower-cased result of
    # find_codec, so entries here must be lower case.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]
334*4882a593Smuzhiyun
335*4882a593Smuzhiyun    def __init__(self, markup, override_encodings=[],
336*4882a593Smuzhiyun                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
337*4882a593Smuzhiyun        self.smart_quotes_to = smart_quotes_to
338*4882a593Smuzhiyun        self.tried_encodings = []
339*4882a593Smuzhiyun        self.contains_replacement_characters = False
340*4882a593Smuzhiyun        self.is_html = is_html
341*4882a593Smuzhiyun
342*4882a593Smuzhiyun        self.detector = EncodingDetector(
343*4882a593Smuzhiyun            markup, override_encodings, is_html, exclude_encodings)
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun        # Short-circuit if the data is in Unicode to begin with.
346*4882a593Smuzhiyun        if isinstance(markup, str) or markup == '':
347*4882a593Smuzhiyun            self.markup = markup
348*4882a593Smuzhiyun            self.unicode_markup = str(markup)
349*4882a593Smuzhiyun            self.original_encoding = None
350*4882a593Smuzhiyun            return
351*4882a593Smuzhiyun
352*4882a593Smuzhiyun        # The encoding detector may have stripped a byte-order mark.
353*4882a593Smuzhiyun        # Use the stripped markup from this point on.
354*4882a593Smuzhiyun        self.markup = self.detector.markup
355*4882a593Smuzhiyun
356*4882a593Smuzhiyun        u = None
357*4882a593Smuzhiyun        for encoding in self.detector.encodings:
358*4882a593Smuzhiyun            markup = self.detector.markup
359*4882a593Smuzhiyun            u = self._convert_from(encoding)
360*4882a593Smuzhiyun            if u is not None:
361*4882a593Smuzhiyun                break
362*4882a593Smuzhiyun
363*4882a593Smuzhiyun        if not u:
364*4882a593Smuzhiyun            # None of the encodings worked. As an absolute last resort,
365*4882a593Smuzhiyun            # try them again with character replacement.
366*4882a593Smuzhiyun
367*4882a593Smuzhiyun            for encoding in self.detector.encodings:
368*4882a593Smuzhiyun                if encoding != "ascii":
369*4882a593Smuzhiyun                    u = self._convert_from(encoding, "replace")
370*4882a593Smuzhiyun                if u is not None:
371*4882a593Smuzhiyun                    logging.warning(
372*4882a593Smuzhiyun                            "Some characters could not be decoded, and were "
373*4882a593Smuzhiyun                            "replaced with REPLACEMENT CHARACTER.")
374*4882a593Smuzhiyun                    self.contains_replacement_characters = True
375*4882a593Smuzhiyun                    break
376*4882a593Smuzhiyun
377*4882a593Smuzhiyun        # If none of that worked, we could at this point force it to
378*4882a593Smuzhiyun        # ASCII, but that would destroy so much data that I think
379*4882a593Smuzhiyun        # giving up is better.
380*4882a593Smuzhiyun        self.unicode_markup = u
381*4882a593Smuzhiyun        if not u:
382*4882a593Smuzhiyun            self.original_encoding = None
383*4882a593Smuzhiyun
384*4882a593Smuzhiyun    def _sub_ms_char(self, match):
385*4882a593Smuzhiyun        """Changes a MS smart quote character to an XML or HTML
386*4882a593Smuzhiyun        entity, or an ASCII character."""
387*4882a593Smuzhiyun        orig = match.group(1)
388*4882a593Smuzhiyun        if self.smart_quotes_to == 'ascii':
389*4882a593Smuzhiyun            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
390*4882a593Smuzhiyun        else:
391*4882a593Smuzhiyun            sub = self.MS_CHARS.get(orig)
392*4882a593Smuzhiyun            if type(sub) == tuple:
393*4882a593Smuzhiyun                if self.smart_quotes_to == 'xml':
394*4882a593Smuzhiyun                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
395*4882a593Smuzhiyun                else:
396*4882a593Smuzhiyun                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
397*4882a593Smuzhiyun            else:
398*4882a593Smuzhiyun                sub = sub.encode()
399*4882a593Smuzhiyun        return sub
400*4882a593Smuzhiyun
401*4882a593Smuzhiyun    def _convert_from(self, proposed, errors="strict"):
402*4882a593Smuzhiyun        proposed = self.find_codec(proposed)
403*4882a593Smuzhiyun        if not proposed or (proposed, errors) in self.tried_encodings:
404*4882a593Smuzhiyun            return None
405*4882a593Smuzhiyun        self.tried_encodings.append((proposed, errors))
406*4882a593Smuzhiyun        markup = self.markup
407*4882a593Smuzhiyun        # Convert smart quotes to HTML if coming from an encoding
408*4882a593Smuzhiyun        # that might have them.
409*4882a593Smuzhiyun        if (self.smart_quotes_to is not None
410*4882a593Smuzhiyun            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
411*4882a593Smuzhiyun            smart_quotes_re = b"([\x80-\x9f])"
412*4882a593Smuzhiyun            smart_quotes_compiled = re.compile(smart_quotes_re)
413*4882a593Smuzhiyun            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
414*4882a593Smuzhiyun
415*4882a593Smuzhiyun        try:
416*4882a593Smuzhiyun            #print "Trying to convert document to %s (errors=%s)" % (
417*4882a593Smuzhiyun            #    proposed, errors)
418*4882a593Smuzhiyun            u = self._to_unicode(markup, proposed, errors)
419*4882a593Smuzhiyun            self.markup = u
420*4882a593Smuzhiyun            self.original_encoding = proposed
421*4882a593Smuzhiyun        except Exception as e:
422*4882a593Smuzhiyun            #print "That didn't work!"
423*4882a593Smuzhiyun            #print e
424*4882a593Smuzhiyun            return None
425*4882a593Smuzhiyun        #print "Correct encoding: %s" % proposed
426*4882a593Smuzhiyun        return self.markup
427*4882a593Smuzhiyun
428*4882a593Smuzhiyun    def _to_unicode(self, data, encoding, errors="strict"):
429*4882a593Smuzhiyun        '''Given a string and its encoding, decodes the string into Unicode.
430*4882a593Smuzhiyun        %encoding is a string recognized by encodings.aliases'''
431*4882a593Smuzhiyun        return str(data, encoding, errors)
432*4882a593Smuzhiyun
433*4882a593Smuzhiyun    @property
434*4882a593Smuzhiyun    def declared_html_encoding(self):
435*4882a593Smuzhiyun        if not self.is_html:
436*4882a593Smuzhiyun            return None
437*4882a593Smuzhiyun        return self.detector.declared_encoding
438*4882a593Smuzhiyun
439*4882a593Smuzhiyun    def find_codec(self, charset):
440*4882a593Smuzhiyun        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
441*4882a593Smuzhiyun               or (charset and self._codec(charset.replace("-", "")))
442*4882a593Smuzhiyun               or (charset and self._codec(charset.replace("-", "_")))
443*4882a593Smuzhiyun               or (charset and charset.lower())
444*4882a593Smuzhiyun               or charset
445*4882a593Smuzhiyun                )
446*4882a593Smuzhiyun        if value:
447*4882a593Smuzhiyun            return value.lower()
448*4882a593Smuzhiyun        return None
449*4882a593Smuzhiyun
450*4882a593Smuzhiyun    def _codec(self, charset):
451*4882a593Smuzhiyun        if not charset:
452*4882a593Smuzhiyun            return charset
453*4882a593Smuzhiyun        codec = None
454*4882a593Smuzhiyun        try:
455*4882a593Smuzhiyun            codecs.lookup(charset)
456*4882a593Smuzhiyun            codec = charset
457*4882a593Smuzhiyun        except (LookupError, ValueError):
458*4882a593Smuzhiyun            pass
459*4882a593Smuzhiyun        return codec
460*4882a593Smuzhiyun
461*4882a593Smuzhiyun
462*4882a593Smuzhiyun    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
463*4882a593Smuzhiyun    MS_CHARS = {b'\x80': ('euro', '20AC'),
464*4882a593Smuzhiyun                b'\x81': ' ',
465*4882a593Smuzhiyun                b'\x82': ('sbquo', '201A'),
466*4882a593Smuzhiyun                b'\x83': ('fnof', '192'),
467*4882a593Smuzhiyun                b'\x84': ('bdquo', '201E'),
468*4882a593Smuzhiyun                b'\x85': ('hellip', '2026'),
469*4882a593Smuzhiyun                b'\x86': ('dagger', '2020'),
470*4882a593Smuzhiyun                b'\x87': ('Dagger', '2021'),
471*4882a593Smuzhiyun                b'\x88': ('circ', '2C6'),
472*4882a593Smuzhiyun                b'\x89': ('permil', '2030'),
473*4882a593Smuzhiyun                b'\x8A': ('Scaron', '160'),
474*4882a593Smuzhiyun                b'\x8B': ('lsaquo', '2039'),
475*4882a593Smuzhiyun                b'\x8C': ('OElig', '152'),
476*4882a593Smuzhiyun                b'\x8D': '?',
477*4882a593Smuzhiyun                b'\x8E': ('#x17D', '17D'),
478*4882a593Smuzhiyun                b'\x8F': '?',
479*4882a593Smuzhiyun                b'\x90': '?',
480*4882a593Smuzhiyun                b'\x91': ('lsquo', '2018'),
481*4882a593Smuzhiyun                b'\x92': ('rsquo', '2019'),
482*4882a593Smuzhiyun                b'\x93': ('ldquo', '201C'),
483*4882a593Smuzhiyun                b'\x94': ('rdquo', '201D'),
484*4882a593Smuzhiyun                b'\x95': ('bull', '2022'),
485*4882a593Smuzhiyun                b'\x96': ('ndash', '2013'),
486*4882a593Smuzhiyun                b'\x97': ('mdash', '2014'),
487*4882a593Smuzhiyun                b'\x98': ('tilde', '2DC'),
488*4882a593Smuzhiyun                b'\x99': ('trade', '2122'),
489*4882a593Smuzhiyun                b'\x9a': ('scaron', '161'),
490*4882a593Smuzhiyun                b'\x9b': ('rsaquo', '203A'),
491*4882a593Smuzhiyun                b'\x9c': ('oelig', '153'),
492*4882a593Smuzhiyun                b'\x9d': '?',
493*4882a593Smuzhiyun                b'\x9e': ('#x17E', '17E'),
494*4882a593Smuzhiyun                b'\x9f': ('Yuml', ''),}
495*4882a593Smuzhiyun
496*4882a593Smuzhiyun    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
497*4882a593Smuzhiyun    # horrors like stripping diacritical marks to turn á into a, but also
498*4882a593Smuzhiyun    # contains non-horrors like turning “ into ".
499*4882a593Smuzhiyun    MS_CHARS_TO_ASCII = {
500*4882a593Smuzhiyun        b'\x80' : 'EUR',
501*4882a593Smuzhiyun        b'\x81' : ' ',
502*4882a593Smuzhiyun        b'\x82' : ',',
503*4882a593Smuzhiyun        b'\x83' : 'f',
504*4882a593Smuzhiyun        b'\x84' : ',,',
505*4882a593Smuzhiyun        b'\x85' : '...',
506*4882a593Smuzhiyun        b'\x86' : '+',
507*4882a593Smuzhiyun        b'\x87' : '++',
508*4882a593Smuzhiyun        b'\x88' : '^',
509*4882a593Smuzhiyun        b'\x89' : '%',
510*4882a593Smuzhiyun        b'\x8a' : 'S',
511*4882a593Smuzhiyun        b'\x8b' : '<',
512*4882a593Smuzhiyun        b'\x8c' : 'OE',
513*4882a593Smuzhiyun        b'\x8d' : '?',
514*4882a593Smuzhiyun        b'\x8e' : 'Z',
515*4882a593Smuzhiyun        b'\x8f' : '?',
516*4882a593Smuzhiyun        b'\x90' : '?',
517*4882a593Smuzhiyun        b'\x91' : "'",
518*4882a593Smuzhiyun        b'\x92' : "'",
519*4882a593Smuzhiyun        b'\x93' : '"',
520*4882a593Smuzhiyun        b'\x94' : '"',
521*4882a593Smuzhiyun        b'\x95' : '*',
522*4882a593Smuzhiyun        b'\x96' : '-',
523*4882a593Smuzhiyun        b'\x97' : '--',
524*4882a593Smuzhiyun        b'\x98' : '~',
525*4882a593Smuzhiyun        b'\x99' : '(TM)',
526*4882a593Smuzhiyun        b'\x9a' : 's',
527*4882a593Smuzhiyun        b'\x9b' : '>',
528*4882a593Smuzhiyun        b'\x9c' : 'oe',
529*4882a593Smuzhiyun        b'\x9d' : '?',
530*4882a593Smuzhiyun        b'\x9e' : 'z',
531*4882a593Smuzhiyun        b'\x9f' : 'Y',
532*4882a593Smuzhiyun        b'\xa0' : ' ',
533*4882a593Smuzhiyun        b'\xa1' : '!',
534*4882a593Smuzhiyun        b'\xa2' : 'c',
535*4882a593Smuzhiyun        b'\xa3' : 'GBP',
536*4882a593Smuzhiyun        b'\xa4' : '$', #This approximation is especially parochial--this is the
537*4882a593Smuzhiyun                       #generic currency symbol.
538*4882a593Smuzhiyun        b'\xa5' : 'YEN',
539*4882a593Smuzhiyun        b'\xa6' : '|',
540*4882a593Smuzhiyun        b'\xa7' : 'S',
541*4882a593Smuzhiyun        b'\xa8' : '..',
542*4882a593Smuzhiyun        b'\xa9' : '',
543*4882a593Smuzhiyun        b'\xaa' : '(th)',
544*4882a593Smuzhiyun        b'\xab' : '<<',
545*4882a593Smuzhiyun        b'\xac' : '!',
546*4882a593Smuzhiyun        b'\xad' : ' ',
547*4882a593Smuzhiyun        b'\xae' : '(R)',
548*4882a593Smuzhiyun        b'\xaf' : '-',
549*4882a593Smuzhiyun        b'\xb0' : 'o',
550*4882a593Smuzhiyun        b'\xb1' : '+-',
551*4882a593Smuzhiyun        b'\xb2' : '2',
552*4882a593Smuzhiyun        b'\xb3' : '3',
553*4882a593Smuzhiyun        b'\xb4' : ("'", 'acute'),
554*4882a593Smuzhiyun        b'\xb5' : 'u',
555*4882a593Smuzhiyun        b'\xb6' : 'P',
556*4882a593Smuzhiyun        b'\xb7' : '*',
557*4882a593Smuzhiyun        b'\xb8' : ',',
558*4882a593Smuzhiyun        b'\xb9' : '1',
559*4882a593Smuzhiyun        b'\xba' : '(th)',
560*4882a593Smuzhiyun        b'\xbb' : '>>',
561*4882a593Smuzhiyun        b'\xbc' : '1/4',
562*4882a593Smuzhiyun        b'\xbd' : '1/2',
563*4882a593Smuzhiyun        b'\xbe' : '3/4',
564*4882a593Smuzhiyun        b'\xbf' : '?',
565*4882a593Smuzhiyun        b'\xc0' : 'A',
566*4882a593Smuzhiyun        b'\xc1' : 'A',
567*4882a593Smuzhiyun        b'\xc2' : 'A',
568*4882a593Smuzhiyun        b'\xc3' : 'A',
569*4882a593Smuzhiyun        b'\xc4' : 'A',
570*4882a593Smuzhiyun        b'\xc5' : 'A',
571*4882a593Smuzhiyun        b'\xc6' : 'AE',
572*4882a593Smuzhiyun        b'\xc7' : 'C',
573*4882a593Smuzhiyun        b'\xc8' : 'E',
574*4882a593Smuzhiyun        b'\xc9' : 'E',
575*4882a593Smuzhiyun        b'\xca' : 'E',
576*4882a593Smuzhiyun        b'\xcb' : 'E',
577*4882a593Smuzhiyun        b'\xcc' : 'I',
578*4882a593Smuzhiyun        b'\xcd' : 'I',
579*4882a593Smuzhiyun        b'\xce' : 'I',
580*4882a593Smuzhiyun        b'\xcf' : 'I',
581*4882a593Smuzhiyun        b'\xd0' : 'D',
582*4882a593Smuzhiyun        b'\xd1' : 'N',
583*4882a593Smuzhiyun        b'\xd2' : 'O',
584*4882a593Smuzhiyun        b'\xd3' : 'O',
585*4882a593Smuzhiyun        b'\xd4' : 'O',
586*4882a593Smuzhiyun        b'\xd5' : 'O',
587*4882a593Smuzhiyun        b'\xd6' : 'O',
588*4882a593Smuzhiyun        b'\xd7' : '*',
589*4882a593Smuzhiyun        b'\xd8' : 'O',
590*4882a593Smuzhiyun        b'\xd9' : 'U',
591*4882a593Smuzhiyun        b'\xda' : 'U',
592*4882a593Smuzhiyun        b'\xdb' : 'U',
593*4882a593Smuzhiyun        b'\xdc' : 'U',
594*4882a593Smuzhiyun        b'\xdd' : 'Y',
595*4882a593Smuzhiyun        b'\xde' : 'b',
596*4882a593Smuzhiyun        b'\xdf' : 'B',
597*4882a593Smuzhiyun        b'\xe0' : 'a',
598*4882a593Smuzhiyun        b'\xe1' : 'a',
599*4882a593Smuzhiyun        b'\xe2' : 'a',
600*4882a593Smuzhiyun        b'\xe3' : 'a',
601*4882a593Smuzhiyun        b'\xe4' : 'a',
602*4882a593Smuzhiyun        b'\xe5' : 'a',
603*4882a593Smuzhiyun        b'\xe6' : 'ae',
604*4882a593Smuzhiyun        b'\xe7' : 'c',
605*4882a593Smuzhiyun        b'\xe8' : 'e',
606*4882a593Smuzhiyun        b'\xe9' : 'e',
607*4882a593Smuzhiyun        b'\xea' : 'e',
608*4882a593Smuzhiyun        b'\xeb' : 'e',
609*4882a593Smuzhiyun        b'\xec' : 'i',
610*4882a593Smuzhiyun        b'\xed' : 'i',
611*4882a593Smuzhiyun        b'\xee' : 'i',
612*4882a593Smuzhiyun        b'\xef' : 'i',
613*4882a593Smuzhiyun        b'\xf0' : 'o',
614*4882a593Smuzhiyun        b'\xf1' : 'n',
615*4882a593Smuzhiyun        b'\xf2' : 'o',
616*4882a593Smuzhiyun        b'\xf3' : 'o',
617*4882a593Smuzhiyun        b'\xf4' : 'o',
618*4882a593Smuzhiyun        b'\xf5' : 'o',
619*4882a593Smuzhiyun        b'\xf6' : 'o',
620*4882a593Smuzhiyun        b'\xf7' : '/',
621*4882a593Smuzhiyun        b'\xf8' : 'o',
622*4882a593Smuzhiyun        b'\xf9' : 'u',
623*4882a593Smuzhiyun        b'\xfa' : 'u',
624*4882a593Smuzhiyun        b'\xfb' : 'u',
625*4882a593Smuzhiyun        b'\xfc' : 'u',
626*4882a593Smuzhiyun        b'\xfd' : 'y',
627*4882a593Smuzhiyun        b'\xfe' : 'b',
628*4882a593Smuzhiyun        b'\xff' : 'y',
629*4882a593Smuzhiyun        }
630*4882a593Smuzhiyun
631*4882a593Smuzhiyun    # A map used when removing rogue Windows-1252/ISO-8859-1
632*4882a593Smuzhiyun    # characters in otherwise UTF-8 documents.
633*4882a593Smuzhiyun    #
634*4882a593Smuzhiyun    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
635*4882a593Smuzhiyun    # Windows-1252.
636*4882a593Smuzhiyun    WINDOWS_1252_TO_UTF8 = {
637*4882a593Smuzhiyun        0x80 : b'\xe2\x82\xac', # €
638*4882a593Smuzhiyun        0x82 : b'\xe2\x80\x9a', # ‚
639*4882a593Smuzhiyun        0x83 : b'\xc6\x92',     # ƒ
640*4882a593Smuzhiyun        0x84 : b'\xe2\x80\x9e', # „
641*4882a593Smuzhiyun        0x85 : b'\xe2\x80\xa6', # …
642*4882a593Smuzhiyun        0x86 : b'\xe2\x80\xa0', # †
643*4882a593Smuzhiyun        0x87 : b'\xe2\x80\xa1', # ‡
644*4882a593Smuzhiyun        0x88 : b'\xcb\x86',     # ˆ
645*4882a593Smuzhiyun        0x89 : b'\xe2\x80\xb0', # ‰
646*4882a593Smuzhiyun        0x8a : b'\xc5\xa0',     # Š
647*4882a593Smuzhiyun        0x8b : b'\xe2\x80\xb9', # ‹
648*4882a593Smuzhiyun        0x8c : b'\xc5\x92',     # Œ
649*4882a593Smuzhiyun        0x8e : b'\xc5\xbd',     # Ž
650*4882a593Smuzhiyun        0x91 : b'\xe2\x80\x98', # ‘
651*4882a593Smuzhiyun        0x92 : b'\xe2\x80\x99', # ’
652*4882a593Smuzhiyun        0x93 : b'\xe2\x80\x9c', # “
653*4882a593Smuzhiyun        0x94 : b'\xe2\x80\x9d', # ”
654*4882a593Smuzhiyun        0x95 : b'\xe2\x80\xa2', # •
655*4882a593Smuzhiyun        0x96 : b'\xe2\x80\x93', # –
656*4882a593Smuzhiyun        0x97 : b'\xe2\x80\x94', # —
657*4882a593Smuzhiyun        0x98 : b'\xcb\x9c',     # ˜
658*4882a593Smuzhiyun        0x99 : b'\xe2\x84\xa2', # ™
659*4882a593Smuzhiyun        0x9a : b'\xc5\xa1',     # š
660*4882a593Smuzhiyun        0x9b : b'\xe2\x80\xba', # ›
661*4882a593Smuzhiyun        0x9c : b'\xc5\x93',     # œ
662*4882a593Smuzhiyun        0x9e : b'\xc5\xbe',     # ž
663*4882a593Smuzhiyun        0x9f : b'\xc5\xb8',     # Ÿ
664*4882a593Smuzhiyun        0xa0 : b'\xc2\xa0',     #  
665*4882a593Smuzhiyun        0xa1 : b'\xc2\xa1',     # ¡
666*4882a593Smuzhiyun        0xa2 : b'\xc2\xa2',     # ¢
667*4882a593Smuzhiyun        0xa3 : b'\xc2\xa3',     # £
668*4882a593Smuzhiyun        0xa4 : b'\xc2\xa4',     # ¤
669*4882a593Smuzhiyun        0xa5 : b'\xc2\xa5',     # ¥
670*4882a593Smuzhiyun        0xa6 : b'\xc2\xa6',     # ¦
671*4882a593Smuzhiyun        0xa7 : b'\xc2\xa7',     # §
672*4882a593Smuzhiyun        0xa8 : b'\xc2\xa8',     # ¨
673*4882a593Smuzhiyun        0xa9 : b'\xc2\xa9',     # ©
674*4882a593Smuzhiyun        0xaa : b'\xc2\xaa',     # ª
675*4882a593Smuzhiyun        0xab : b'\xc2\xab',     # «
676*4882a593Smuzhiyun        0xac : b'\xc2\xac',     # ¬
677*4882a593Smuzhiyun        0xad : b'\xc2\xad',     # ­
678*4882a593Smuzhiyun        0xae : b'\xc2\xae',     # ®
679*4882a593Smuzhiyun        0xaf : b'\xc2\xaf',     # ¯
680*4882a593Smuzhiyun        0xb0 : b'\xc2\xb0',     # °
681*4882a593Smuzhiyun        0xb1 : b'\xc2\xb1',     # ±
682*4882a593Smuzhiyun        0xb2 : b'\xc2\xb2',     # ²
683*4882a593Smuzhiyun        0xb3 : b'\xc2\xb3',     # ³
684*4882a593Smuzhiyun        0xb4 : b'\xc2\xb4',     # ´
685*4882a593Smuzhiyun        0xb5 : b'\xc2\xb5',     # µ
686*4882a593Smuzhiyun        0xb6 : b'\xc2\xb6',     # ¶
687*4882a593Smuzhiyun        0xb7 : b'\xc2\xb7',     # ·
688*4882a593Smuzhiyun        0xb8 : b'\xc2\xb8',     # ¸
689*4882a593Smuzhiyun        0xb9 : b'\xc2\xb9',     # ¹
690*4882a593Smuzhiyun        0xba : b'\xc2\xba',     # º
691*4882a593Smuzhiyun        0xbb : b'\xc2\xbb',     # »
692*4882a593Smuzhiyun        0xbc : b'\xc2\xbc',     # ¼
693*4882a593Smuzhiyun        0xbd : b'\xc2\xbd',     # ½
694*4882a593Smuzhiyun        0xbe : b'\xc2\xbe',     # ¾
695*4882a593Smuzhiyun        0xbf : b'\xc2\xbf',     # ¿
696*4882a593Smuzhiyun        0xc0 : b'\xc3\x80',     # À
697*4882a593Smuzhiyun        0xc1 : b'\xc3\x81',     # Á
698*4882a593Smuzhiyun        0xc2 : b'\xc3\x82',     # Â
699*4882a593Smuzhiyun        0xc3 : b'\xc3\x83',     # Ã
700*4882a593Smuzhiyun        0xc4 : b'\xc3\x84',     # Ä
701*4882a593Smuzhiyun        0xc5 : b'\xc3\x85',     # Å
702*4882a593Smuzhiyun        0xc6 : b'\xc3\x86',     # Æ
703*4882a593Smuzhiyun        0xc7 : b'\xc3\x87',     # Ç
704*4882a593Smuzhiyun        0xc8 : b'\xc3\x88',     # È
705*4882a593Smuzhiyun        0xc9 : b'\xc3\x89',     # É
706*4882a593Smuzhiyun        0xca : b'\xc3\x8a',     # Ê
707*4882a593Smuzhiyun        0xcb : b'\xc3\x8b',     # Ë
708*4882a593Smuzhiyun        0xcc : b'\xc3\x8c',     # Ì
709*4882a593Smuzhiyun        0xcd : b'\xc3\x8d',     # Í
710*4882a593Smuzhiyun        0xce : b'\xc3\x8e',     # Î
711*4882a593Smuzhiyun        0xcf : b'\xc3\x8f',     # Ï
712*4882a593Smuzhiyun        0xd0 : b'\xc3\x90',     # Ð
713*4882a593Smuzhiyun        0xd1 : b'\xc3\x91',     # Ñ
714*4882a593Smuzhiyun        0xd2 : b'\xc3\x92',     # Ò
715*4882a593Smuzhiyun        0xd3 : b'\xc3\x93',     # Ó
716*4882a593Smuzhiyun        0xd4 : b'\xc3\x94',     # Ô
717*4882a593Smuzhiyun        0xd5 : b'\xc3\x95',     # Õ
718*4882a593Smuzhiyun        0xd6 : b'\xc3\x96',     # Ö
719*4882a593Smuzhiyun        0xd7 : b'\xc3\x97',     # ×
720*4882a593Smuzhiyun        0xd8 : b'\xc3\x98',     # Ø
721*4882a593Smuzhiyun        0xd9 : b'\xc3\x99',     # Ù
722*4882a593Smuzhiyun        0xda : b'\xc3\x9a',     # Ú
723*4882a593Smuzhiyun        0xdb : b'\xc3\x9b',     # Û
724*4882a593Smuzhiyun        0xdc : b'\xc3\x9c',     # Ü
725*4882a593Smuzhiyun        0xdd : b'\xc3\x9d',     # Ý
726*4882a593Smuzhiyun        0xde : b'\xc3\x9e',     # Þ
727*4882a593Smuzhiyun        0xdf : b'\xc3\x9f',     # ß
728*4882a593Smuzhiyun        0xe0 : b'\xc3\xa0',     # à
729*4882a593Smuzhiyun        0xe1 : b'\xa1',     # á
730*4882a593Smuzhiyun        0xe2 : b'\xc3\xa2',     # â
731*4882a593Smuzhiyun        0xe3 : b'\xc3\xa3',     # ã
732*4882a593Smuzhiyun        0xe4 : b'\xc3\xa4',     # ä
733*4882a593Smuzhiyun        0xe5 : b'\xc3\xa5',     # å
734*4882a593Smuzhiyun        0xe6 : b'\xc3\xa6',     # æ
735*4882a593Smuzhiyun        0xe7 : b'\xc3\xa7',     # ç
736*4882a593Smuzhiyun        0xe8 : b'\xc3\xa8',     # è
737*4882a593Smuzhiyun        0xe9 : b'\xc3\xa9',     # é
738*4882a593Smuzhiyun        0xea : b'\xc3\xaa',     # ê
739*4882a593Smuzhiyun        0xeb : b'\xc3\xab',     # ë
740*4882a593Smuzhiyun        0xec : b'\xc3\xac',     # ì
741*4882a593Smuzhiyun        0xed : b'\xc3\xad',     # í
742*4882a593Smuzhiyun        0xee : b'\xc3\xae',     # î
743*4882a593Smuzhiyun        0xef : b'\xc3\xaf',     # ï
744*4882a593Smuzhiyun        0xf0 : b'\xc3\xb0',     # ð
745*4882a593Smuzhiyun        0xf1 : b'\xc3\xb1',     # ñ
746*4882a593Smuzhiyun        0xf2 : b'\xc3\xb2',     # ò
747*4882a593Smuzhiyun        0xf3 : b'\xc3\xb3',     # ó
748*4882a593Smuzhiyun        0xf4 : b'\xc3\xb4',     # ô
749*4882a593Smuzhiyun        0xf5 : b'\xc3\xb5',     # õ
750*4882a593Smuzhiyun        0xf6 : b'\xc3\xb6',     # ö
751*4882a593Smuzhiyun        0xf7 : b'\xc3\xb7',     # ÷
752*4882a593Smuzhiyun        0xf8 : b'\xc3\xb8',     # ø
753*4882a593Smuzhiyun        0xf9 : b'\xc3\xb9',     # ù
754*4882a593Smuzhiyun        0xfa : b'\xc3\xba',     # ú
755*4882a593Smuzhiyun        0xfb : b'\xc3\xbb',     # û
756*4882a593Smuzhiyun        0xfc : b'\xc3\xbc',     # ü
757*4882a593Smuzhiyun        0xfd : b'\xc3\xbd',     # ý
758*4882a593Smuzhiyun        0xfe : b'\xc3\xbe',     # þ
759*4882a593Smuzhiyun        }
760*4882a593Smuzhiyun
761*4882a593Smuzhiyun    MULTIBYTE_MARKERS_AND_SIZES = [
762*4882a593Smuzhiyun        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
763*4882a593Smuzhiyun        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
764*4882a593Smuzhiyun        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
765*4882a593Smuzhiyun        ]
766*4882a593Smuzhiyun
767*4882a593Smuzhiyun    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
768*4882a593Smuzhiyun    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
769*4882a593Smuzhiyun
770*4882a593Smuzhiyun    @classmethod
771*4882a593Smuzhiyun    def detwingle(cls, in_bytes, main_encoding="utf8",
772*4882a593Smuzhiyun                  embedded_encoding="windows-1252"):
773*4882a593Smuzhiyun        """Fix characters from one encoding embedded in some other encoding.
774*4882a593Smuzhiyun
775*4882a593Smuzhiyun        Currently the only situation supported is Windows-1252 (or its
776*4882a593Smuzhiyun        subset ISO-8859-1), embedded in UTF-8.
777*4882a593Smuzhiyun
778*4882a593Smuzhiyun        The input must be a bytestring. If you've already converted
779*4882a593Smuzhiyun        the document to Unicode, you're too late.
780*4882a593Smuzhiyun
781*4882a593Smuzhiyun        The output is a bytestring in which `embedded_encoding`
782*4882a593Smuzhiyun        characters have been converted to their `main_encoding`
783*4882a593Smuzhiyun        equivalents.
784*4882a593Smuzhiyun        """
785*4882a593Smuzhiyun        if embedded_encoding.replace('_', '-').lower() not in (
786*4882a593Smuzhiyun            'windows-1252', 'windows_1252'):
787*4882a593Smuzhiyun            raise NotImplementedError(
788*4882a593Smuzhiyun                "Windows-1252 and ISO-8859-1 are the only currently supported "
789*4882a593Smuzhiyun                "embedded encodings.")
790*4882a593Smuzhiyun
791*4882a593Smuzhiyun        if main_encoding.lower() not in ('utf8', 'utf-8'):
792*4882a593Smuzhiyun            raise NotImplementedError(
793*4882a593Smuzhiyun                "UTF-8 is the only currently supported main encoding.")
794*4882a593Smuzhiyun
795*4882a593Smuzhiyun        byte_chunks = []
796*4882a593Smuzhiyun
797*4882a593Smuzhiyun        chunk_start = 0
798*4882a593Smuzhiyun        pos = 0
799*4882a593Smuzhiyun        while pos < len(in_bytes):
800*4882a593Smuzhiyun            byte = in_bytes[pos]
801*4882a593Smuzhiyun            if not isinstance(byte, int):
802*4882a593Smuzhiyun                # Python 2.x
803*4882a593Smuzhiyun                byte = ord(byte)
804*4882a593Smuzhiyun            if (byte >= cls.FIRST_MULTIBYTE_MARKER
805*4882a593Smuzhiyun                and byte <= cls.LAST_MULTIBYTE_MARKER):
806*4882a593Smuzhiyun                # This is the start of a UTF-8 multibyte character. Skip
807*4882a593Smuzhiyun                # to the end.
808*4882a593Smuzhiyun                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
809*4882a593Smuzhiyun                    if byte >= start and byte <= end:
810*4882a593Smuzhiyun                        pos += size
811*4882a593Smuzhiyun                        break
812*4882a593Smuzhiyun            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
813*4882a593Smuzhiyun                # We found a Windows-1252 character!
814*4882a593Smuzhiyun                # Save the string up to this point as a chunk.
815*4882a593Smuzhiyun                byte_chunks.append(in_bytes[chunk_start:pos])
816*4882a593Smuzhiyun
817*4882a593Smuzhiyun                # Now translate the Windows-1252 character into UTF-8
818*4882a593Smuzhiyun                # and add it as another, one-byte chunk.
819*4882a593Smuzhiyun                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
820*4882a593Smuzhiyun                pos += 1
821*4882a593Smuzhiyun                chunk_start = pos
822*4882a593Smuzhiyun            else:
823*4882a593Smuzhiyun                # Go on to the next character.
824*4882a593Smuzhiyun                pos += 1
825*4882a593Smuzhiyun        if chunk_start == 0:
826*4882a593Smuzhiyun            # The string is unchanged.
827*4882a593Smuzhiyun            return in_bytes
828*4882a593Smuzhiyun        else:
829*4882a593Smuzhiyun            # Store the final chunk.
830*4882a593Smuzhiyun            byte_chunks.append(in_bytes[chunk_start:])
831*4882a593Smuzhiyun        return b''.join(byte_chunks)
832*4882a593Smuzhiyun
833