# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
__license__ = "MIT"

import codecs
from html.entities import codepoint2name
import re
import logging

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Return cchardet's guess at the encoding of the bytestring `s`."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Return chardet's guess at the encoding of the bytestring `s`."""
            return chardet.detect(s)['encoding']
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stand-in used when no detection library is installed.

            Always returns None, i.e. "no guess".
            """
            return None

# Both patterns are compiled from bytes, so they can only be applied to
# bytestrings.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)


class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables from `codepoint2name`.

        Called once, from the class body. Returns a 3-tuple:
        (character -> entity name, entity name -> character, compiled
        regular expression matching any single character that has a
        named entity, the quotation mark excepted).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not start a
    # decimal, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the named HTML
        entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The string to quote.
        :return: `value` wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """
        :param markup: The document, normally a bytestring.
        :param override_encodings: Encoding names to try before any other.
        :param is_html: If True, a <meta> charset declaration is also
         looked for.
        :param exclude_encodings: Encoding names that must never be
         suggested; compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be yielded.

        Returns True (and records the lowercased name in `tried`) when
        the encoding is not None, not excluded, and not already tried.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        The declared and chardet-detected encodings are computed on
        first need and cached on the instance.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document. A str is returned untouched.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
        # (FF FE), so the UTF-16 branches must reject data whose next two
        # bytes are NUL. That comparison has to be bytes-to-bytes: in
        # Python 3 a bytes object never equals a str, which would make
        # the guard always pass.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring (the patterns used
         are bytes patterns).
        :param is_html: If True, fall back to a <meta> charset
         declaration when no XML declaration is found.
        :param search_entire_document: If False, only the first 1024
         bytes are searched for an XML declaration and the first
         max(2048, 5% of the document) bytes for a <meta> tag.
        :return: The lowercased encoding name, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.group(1).decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    # Matches a single byte in the range Windows-1252 uses for smart
    # quotes and similar characters. Compiled once rather than on every
    # conversion attempt.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Convert `markup` to Unicode.

        :param markup: The document, as a bytestring or a str.
        :param override_encodings: Encoding names to try first.
        :param smart_quotes_to: None to leave bytes \\x80-\\x9f alone, or
         'ascii', 'xml' or 'html' (any other value behaves like 'html')
         to replace them before decoding.
        :param is_html: If True, a <meta> charset declaration is honoured.
        :param exclude_encodings: Encoding names never to try.

        On return, `unicode_markup` holds the decoded text (None if
        every attempt failed) and `original_encoding` the codec that
        produced it (None for str input or on failure).
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Compare against None, not truthiness: an empty document
        # decodes successfully to '' and must not count as a failure.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of a single byte in \\x80-\\x9f.
        :return: The replacement, as bytes.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII[orig].encode()
        sub = self.MS_CHARS[orig]
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            if self.smart_quotes_to == 'xml':
                return b'&#x' + sub[1].encode() + b';'
            return b'&' + sub[0].encode() + b';'
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        :return: The decoded text, or None if the encoding is unknown,
         was already tried with the same `errors` mode, or failed. On
         success self.markup and self.original_encoding are updated.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
                and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
        except Exception:
            # Deliberately broad: any failure just means "this encoding
            # did not work", and the caller moves on to the next one.
            return None
        self.markup = u
        self.original_encoding = proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document, or
        None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lowercased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name itself),
        the name with hyphens removed, the name with hyphens turned into
        underscores, and finally the bare lowercased name.

        :return: A lowercased name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point); plain
    # strings are substituted as-is.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The code
                # point must not be empty, or 'xml' mode emits "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # Plain string like every other value; this entry used to be a
        # stray (ascii, entity-name) tuple.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
# NOTE(review): `detwingle` below reads these tables through `cls`, so they
# are expected to be attributes of the enclosing class, whose header lies
# outside this chunk -- confirm the indentation when splicing.
#
# Each key is a single byte value; each value is the UTF-8 encoding of the
# character which that byte denotes in Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    0x80: b'\xe2\x82\xac',  # €
    0x82: b'\xe2\x80\x9a',  # ‚
    0x83: b'\xc6\x92',      # ƒ
    0x84: b'\xe2\x80\x9e',  # „
    0x85: b'\xe2\x80\xa6',  # …
    0x86: b'\xe2\x80\xa0',  # †
    0x87: b'\xe2\x80\xa1',  # ‡
    0x88: b'\xcb\x86',      # ˆ
    0x89: b'\xe2\x80\xb0',  # ‰
    0x8a: b'\xc5\xa0',      # Š
    0x8b: b'\xe2\x80\xb9',  # ‹
    0x8c: b'\xc5\x92',      # Œ
    0x8e: b'\xc5\xbd',      # Ž
    0x91: b'\xe2\x80\x98',  # ‘
    0x92: b'\xe2\x80\x99',  # ’
    0x93: b'\xe2\x80\x9c',  # “
    0x94: b'\xe2\x80\x9d',  # ”
    0x95: b'\xe2\x80\xa2',  # •
    0x96: b'\xe2\x80\x93',  # –
    0x97: b'\xe2\x80\x94',  # —
    0x98: b'\xcb\x9c',      # ˜
    0x99: b'\xe2\x84\xa2',  # ™
    0x9a: b'\xc5\xa1',      # š
    0x9b: b'\xe2\x80\xba',  # ›
    0x9c: b'\xc5\x93',      # œ
    0x9e: b'\xc5\xbe',      # ž
    0x9f: b'\xc5\xb8',      # Ÿ
    0xa0: b'\xc2\xa0',      # (non-breaking space)
    0xa1: b'\xc2\xa1',      # ¡
    0xa2: b'\xc2\xa2',      # ¢
    0xa3: b'\xc2\xa3',      # £
    0xa4: b'\xc2\xa4',      # ¤
    0xa5: b'\xc2\xa5',      # ¥
    0xa6: b'\xc2\xa6',      # ¦
    0xa7: b'\xc2\xa7',      # §
    0xa8: b'\xc2\xa8',      # ¨
    0xa9: b'\xc2\xa9',      # ©
    0xaa: b'\xc2\xaa',      # ª
    0xab: b'\xc2\xab',      # «
    0xac: b'\xc2\xac',      # ¬
    0xad: b'\xc2\xad',      # (soft hyphen)
    0xae: b'\xc2\xae',      # ®
    0xaf: b'\xc2\xaf',      # ¯
    0xb0: b'\xc2\xb0',      # °
    0xb1: b'\xc2\xb1',      # ±
    0xb2: b'\xc2\xb2',      # ²
    0xb3: b'\xc2\xb3',      # ³
    0xb4: b'\xc2\xb4',      # ´
    0xb5: b'\xc2\xb5',      # µ
    0xb6: b'\xc2\xb6',      # ¶
    0xb7: b'\xc2\xb7',      # ·
    0xb8: b'\xc2\xb8',      # ¸
    0xb9: b'\xc2\xb9',      # ¹
    0xba: b'\xc2\xba',      # º
    0xbb: b'\xc2\xbb',      # »
    0xbc: b'\xc2\xbc',      # ¼
    0xbd: b'\xc2\xbd',      # ½
    0xbe: b'\xc2\xbe',      # ¾
    0xbf: b'\xc2\xbf',      # ¿
    0xc0: b'\xc3\x80',      # À
    0xc1: b'\xc3\x81',      # Á
    0xc2: b'\xc3\x82',      # Â
    0xc3: b'\xc3\x83',      # Ã
    0xc4: b'\xc3\x84',      # Ä
    0xc5: b'\xc3\x85',      # Å
    0xc6: b'\xc3\x86',      # Æ
    0xc7: b'\xc3\x87',      # Ç
    0xc8: b'\xc3\x88',      # È
    0xc9: b'\xc3\x89',      # É
    0xca: b'\xc3\x8a',      # Ê
    0xcb: b'\xc3\x8b',      # Ë
    0xcc: b'\xc3\x8c',      # Ì
    0xcd: b'\xc3\x8d',      # Í
    0xce: b'\xc3\x8e',      # Î
    0xcf: b'\xc3\x8f',      # Ï
    0xd0: b'\xc3\x90',      # Ð
    0xd1: b'\xc3\x91',      # Ñ
    0xd2: b'\xc3\x92',      # Ò
    0xd3: b'\xc3\x93',      # Ó
    0xd4: b'\xc3\x94',      # Ô
    0xd5: b'\xc3\x95',      # Õ
    0xd6: b'\xc3\x96',      # Ö
    0xd7: b'\xc3\x97',      # ×
    0xd8: b'\xc3\x98',      # Ø
    0xd9: b'\xc3\x99',      # Ù
    0xda: b'\xc3\x9a',      # Ú
    0xdb: b'\xc3\x9b',      # Û
    0xdc: b'\xc3\x9c',      # Ü
    0xdd: b'\xc3\x9d',      # Ý
    0xde: b'\xc3\x9e',      # Þ
    0xdf: b'\xc3\x9f',      # ß
    0xe0: b'\xc3\xa0',      # à
    0xe1: b'\xc3\xa1',      # á (was b'\xa1': the leading 0xC3 was missing)
    0xe2: b'\xc3\xa2',      # â
    0xe3: b'\xc3\xa3',      # ã
    0xe4: b'\xc3\xa4',      # ä
    0xe5: b'\xc3\xa5',      # å
    0xe6: b'\xc3\xa6',      # æ
    0xe7: b'\xc3\xa7',      # ç
    0xe8: b'\xc3\xa8',      # è
    0xe9: b'\xc3\xa9',      # é
    0xea: b'\xc3\xaa',      # ê
    0xeb: b'\xc3\xab',      # ë
    0xec: b'\xc3\xac',      # ì
    0xed: b'\xc3\xad',      # í
    0xee: b'\xc3\xae',      # î
    0xef: b'\xc3\xaf',      # ï
    0xf0: b'\xc3\xb0',      # ð
    0xf1: b'\xc3\xb1',      # ñ
    0xf2: b'\xc3\xb2',      # ò
    0xf3: b'\xc3\xb3',      # ó
    0xf4: b'\xc3\xb4',      # ô
    0xf5: b'\xc3\xb5',      # õ
    0xf6: b'\xc3\xb6',      # ö
    0xf7: b'\xc3\xb7',      # ÷
    0xf8: b'\xc3\xb8',      # ø
    0xf9: b'\xc3\xb9',      # ù
    0xfa: b'\xc3\xba',      # ú
    0xfb: b'\xc3\xbb',      # û
    0xfc: b'\xc3\xbc',      # ü
    0xfd: b'\xc3\xbd',      # ý
    0xfe: b'\xc3\xbe',      # þ
    0xff: b'\xc3\xbf',      # ÿ (previously missing from the table)
}

# (first lead byte, last lead byte, total length in bytes) for each class of
# UTF-8 multibyte sequence.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2),  # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3),  # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4),  # 4-byte characters start with F0-F4
]

# Smallest and largest byte values that can open a UTF-8 multibyte sequence.
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]


@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    The input must be a bytestring. If you've already converted
    the document to Unicode, you're too late.

    A byte in the UTF-8 lead-byte range is only treated as the start of
    a multibyte character when it is followed by the required number of
    continuation bytes (0x80-0xBF). Otherwise it is considered a stray
    Windows-1252 character and converted, so letters such as
    ``b'\\xe9'`` are repaired instead of being skipped together with
    whatever bytes happen to follow them.

    :param in_bytes: The bytestring to repair.
    :param main_encoding: Encoding of the document as a whole. Only
        ``"utf8"`` / ``"utf-8"`` (any case) is accepted.
    :param embedded_encoding: Encoding of the rogue characters. Only
        Windows-1252 and ISO-8859-1 (any case, ``_`` or ``-``) are
        accepted.
    :return: A bytestring in which `embedded_encoding` characters have
        been converted to their `main_encoding` equivalents. When
        nothing needed converting, `in_bytes` itself is returned.
    :raises NotImplementedError: If either encoding is unsupported.
    """
    # Normalise separators and case before comparing, and accept
    # ISO-8859-1 as the docstring and the error message promise.
    if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'iso-8859-1'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    def byte_value(item):
        # Indexing a bytestring yields an int on Python 3 and a
        # one-character string on Python 2.
        return item if isinstance(item, int) else ord(item)

    byte_chunks = []
    chunk_start = 0
    pos = 0
    total = len(in_bytes)
    while pos < total:
        byte = byte_value(in_bytes[pos])

        # Length of the well-formed UTF-8 sequence starting here, or 0.
        sequence_size = 0
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    trailing = in_bytes[pos + 1:pos + size]
                    if (len(trailing) == size - 1
                            and all(0x80 <= byte_value(item) <= 0xbf
                                    for item in trailing)):
                        sequence_size = size
                    break

        if sequence_size:
            # A genuine UTF-8 multibyte character. Skip to the end.
            pos += sequence_size
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add its (multi-byte) encoding as another chunk.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # ASCII, or a byte with no Windows-1252 meaning: leave it.
            pos += 1

    if not byte_chunks:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)