# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

import logging
import unittest
import sys
import tempfile

from bs4 import BeautifulSoup
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    )
import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

PYTHON_2_PRE_2_7 = (sys.version_info < (2, 7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))


class TestConstructor(SoupTest):

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)
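
    # Illustrative sketch, not part of the original suite: the constructor
    # also accepts an open file-like object and reads it before parsing.
    # The method name and markup are hypothetical.
    def test_file_like_input_sketch(self):
        import io
        soup = self.soup(io.StringIO("<h1>hello</h1>"))
        self.assertEqual("hello", soup.h1.string)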


class TestWarnings(SoupTest):

    def _assert_no_parser_specified(self, s, is_there=True):
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
            msg = str(w[0].message)
            self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
            msg = str(w[0].message)
            self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
            self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
            msg = str(w[0].message)
            self.assertTrue("parseOnlyThese" in msg)
            self.assertTrue("parse_only" in msg)
            self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
            msg = str(w[0].message)
            self.assertTrue("fromEncoding" in msg)
            self.assertTrue("from_encoding" in msg)
            self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
                msg = str(w[0].message)
                self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
            self.assertEqual(0, len(w))

    def test_url_warning(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/")
            msg = str(w[0].message)
            self.assertTrue("looks like a URL" in msg)

        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/ is great")
            self.assertEqual(0, len(w))


class TestSelectiveParsing(SoupTest):

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
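
    # Illustrative sketch, not part of the original suite: SoupStrainer can
    # filter on attribute values as well as tag names. The markup and method
    # name here are hypothetical.
    def test_parse_with_attribute_soupstrainer_sketch(self):
        markup = '<b id="keep">Yes</b><b>No</b>'
        strainer = SoupStrainer("b", id="keep")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b'<b id="keep">Yes</b>')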


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""
    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
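
    # Illustrative sketch, not part of the original suite, contrasting the two
    # methods above: substitute_xml escapes every ampersand, while
    # substitute_xml_containing_entities leaves existing entities alone.
    def test_substitute_xml_vs_containing_entities_sketch(self):
        s = "&lt; is an entity & this is not"
        self.assertEqual(
            self.sub.substitute_xml(s),
            "&amp;lt; is an entity &amp; this is not")
        self.assertEqual(
            self.sub.substitute_xml_containing_entities(s),
            "&lt; is an entity &amp; this is not")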


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
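
    # Illustrative sketch, not part of the original suite: detection can be
    # bypassed entirely by naming the encoding up front with from_encoding.
    # The method name is hypothetical.
    def test_from_encoding_overrides_detection_sketch(self):
        soup = self.soup(self.utf8_data, from_encoding="utf8")
        self.assertEqual("utf8", soup.original_encoding)
        self.assertEqual(soup.foo.string, 'Sacr\xe9 bleu!')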

class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)
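
    # Illustrative sketch, not part of the original suite: the older
    # http-equiv style of charset declaration should be picked up as well.
    # The markup and method name are hypothetical.
    def test_detect_pre_html5_style_meta_tag_sketch(self):
        data = (b'<html><meta http-equiv="Content-Type" '
                b'content="text/html; charset=euc-jp"/></html>')
        dammit = UnicodeDammit(data, is_html=True)
        self.assertEqual("euc-jp", dammit.original_encoding)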

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (i.e. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Fed to Unicode, Dammit as-is, the whole document would be treated
        # as Windows-1252, turning the UTF-8 snowmen into mojibake.

        # But if we run it through detwingle() first, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
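
    # Illustrative sketch, not part of the original suite, backing up the
    # comment above: fed the mixed document directly, Unicode, Dammit settles
    # on a single encoding, so the UTF-8 snowmen do not survive the decode.
    def test_detwingle_needed_for_mixed_encodings_sketch(self):
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
        dammit = UnicodeDammit(utf8 + windows_1252 + utf8)
        self.assertFalse("\N{SNOWMAN}" in dammit.unicode_markup)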

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
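
    # Illustrative sketch, not part of the original suite: EncodingDetector,
    # used above with a junk encoding name, also surfaces a well-formed
    # declared encoding among its candidates. The method name is hypothetical.
    def test_encoding_detector_reports_declared_encoding_sketch(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="ISO-8859-8"?><foo/>')
        self.assertTrue("iso-8859-8" in list(detected.encodings))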


class TestNamespacedAttribute(SoupTest):

    def test_name_may_be_none(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
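

# Illustrative sketch, not part of the original suite: the two attribute value
# classes above exist so that re-encoding a parsed document rewrites the
# charset declared in its <meta> tag. The class name and markup are hypothetical.
class TestMetaCharsetRewritingSketch(unittest.TestCase):

    def test_encoding_substitution_in_meta_tag(self):
        markup = b'<html><head><meta charset="utf-8"/></head><body>foo</body></html>'
        soup = BeautifulSoup(markup, "html.parser")
        # The parsed charset compares equal to its original value...
        self.assertEqual("utf-8", soup.meta["charset"])
        # ...but re-encoding the document substitutes the new encoding.
        self.assertTrue(b'charset="euc-jp"' in soup.encode("euc-jp"))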