lib/bs4/testing.py

103             'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
111 …doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/x…
127         markup = b"""<?xml version="1.0" encoding="utf-8"?>
128 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
135             soup.encode("utf-8").replace(b"\n", b""),
152         """A <p> tag is never designated as an empty-element tag.
154         Even if the markup shows it as an empty-element tag, it
164         This applies to all tags except empty-element tags.
172         """A <br> tag is designated as an empty-element tag.
175         two tags, but it should always be an empty-element tag.
203         markup = "<p>foo<!--foobar-->baz</p>"
368 …markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE}…
424         self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
428 …  expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
430         self.assertEqual(soup.p.encode("utf-8"), expected)
434         # easy-to-understand document.
436         # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
437 …ode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/>…
439         # That's because we're going to encode it into ISO-Latin-1, and use
441         iso_latin_html = unicode_html.encode("iso-8859-1")
443         # Parse the ISO-Latin-1 HTML.
445         # Encode it to UTF-8.
446         result = soup.encode("utf-8")
450         # UTF-8 instead of ISO-Latin-1.
451         expected = unicode_html.replace("ISO-Latin-1", "utf-8")
453         # And, of course, it would be in UTF-8, not Unicode.
454         expected = expected.encode("utf-8")
456         # Ta-da!
461         # Shift-JIS encoding, without choking.
464             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
468         unicode_html = shift_jis_html.decode("shift-jis")
473         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
477         # A real-world test to make sure we can convert ISO-8859-8 (a
478         # Hebrew encoding) to UTF-8.
479 …t = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebr…
481             hebrew_document, from_encoding="iso8859-8")
482         self.assertEqual(soup.original_encoding, 'iso8859-8')
484             soup.encode('utf-8'),
485             hebrew_document.decode("iso8859-8").encode("utf-8"))
489         # encoded in Shift-JIS.
490         meta_tag = ('<meta content="text/html; charset=x-sjis" '
491                     'http-equiv="Content-type"/>')
496             '<meta http-equiv="Content-language" content="ja"/>'
497             '</head><body>Shift-JIS markup goes here.') % meta_tag
501         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
503         self.assertEqual('text/html; charset=x-sjis', content)
517         # encoded in Shift-JIS.
518         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
523             '<meta http-equiv="Content-language" content="ja"/>'
524             '</head><body>Shift-JIS markup goes here.') % meta_tag
530         self.assertEqual('x-sjis', charset)
558             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
567         markup = b"""<?xml version="1.0" encoding="utf-8"?>
568 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
575             soup.encode("utf-8"), markup)
582         soup = BeautifulSoup(doc, "lxml-xml")
590 …markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu…
595 …markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc…
608         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
612         self.assertEqual(soup.encode("utf-8"), markup)
632 …markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http…
670         markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
673         self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')