"""Tests to ensure that the html5lib tree builder generates good trees."""

import warnings

try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # html5lib is optional; the test class below is skipped when it is missing.
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
    HTML5TreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return a fresh ``HTML5TreeBuilder`` instance for each access."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        """Passing ``parse_only`` warns and the whole document is still parsed."""
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        expected_warning = "the html5lib tree builder doesn't support parse_only"
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, whatever filters the test runner or the
            # environment installed; otherwise the warning may be suppressed.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # Search all recorded warnings rather than indexing the first one, so
        # a missing warning is a test failure (not an IndexError) and an
        # unrelated warning emitted earlier cannot mask the one we want.
        self.assertTrue(
            any(expected_warning in str(item.message) for item in caught),
            "no warning mentioning parse_only was recorded")

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype leaves the <p> tag reachable."""
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        """Mis-nested <em> markup yields the expected body and two <p> tags."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(
            "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>",
            soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as ``test_reparented_markup`` with a trailing newline in the markup."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(
            "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>",
            soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_processing_instruction(self):
        """Processing instructions become comments."""
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        # Use a TestCase assertion so the check survives ``python -O``.
        self.assertTrue(str(soup).startswith("<!--?PITarget PIContent?-->"))

    def test_cloned_multivalue_node(self):
        """The two <a> tags found compare equal but are distinct objects."""
        markup = b"""<a class="my_class"><p></a>"""
        soup = self.soup(markup)
        a1, a2 = soup.find_all('a')
        self.assertEqual(a1, a2)
        self.assertIsNot(a1, a2)