"""Beautiful Soup tree builder backed by the html5lib parser."""

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import (
    NamespacedAttribute,
    whitespace_re,
)
import html5lib
try:
    # html5lib >= 0.99999999/1.0b9
    from html5lib.treebuilders import base as treebuildersbase
    new_html5lib = True
except ImportError:
    # html5lib <= 0.9999999/1.0b8
    from html5lib.treebuilders import _base as treebuildersbase
    new_html5lib = False
from html5lib.constants import namespaces

from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Yield the single parsing strategy used for html5lib.

        The user-specified encoding is remembered on the builder so that
        `feed` can hand it to html5lib. `document_declared_encoding` and
        `exclude_encodings` are not used; a warning is issued if
        `exclude_encodings` is given.

        :yield: One tuple ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, populating `self.soup`.

        After parsing, `original_encoding` is set on the parsed document:
        None for str input, otherwise the name of the encoding html5lib
        settled on.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        # Only pass an encoding for byte input: newer html5lib raises
        # TypeError if an encoding accompanies str input, and it renamed
        # the keyword from `encoding` to `override_encoding`.
        extra_kwargs = {}
        if not isinstance(markup, str):
            if new_html5lib:
                extra_kwargs['override_encoding'] = self.user_specified_encoding
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(original_encoding, str):
                # Newer html5lib reports an Encoding object here; expose
                # its name so callers always see a plain string.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder.

        This method is handed to `html5lib.HTMLParser` as its `tree`
        argument in `feed`.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder):
    """html5lib TreeBuilder that builds into a BeautifulSoup object."""

    def __init__(self, soup, namespaceHTMLElements):
        """Remember the target `soup` and initialise the html5lib base."""
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from the token and record it as parsed."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a TextNode wrapping a Comment built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace `self.soup` with a fresh fragment container and wrap it."""
        # Imported here rather than at module level: the bs4 package
        # itself imports this builder module, so a top-level import
        # would be circular.
        from bs4 import BeautifulSoup
        # Name an explicit parser so that creating the empty container
        # neither warns about a missing parser nor re-enters html5lib.
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the wrapped element of `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup being built."""
        return self.soup

    def getFragment(self):
        """Return the element wrapped by the base class's fragment node."""
        return treebuildersbase.TreeBuilder.getFragment(self).element


class AttrList(object):
    """Mapping-like view of an element's attributes.

    Reads are served from a dict copied from `element.attrs` at
    construction time; writes go straight to the element and do not
    update that copy.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot of the attributes as they were when the view was made.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over (name, value) pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set an attribute on the underlying element."""
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = HTML5TreeBuilder.cdata_list_attributes
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = whitespace_re.split(value)
        self.element[name] = value

    def items(self):
        """Return the snapshot's (name, value) pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs


class Element(treebuildersbase.Node):
    """html5lib Node wrapping a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        """Wrap `element`, which belongs to `soup`, under `namespace`."""
        treebuildersbase.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        `node` may be an Element/TextNode wrapper, a bare Tag, or a plain
        string. A string appended directly after a NavigableString is
        merged with it instead of becoming a separate node.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
        else:
            child = node.element

        if not isinstance(child, str) and child.parent is not None:
            # Detach the child itself. `node` may be a bare Tag here, in
            # which case it has no wrapper `.element` to extract.
            child.extract()

        if (string_child and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an AttrList view of the wrapped element's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple-valued names are converted to NamespacedAttribute first.
        The passed-in mapping is modified in place. Nothing happens when
        `attributes` is None or empty.
        """
        if attributes is not None and len(attributes) > 0:

            # Iterate over a copy because the mapping is mutated below.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert the string `data`, before `insertBefore` if given.

        Without `insertBefore` the text is appended as the last child.
        """
        if insertBefore:
            # insertBefore() needs a node wrapper, not the raw string.
            text = TextNode(self.soup.new_string(data), self.soup)
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(data)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the existing child `refNode`.

        A NavigableString inserted directly after another NavigableString
        is merged into it rather than inserted as a separate node.
        """
        index = self.element.index(refNode.element)
        # Require index > 0: at index 0 there is no preceding sibling, and
        # contents[index-1] would wrap around to the *last* child.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the element wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child:
                new_parents_last_child.next_sibling = first_child

            # Fix the last child's next_element and next_sibling
            last_child = to_append[-1]
            last_child.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element:
                new_parents_last_descendant_next_element.previous_element = last_child
            last_child.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new Element with the same name and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """Element wrapper for string-like nodes; it has no name."""

    def __init__(self, element, soup):
        """Wrap `element`, which belongs to `soup`."""
        treebuildersbase.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        """Cloning text nodes is not supported."""
        raise NotImplementedError