from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """Keep track of tree builder classes and the features they advertise.

    Builders are stored most-recently-registered first, both in the
    overall list and in each per-feature list, so a later registration
    wins over an earlier one.
    """

    def __init__(self):
        # Maps a feature string to the builder classes advertising it.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for advertised in treebuilder_class.features:
            self.builders_for_feature[advertised].insert(
                0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned. Otherwise features are considered in the order given.
        A feature that no registered builder advertises is skipped. The
        first feature that some builder advertises fixes the preference
        order, and every later such feature narrows the set of
        acceptable builders.

        Returns a builder class, or None when nothing is registered,
        when none of the features is advertised by any builder, or when
        no single builder offers all of the advertised features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # Nothing specific was requested; the newest builder wins.
            return self.builders[0]

        ranked = None      # preference order, set by the first known feature
        acceptable = None  # builders that have matched every known feature
        for feature in features:
            supporters = self.builders_for_feature.get(feature, [])
            if not supporters:
                # Nobody advertises this feature, so it cannot narrow
                # the choice.
                continue
            if ranked is None:
                ranked = supporters
                acceptable = set(supporters)
            else:
                # Drop every builder that lacks this feature.
                acceptable = acceptable.intersection(supporters)

        if acceptable is None:
            return None

        # Walk the preference order and take the first builder that
        # survived all of the eliminations.
        return next(
            (builder for builder in ranked if builder in acceptable), None)

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        # No soup object is attached at construction time.
        self.soup = None

    def reset(self):
        """Do nothing; subclasses may override this hook."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse `markup`; must be implemented by subclasses.

        Raises NotImplementedError in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return the markup unchanged as part of a 4-tuple.

        The base implementation ignores both encoding arguments and
        returns `(markup, None, None, False)`.
        NOTE(review): the meaning of the last three slots is defined by
        the caller, which is not visible in this file -- confirm there.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up no substitutions for `tag`; always returns False here."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place, and also returns it.

        An attribute is split when it is listed in
        `cdata_list_attributes` either under the wildcard key '*' or
        under the lower-cased tag name.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = whitespace_re.split(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list. If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        """Parse `markup`; not implemented for this builder."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward a start-tag event to the soup object.

        Each attribute key is replaced by its element at index 1.
        NOTE(review): presumably keys are (namespace, local name)
        tuples -- confirm against the SAX driver in use.
        """
        attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward an end-tag event to the soup object."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag like a plain start tag."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag like a plain end tag."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Do nothing."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Do nothing."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = {'pre', 'textarea'}
    empty_element_tags = {'br', 'hr', 'input', 'img', 'meta',
                          'spacer', 'link', 'frame', 'base'}

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace a <meta> tag's declared encoding with a stand-in object.

        We are interested in <meta> tags that say what encoding the
        document was originally in. This means HTML 5-style <meta>
        tags that provide the "charset" attribute. It also means
        HTML 4-style <meta> tags that provide the "content"
        attribute and have "http-equiv" set to "content-type".

        In both cases the value of the appropriate attribute is
        replaced with a stand-in object that can take on any encoding.

        Returns True when a substitution was installed on `tag`, and
        False otherwise.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # A substitution was installed here too, so report it. The
            # previous code returned False on this path.
            return True

        return False


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`.
    Exported names that are not classes are skipped.
    """
    # Look this module up under its real name so the function keeps
    # working when the package is vendored under a different name.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for non-class objects, so make
        # sure we have a class before asking.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)


class ParserRejectedMarkup(Exception):
    """Exception type defined for builders; it is not raised in this module."""
    pass


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass