xref: /OK3568_Linux_fs/yocto/bitbake/lib/bs4/builder/__init__.py (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyunfrom collections import defaultdict
2*4882a593Smuzhiyunimport itertools
3*4882a593Smuzhiyunimport sys
4*4882a593Smuzhiyunfrom bs4.element import (
5*4882a593Smuzhiyun    CharsetMetaAttributeValue,
6*4882a593Smuzhiyun    ContentMetaAttributeValue,
7*4882a593Smuzhiyun    whitespace_re
8*4882a593Smuzhiyun    )
9*4882a593Smuzhiyun
# Public names of this module. register_treebuilders_from() appends the
# name of every TreeBuilder subclass it copies in, so this list grows while
# the parser-specific submodules are being imported.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]
16*4882a593Smuzhiyun
# Labels for features a TreeBuilder may advertise in its ``features`` list.

# Speed and strictness trade-offs.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'

# Kinds of markup.
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
24*4882a593Smuzhiyun
25*4882a593Smuzhiyun
class TreeBuilderRegistry(object):
    """Index of tree builder classes, searchable by advertised feature.

    The most recently registered builder always sorts first, both in the
    overall list and in each per-feature list.
    """

    def __init__(self):
        # feature label -> builder classes offering it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        self.builders.insert(0, treebuilder_class)
        for label in treebuilder_class.features:
            self.builders_for_feature[label].insert(0, treebuilder_class)

    def lookup(self, *features):
        """Return the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned. Otherwise the result must offer every requested feature
        that at least one builder is registered for; features nobody
        offers are ignored. Ties are broken by the ordering of the first
        such feature's builder list. Returns None when nothing qualifies.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No requirements: hand back the newest builder.
            return self.builders[0]

        ranked = None      # builder list of the first recognised feature
        survivors = None   # builders that satisfy every recognised feature
        for wanted in features:
            # .get() rather than indexing, so that an unknown feature does
            # not create an empty entry in the defaultdict.
            offering = self.builders_for_feature.get(wanted, [])
            if not offering:
                continue
            if ranked is None:
                ranked = offering
                survivors = set(offering)
            else:
                # Drop every builder that lacks this feature.
                survivors = survivors & set(offering)

        if survivors is None:
            # None of the requested features is known to the registry.
            return None

        # Preserve the priority order of the first recognised feature.
        return next(
            (builder for builder in ranked if builder in survivors), None)
75*4882a593Smuzhiyun
# Module-wide registry instance. The BeautifulSoup class takes feature
# lists from developers and resolves them to a builder through it.
builder_registry = TreeBuilderRegistry()
79*4882a593Smuzhiyun
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree.

    This base class only supplies neutral defaults. Subclasses override
    the class attributes below and implement feed().
    """

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    # Feature labels this builder advertises; TreeBuilderRegistry.register()
    # files the class under each of them.
    features = []

    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()
    # None means "no opinion": a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # Maps a lower-case tag name (or '*' for every tag) to the attribute
    # names whose value is a list of CDATA rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        # NOTE(review): presumably set by the object that owns this
        # builder; nothing in this class assigns it again.
        self.soup = None

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Parse `markup`. Must be implemented by subclasses.

        Raises NotImplementedError in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return `markup` untouched as the first item of a 4-tuple.

        The base implementation ignores both encoding arguments and
        returns (markup, None, None, False).
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Hook for subclasses; the base implementation returns False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Only attributes named in cdata_list_attributes, either under '*'
        or under the lower-cased tag name, are converted. Whitespace at
        the start or end of a value does not produce empty list entries,
        and a value with no tokens at all becomes an empty list.

        Modifies its input in place and also returns it.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            # Iterate over a snapshot of the keys because values are
            # reassigned inside the loop.
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        # str.split() with no argument splits on runs of
                        # whitespace and drops the empty strings that a
                        # regex split would leave at either end.
                        values = value.split()
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
177*4882a593Smuzhiyun
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Element and character events are forwarded to self.soup; namespace
    information, prefix mappings and document boundaries are ignored.
    """

    def feed(self, markup):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError()

    def close(self):
        """No-op."""
        pass

    # -- document boundaries: nothing to do ------------------------------

    def startDocument(self):
        pass

    def endDocument(self):
        pass

    # -- prefix mappings: ignored for now --------------------------------

    def startPrefixMapping(self, prefix, nodeValue):
        pass

    def endPrefixMapping(self, prefix):
        pass

    # -- elements ---------------------------------------------------------

    def startElement(self, name, attrs):
        """Forward a start tag to the soup.

        Each attribute key is indexed with [1] to obtain the name that is
        passed on. NOTE(review): presumably keys are (namespace, local
        name) pairs -- confirm against the event source.
        """
        plain_attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Namespace-aware start tag; nsTuple is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Namespace-aware end tag; nsTuple is discarded for now."""
        self.endElement(nodeName)

    # -- text -------------------------------------------------------------

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)
222*4882a593Smuzhiyun
223*4882a593Smuzhiyun
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags, which tags preserve
    whitespace, and which attributes hold a list of values.
    """

    preserve_whitespace_tags = {'pre', 'textarea'}
    empty_element_tags = {'br', 'hr', 'input', 'img', 'meta',
                          'spacer', 'link', 'frame', 'base'}

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # Keys are tag names ('*' applies to every tag); each key appears
    # exactly once.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
        }

    def set_up_substitutions(self, tag):
        """Replace encoding declarations on a <meta> tag with stand-ins.

        Tags other than <meta> are left alone. On a <meta> tag, a
        'charset' attribute is replaced with a CharsetMetaAttributeValue;
        failing that, a 'content' attribute is replaced with a
        ContentMetaAttributeValue when 'http-equiv' is "content-type"
        (compared case-insensitively).

        Returns True only when a 'charset' attribute was replaced.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            #
            # NOTE(review): meta_encoding stays None on this branch, so
            # the method returns False although a substitution was made.
            # Left unchanged; confirm whether callers rely on it.
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
289*4882a593Smuzhiyun        return (meta_encoding is not None)
290*4882a593Smuzhiyun
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder subclass
    is set as an attribute of this module, appended to this module's
    __all__, and registered with builder_registry. Exported names that
    are not classes, or not TreeBuilder subclasses, are skipped.
    """
    # Look this module up under whatever name it was imported as, rather
    # than assuming a fixed dotted path.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not a
        # class, so rule out exported functions and constants first.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)
303*4882a593Smuzhiyun
class ParserRejectedMarkup(Exception):
    """Exception type defined for tree builders to raise.

    NOTE(review): nothing in this module raises it. The name suggests a
    parser signalling that it cannot process the markup it was given --
    confirm in the parser-specific builder modules.
    """
306*4882a593Smuzhiyun
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The registry puts each newly registered builder at the front of its
# lists, so the module imported last here is the one looked up first.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
325