xref: /OK3568_Linux_fs/yocto/bitbake/lib/bs4/builder/_lxml.py (revision 4882a59341e53eb6f0b4789bf948001014eff981)
# Public names exported by a star-import of this module: the two lxml-backed
# tree builder classes defined below.
__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]
5*4882a593Smuzhiyun
6*4882a593Smuzhiyunfrom io import BytesIO
7*4882a593Smuzhiyunfrom io import StringIO
8*4882a593Smuzhiyunimport collections
9*4882a593Smuzhiyunfrom lxml import etree
10*4882a593Smuzhiyunfrom bs4.element import (
11*4882a593Smuzhiyun    Comment,
12*4882a593Smuzhiyun    Doctype,
13*4882a593Smuzhiyun    NamespacedAttribute,
14*4882a593Smuzhiyun    ProcessingInstruction,
15*4882a593Smuzhiyun)
16*4882a593Smuzhiyunfrom bs4.builder import (
17*4882a593Smuzhiyun    FAST,
18*4882a593Smuzhiyun    HTML,
19*4882a593Smuzhiyun    HTMLTreeBuilder,
20*4882a593Smuzhiyun    PERMISSIVE,
21*4882a593Smuzhiyun    ParserRejectedMarkup,
22*4882a593Smuzhiyun    TreeBuilder,
23*4882a593Smuzhiyun    XML)
24*4882a593Smuzhiyunfrom bs4.dammit import EncodingDetector
25*4882a593Smuzhiyun
# Feature string identifying lxml-backed builders. It is listed in
# LXMLTreeBuilderForXML.features and is the NAME of LXMLTreeBuilder.
LXML = 'lxml'
27*4882a593Smuzhiyun
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that parses XML with lxml.

    The builder passes itself to lxml as the parser ``target``, so lxml
    reports parse events by calling `start`, `end`, `data`, `pi`,
    `doctype`, `comment` and `close` on this object. Each event is
    forwarded to ``self.soup``.
    """
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters handed to the parser per feed() call.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard. Stored inverted (namespace URL -> prefix), like every
    # other entry on the self.nsmaps stack.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}

    def default_parser(self, encoding):
        """Return the parser to use for a document in `encoding`.

        This can either return a parser object or a class, which
        will be instantiated with default arguments by `parser_for`.

        :param encoding: Encoding name passed to ``etree.XMLParser``.
        :return: The parser given to the constructor if there was one,
            otherwise a new recovering ``etree.XMLParser`` targeting
            this builder.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a ready-to-use parser object for `encoding`.

        :param encoding: Encoding name forwarded to the parser.
        :return: A parser instance. If `default_parser` returned a
            callable (e.g. a parser class), it is instantiated here
            with this builder as its target.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        # The built-in callable() is used instead of the
        # ``collections.Callable`` alias, which no longer exists on
        # Python 3.10+.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the builder.

        :param parser: Optional parser object or parser class to use
            instead of the default one.
        :param empty_element_tags: Optional iterable of tag names; when
            given it is stored as the ``empty_element_tags`` set.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted namespace maps (URL -> prefix). Entries are
        # None for tags that did not declare any namespace themselves.
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split an lxml ``{namespace}name`` tag into its two parts.

        Copied from lxml's src/lxml/sax.py.

        :param tag: A tag or attribute name as reported by lxml.
        :return: ``(namespace, local_name)``; ``namespace`` is None
            when `tag` does not start with ``{``.
        """
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Run `markup` through the parser in CHUNK_SIZE pieces.

        :param markup: ``bytes``, ``str``, or an object with a
            ``read(size)`` method.
        :raises ParserRejectedMarkup: If the parser raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Chain explicitly so the underlying parser error stays
            # attached as the cause.
            raise ParserRejectedMarkup(str(e)) from e

    def close(self):
        """Reset the namespace stack to just the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    # NOTE: the ``{}`` default is kept for interface stability; it is only
    # read here, never mutated.
    def start(self, name, attrs, nsmap={}):
        """Handle an opening tag.

        :param name: Tag name, possibly in ``{namespace}name`` form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URL for the
            namespaces declared on this tag (empty if none).
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play. This branch
            # must also run for nested declarations (when the stack
            # already holds a mapping), otherwise their prefixes would
            # be lost.
            inverted_nsmap = {value: key for key, value in nsmap.items()}
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        :param namespace: Namespace URL, or None.
        :return: The prefix from the innermost mapping on the stack
            that knows `namespace`, or None if there is none.
        """
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag.

        :param name: Tag name, possibly in ``{namespace}name`` form.
        """
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Handle a processing instruction as a ProcessingInstruction."""
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(ProcessingInstruction)

    def data(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Handle a doctype declaration as a Doctype object."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
223*4882a593Smuzhiyun
224*4882a593Smuzhiyun
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that parses HTML with lxml's HTML parser."""

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        """Return the lxml HTML parser class.

        A class (not an instance) is returned, so `parser_for`
        instantiates it with this builder as the target.

        :param encoding: Unused here.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Run `markup` through the parser in a single feed() call.

        :param markup: The markup to parse.
        :raises ParserRejectedMarkup: If the parser raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Chain explicitly so the underlying parser error stays
            # attached as the cause.
            raise ParserRejectedMarkup(str(e)) from e


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment
249