"""Tree builders that drive lxml's XML and HTML parsers for Beautiful Soup."""

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from io import BytesIO
from io import StringIO
import collections
from lxml import etree
from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
)
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    ParserRejectedMarkup,
    TreeBuilder,
    XML)
from bs4.dammit import EncodingDetector

LXML = 'lxml'


class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that feeds markup to an lxml XML parser.

    The builder passes itself to lxml as the parser ``target``, so lxml
    calls back into `start`, `end`, `data`, `pi`, `doctype`, `comment`
    and `close` while parsing; those callbacks forward events to
    ``self.soup``.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Number of characters/bytes read from the markup per parser.feed() call.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace': "xml"}

    def default_parser(self, encoding):
        """Return the parser (or parser factory) to use for `encoding`.

        This can either return a parser object or a class, which
        will be instantiated with default arguments by `parser_for`.

        :param encoding: Encoding name handed to the lxml parser.
        :return: The parser given to the constructor if there was one,
            otherwise a new recovering ``etree.XMLParser`` targeting
            this builder.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Return a ready-to-use parser object for `encoding`.

        :param encoding: Encoding name handed to the lxml parser.
        :return: A parser instance whose target is this builder.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        # `collections.Callable` was removed in Python 3.10; the builtin
        # callable() performs the same check on every Python 3 version.
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the builder.

        :param parser: Optional parser object or parser class to use
            instead of the default.
        :param empty_element_tags: Optional iterable of tag names; when
            given it is stored as a set on ``self.empty_element_tags``.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (namespace URL -> prefix) maps; a None entry
        # marks a tag that introduced no new namespaces.
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def _getNsTag(self, tag):
        """Split a fully-qualified lxml name into (namespace, local name).

        Copied from lxml's src/lxml/sax.py.

        :param tag: A name such as ``"{http://ns}local"`` or ``"local"``.
        :return: ``(namespace, name)``; namespace is None when the name
            does not start with ``{``.
        """
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Generate parsing strategies for `markup`.

        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)

        Each 4-tuple represents a strategy for parsing the document.
        """
        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Parse `markup` by feeding it to the lxml parser in chunks.

        :param markup: ``bytes``, ``str`` or an object with a
            ``read(size)`` method.
        :raises ParserRejectedMarkup: If lxml raises UnicodeDecodeError,
            LookupError or ``etree.ParserError`` while parsing.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e)) from e

    def close(self):
        """Reset the namespace stack to just the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS]

    def start(self, name, attrs, nsmap=None):
        """Handle an opening tag reported by lxml.

        :param name: Tag name, possibly in ``{namespace}local`` form.
        :param attrs: Mapping of attribute names to values.
        :param nsmap: Mapping of prefix -> namespace URL for namespaces
            newly declared on this tag (empty or omitted if none).
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = {value: key for key, value in nsmap.items()}
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        :param namespace: Namespace URL, or None.
        :return: The prefix from the innermost map on ``self.nsmaps``
            that contains the namespace, or None if there is none.
        """
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag reported by lxml.

        :param name: Tag name, possibly in ``{namespace}local`` form.
        """
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Handle a processing instruction as a ProcessingInstruction object."""
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(ProcessingInstruction)

    def data(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Handle a doctype declaration as a Doctype object."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that feeds markup to lxml's HTML parser."""

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        """Return the ``etree.HTMLParser`` class; `parser_for` instantiates it."""
        return etree.HTMLParser

    def feed(self, markup):
        """Parse `markup` with a single feed() call to the HTML parser.

        :raises ParserRejectedMarkup: If lxml raises UnicodeDecodeError,
            LookupError or ``etree.ParserError`` while parsing.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(str(e)) from e

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment