"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

import sys
import warnings
from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4


from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards every parse event to ``self.soup``.

    ``soup`` is not set by this class; HTMLParserTreeBuilder.feed() assigns
    it on the instance before any markup is fed.
    """

    def handle_starttag(self, name, attrs):
        """Forward a start tag to the soup.

        :param name: Tag name as reported by HTMLParser.
        :param attrs: Iterable of (key, value) pairs; a value may be None.
        """
        # XXX namespace
        # Change None attribute values to the empty string
        # for consistency with the other tree builders.
        attr_dict = {
            key: ('' if value is None else value) for key, value in attrs
        }
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Forward character data to the soup."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Convert a numeric character reference to text and emit it as data.

        :param name: The reference without the leading ``&#``: decimal
            digits, or ``x``/``X`` followed by hexadecimal digits.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = chr(real_name)
        except (ValueError, OverflowError):
            # The code point is outside the range chr() accepts.
            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Emit a named entity as data.

        Entities found in EntitySubstitution.HTML_ENTITY_TO_CHARACTER are
        replaced by the mapped character; unknown ones are passed through
        verbatim as ``&name;``.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """End any pending data, then hand ``data`` to the soup as a Comment."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Hand a ``<!...>`` declaration to the soup as a Doctype.

        A leading ``DOCTYPE `` keyword is stripped; a bare ``DOCTYPE``
        becomes the empty string.
        """
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Hand an unrecognized declaration to the soup.

        Data starting with ``CDATA[`` (case-insensitive) has that prefix
        removed and is stored as CData; anything else is stored unchanged
        as a Declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Hand a processing instruction to the soup."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder backed by the standard library's html.parser module."""

    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
        """Record the arguments later passed to BeautifulSoupHTMLParser.

        Depending on the running Python version, ``strict=False`` and/or
        ``convert_charrefs=False`` are forced into the keyword arguments.
        """
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Generate the markup in a form suitable for feed().

        This is a generator that yields exactly one item.

        :yield: A 4-tuple (markup, original encoding, encoding
            declared within markup, whether any characters had to be
            replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, str):
            # Already text: nothing to detect or convert.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        ``self.soup`` is not assigned in this module.
        NOTE(review): presumably set by the caller before feed() -- confirm.

        :raises HTMLParseError: re-raised after a RuntimeWarning is issued
            when the underlying parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this attaches the pattern to the tree builder, not to
    # the parser class. parse_starttag() below uses the module-level name
    # directly, so the attribute is not read anywhere in this file.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Replacement for HTMLParser.parse_starttag on Python 3.2.0-3.2.2.

        :param i: Index in ``self.rawdata`` of the tag's opening ``<``.
        :return: The index just past the tag, or the negative value from
            check_for_whole_start_tag() when the tag is incomplete.
        """
        # NOTE(review): this function is defined outside a class body, so
        # ``__starttag_text`` is not name-mangled here; it does not share
        # storage with HTMLParser's own private attribute of that name.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno/offset are computed but not used further in this
            # function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Treat the malformed tag as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the closing tag of ``elem`` is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch installed, __init__ may pass strict=False on these
    # versions too.
    CONSTRUCTOR_TAKES_STRICT = True