"""Diagnostic functions, mainly for use when doing tech support."""

__license__ = "MIT"

import cProfile
import os
import pstats
import random
import sys
import tempfile
import time
import traceback
from html.parser import HTMLParser
from io import BytesIO, StringIO

import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry


def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: The markup to be diagnosed: a string of markup, a
        filename, or an open file-like object. A URL is detected and
        rejected with an explanatory message.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy: calling remove() on the list being iterated
    # would silently skip the entry that follows each removed parser.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print((
                "I noticed that %s is not installed. Installing it may help."
                % name))

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print("lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print("html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # 'with' guarantees the handle is closed even if read() raises;
        # the old open(data).read() leaked the file object.
        with open(data) as f:
            data = f.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)


def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: Some markup, as a string or bytestring.
    :param html: If True, markup is parsed with lxml's HTML parser;
        if False, with its XML parser.
    :param kwargs: Extra keyword arguments passed to etree.iterparse().
    """
    from lxml import etree
    # iterparse() wants a file-like object; pick the buffer type that
    # matches the payload so bytestring markup works as well as str.
    if isinstance(data, bytes):
        buffer = BytesIO(data)
    else:
        buffer = StringIO(data)
    for event, element in etree.iterparse(buffer, html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))


class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single choke point for output, so subclasses can redirect it.
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)


def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup, as a string.
    """
    parser = AnnouncingParser()
    parser.feed(data)


_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string, alternating consonants and vowels."""
    s = ''
    for i in range(length):
        if i % 2 == 0:
            t = _consonants
        else:
            t = _vowels
        s += random.choice(t)
    return s


def rsentence(length=4):
    """Generate a random sentence-like string of `length` words."""
    return " ".join(rword(random.randint(4, 9)) for i in range(length))


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Roughly how many tags/text runs to emit. When
        the random choice is 3, nothing is emitted, so the document is
        usually somewhat shorter than `num_elements` pieces.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag -- not necessarily one that's open, which is
            # what makes the document invalid.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"


def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            # perf_counter() is monotonic and high-resolution;
            # time.time() can jump if the wall clock is adjusted.
            a = time.perf_counter()
            soup = BeautifulSoup(data, parser)
            b = time.perf_counter()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.perf_counter()
    etree.HTML(data)
    b = time.perf_counter()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.perf_counter()
    parser.parse(data)
    b = time.perf_counter()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))


def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Approximate size of the generated document.
    :param parser: The parser feature string to hand to BeautifulSoup.
    """
    # NOTE(review): NamedTemporaryFile keeps the handle open while
    # pstats re-opens it by name; that works on POSIX but may fail on
    # Windows -- confirm if Windows support is needed.
    filehandle = tempfile.NamedTemporaryFile()
    filename = filehandle.name

    data = rdoc(num_elements)
    vars = dict(bs4=bs4, data=data, parser=parser)
    cProfile.runctx('bs4.BeautifulSoup(data, parser)', vars, vars, filename)

    stats = pstats.Stats(filename)
    # stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)


if __name__ == '__main__':
    # Read markup from standard input and run the diagnostic suite on it.
    diagnose(sys.stdin.read())