1*4882a593Smuzhiyun"""Beautiful Soup 2*4882a593SmuzhiyunElixir and Tonic 3*4882a593Smuzhiyun"The Screen-Scraper's Friend" 4*4882a593Smuzhiyunhttp://www.crummy.com/software/BeautifulSoup/ 5*4882a593Smuzhiyun 6*4882a593SmuzhiyunBeautiful Soup uses a pluggable XML or HTML parser to parse a 7*4882a593Smuzhiyun(possibly invalid) document into a tree representation. Beautiful Soup 8*4882a593Smuzhiyunprovides provides methods and Pythonic idioms that make it easy to 9*4882a593Smuzhiyunnavigate, search, and modify the parse tree. 10*4882a593Smuzhiyun 11*4882a593SmuzhiyunBeautiful Soup works with Python 2.6 and up. It works better if lxml 12*4882a593Smuzhiyunand/or html5lib is installed. 13*4882a593Smuzhiyun 14*4882a593SmuzhiyunFor more than you ever wanted to know about Beautiful Soup, see the 15*4882a593Smuzhiyundocumentation: 16*4882a593Smuzhiyunhttp://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17*4882a593Smuzhiyun""" 18*4882a593Smuzhiyun 19*4882a593Smuzhiyun__author__ = "Leonard Richardson (leonardr@segfault.org)" 20*4882a593Smuzhiyun__version__ = "4.4.1" 21*4882a593Smuzhiyun__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 22*4882a593Smuzhiyun__license__ = "MIT" 23*4882a593Smuzhiyun 24*4882a593Smuzhiyun__all__ = ['BeautifulSoup'] 25*4882a593Smuzhiyun 26*4882a593Smuzhiyunimport os 27*4882a593Smuzhiyunimport re 28*4882a593Smuzhiyunimport warnings 29*4882a593Smuzhiyun 30*4882a593Smuzhiyunfrom .builder import builder_registry, ParserRejectedMarkup 31*4882a593Smuzhiyunfrom .dammit import UnicodeDammit 32*4882a593Smuzhiyunfrom .element import ( 33*4882a593Smuzhiyun CData, 34*4882a593Smuzhiyun Comment, 35*4882a593Smuzhiyun DEFAULT_OUTPUT_ENCODING, 36*4882a593Smuzhiyun Declaration, 37*4882a593Smuzhiyun Doctype, 38*4882a593Smuzhiyun NavigableString, 39*4882a593Smuzhiyun PageElement, 40*4882a593Smuzhiyun ProcessingInstruction, 41*4882a593Smuzhiyun ResultSet, 42*4882a593Smuzhiyun SoupStrainer, 43*4882a593Smuzhiyun Tag, 44*4882a593Smuzhiyun ) 45*4882a593Smuzhiyun 
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): with the `!=` operator used below, the statement is an
# ordinary comparison whose result is discarded, so it has no runtime
# effect in this copy of the file.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, bytestring, or object with a read()
            method containing the document to parse.
        :param features: A feature name or list of feature names used
            to look up a tree builder when `builder` is not given.
        :param builder: A tree builder instance to use instead of
            looking one up from `features`.
        :param parse_only: Stored as self.parse_only and consulted by
            endData() and handle_starttag() to filter top-level nodes.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param exclude_encodings: Passed to the builder's
            prepare_markup().
        :param kwargs: Only the deprecated Beautiful Soup 3 argument
            names handled below are accepted; each one triggers a
            warning.
        :raises TypeError: If an unrecognized keyword argument is given.
        :raises FeatureNotFound: If no tree builder matches `features`.
        """

        if 'convertEntities' in kwargs:
            # Remove the argument like every other ignored BS3 argument;
            # otherwise the catch-all check further down rejects it with
            # a TypeError right after this warning has been issued.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed keyword argument, warning about the rename."""
            if old_name not in kwargs:
                return None
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            return kwargs.pop(old_name)

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # Anything still left is an argument we do not understand.
            arg = list(kwargs)[-1]
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, str):
                features = [features]
            if not features:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            # Warn unless the caller named this exact builder, because
            # the builder chosen may differ from one system to another.
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_not_markup(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding,
                     exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                # Try the next candidate yielded by prepare_markup().
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _warn_if_not_markup(markup):
        """Warn if short `markup` looks like a filename or a URL.

        The markup itself is never modified: the caller still parses
        exactly the object it was given.

        :param markup: The str or bytes object passed to the constructor.
        """
        if isinstance(markup, bytes):
            # Decode for display only, so the warning shows readable text.
            shown = markup.decode("utf8", "replace")
        else:
            shown = markup

        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        is_file = False
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            pass
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % shown)

        # Compare against prefixes of the same type as the markup; a str
        # prefix never equals a bytes slice.
        if isinstance(markup, bytes):
            prefixes, space = (b"http:", b"https:"), b' '
        elif isinstance(markup, str):
            prefixes, space = ("http:", "https:"), ' '
        else:
            return
        if markup.startswith(prefixes) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % shown)

    def __copy__(self):
        """Return a new object of the same type, built by re-parsing
        this object's encoded output with the same builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return the instance dictionary to pickle, leaving out the
        builder when it reports itself as not picklable."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run self.markup through the builder, then close any open
        string and pop every tag above the root."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag and clear all
        parser state, leaving the root as the only tag on the stack."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Not supported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag.

        The whitespace-preserving stack is popped too when the removed
        tag is its top entry.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag), push it on the tag stack and make it current."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the data collected by handle_data() into one string
        object and add it to the tree.

        :param containerClass: The class instantiated with the collected
            text.
        """
        if not self.current_data:
            return
        current_data = ''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if (not self.preserve_whitespace_tag_stack
                and all(ch in self.ASCII_SPACES for ch in current_data)):
            current_data = '\n' if '\n' in current_data else ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if self.parse_only and len(self.tagStack) <= 1 and \
               (not self.parse_only.text or \
                not self.parse_only.search(current_data)):
            return

        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The object to add.
        :param parent: The tag to append `o` to; defaults to the
            current tag.
        :param most_recent_element: The element to link as `o`'s
            previous element; defaults to self._most_recent_element.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # Keep whatever links the tag already carries.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = parent.contents.index(o)
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        :return: The value returned by the last popTag() call, or None
            if nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Index 0 is the root object, so the walk stops before it.
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # NOTE(review): a constructor call normally cannot yield None;
        # kept as-is because Tag is defined outside this file.
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Finish any pending string, then pop the stack through the
        most recent open tag with this name and prefix."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Append `data` to the pending data node; endData() later
        joins the pieces."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For XML documents the result is prefixed with an XML
        declaration, which names `eventual_encoding` unless it is None.
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn about the deprecation, and
        delegate to BeautifulSoup.__init__."""
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    """Exception type defined for this package; nothing in this module
    raises it."""
    pass

class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no tree builder
    matches the requested features."""
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Script mode: parse whatever arrives on standard input and write
    # the prettified document to standard output.
    import sys
    soup = BeautifulSoup(sys.stdin)
    pretty = soup.prettify()
    print(pretty)