lib/bs4/__init__.py

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
*4882a593Smuzhiyun
# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

# Names exported by `from bs4 import *`.
__all__ = ['BeautifulSoup']
*4882a593Smuzhiyun
*4882a593Smuzhiyunimport os
*4882a593Smuzhiyunimport re
*4882a593Smuzhiyunimport warnings
*4882a593Smuzhiyun
*4882a593Smuzhiyunfrom .builder import builder_registry, ParserRejectedMarkup
*4882a593Smuzhiyunfrom .dammit import UnicodeDammit
*4882a593Smuzhiyunfrom .element import (
*4882a593Smuzhiyun    CData,
*4882a593Smuzhiyun    Comment,
*4882a593Smuzhiyun    DEFAULT_OUTPUT_ENCODING,
*4882a593Smuzhiyun    Declaration,
*4882a593Smuzhiyun    Doctype,
*4882a593Smuzhiyun    NavigableString,
*4882a593Smuzhiyun    PageElement,
*4882a593Smuzhiyun    ProcessingInstruction,
*4882a593Smuzhiyun    ResultSet,
*4882a593Smuzhiyun    SoupStrainer,
*4882a593Smuzhiyun    Tag,
*4882a593Smuzhiyun    )
*4882a593Smuzhiyun
# In the unconverted Python 2 source this line is meant to give a useful
# error to anyone running that code under Python 3 without converting it.
# In this copy the comparison operator is a plain `!=`, so the statement
# below just compares two string literals and discards the result: it is
# a harmless no-op kept only for its message text.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
*4882a593Smuzhiyun
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    # Keyword arguments the constructor accepts but ignores. Each entry
    # is (argument name, sentence appended to the warning message).
    _IGNORED_LEGACY_ARGUMENTS = (
        ('convertEntities',
         "Entities are always converted to Unicode characters."),
        ('markupMassage',
         "The tree builder is responsible for any necessary markup massage."),
        ('smartQuotesTo',
         "Smart quotes are always converted to Unicode characters."),
        ('selfClosingTags',
         "The tree builder is responsible for understanding self-closing "
         "tags."),
        ('isHTML',
         "Suggest you use features='lxml' for HTML and features='lxml-xml' "
         "for XML."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, bytestring, or object with a read()
            method holding the document to parse.
        :param features: A feature name or list of feature names used
            to look up a tree builder when `builder` is not given.
        :param builder: A tree builder instance to use instead of
            looking one up by `features`.
        :param parse_only: Stored as self.parse_only and consulted by
            endData() and handle_starttag() to decide what to keep.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param exclude_encodings: Passed to the builder's
            prepare_markup().
        :param kwargs: Only a fixed set of legacy names is accepted
            (each produces a warning); anything else raises TypeError.
        :raises TypeError: On an unrecognized keyword argument.
        :raises FeatureNotFound: If no tree builder matches `features`.
        """
        # Legacy arguments are warned about and then discarded. Every
        # one of them (including convertEntities) must be removed from
        # kwargs, otherwise the "unexpected keyword argument" check
        # below would reject it right after warning about it.
        for legacy_name, explanation in self._IGNORED_LEGACY_ARGUMENTS:
            if legacy_name in kwargs:
                del kwargs[legacy_name]
                warnings.warn(
                    "BS4 does not respect the %s argument to the "
                    "BeautifulSoup constructor. %s"
                    % (legacy_name, explanation))

        def deprecated_argument(old_name, new_name):
            """Pop a renamed argument out of kwargs, warning if present."""
            if old_name not in kwargs:
                return None
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            return kwargs.pop(old_name)

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            # Warn unless the caller asked for this builder by its own
            # name or one of its alternate names.
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_markup_looks_wrong(markup)

        # Try each candidate the builder offers until one is parsed
        # without the parser rejecting it.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _warn_if_markup_looks_wrong(markup):
        """Warn if short markup looks like a filename or a URL.

        The markup itself is never modified: it is only inspected, and
        bytes are decoded solely for display in the warning text.
        """
        # The whole probe is best-effort. Encoding or path-checking can
        # fail for characters that are not valid in filenames on this
        # system; in that case the input is simply not a file.
        try:
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = os.path.exists(possible_filename)
        except Exception:
            is_file = False

        if isinstance(markup, bytes):
            shown = markup.decode("utf8", "replace")
            url_prefixes, space = (b"http:", b"https:"), b" "
        elif isinstance(markup, str):
            shown = markup
            url_prefixes, space = ("http:", "https:"), " "
        else:
            shown = markup
            url_prefixes = space = None

        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % (shown,))

        # Prefixes and the space are matched in the markup's own type
        # so that both text and bytestrings are recognized.
        if (url_prefixes is not None
                and markup.startswith(url_prefixes)
                and space not in markup):
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % (shown,))

    def __copy__(self):
        """Build a new object of the same type from this object's
        encoded output, reusing the same builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return a copy of the instance dict for pickling, leaving out
        the builder when it reports itself as not picklable."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run the builder over self.markup and close whatever is left
        open afterwards."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag and clear all
        parsing state (pending data and both stacks)."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Not supported on the BeautifulSoup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the BeautifulSoup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Remove the top tag from the tag stack and return the tag
        that is current afterwards."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into one
        `containerClass` object and add it to the tree.

        Does nothing when no data is pending. The string is dropped
        instead of added when self.parse_only is set, only the root is
        on the tag stack, and parse_only does not match the text.
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        `parent` defaults to the current tag and `most_recent_element`
        to self._most_recent_element. The object is appended to the
        parent's contents and becomes the most recent element.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry links; keep them.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = parent.contents.index(o)
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the result of the last popTag() call made, or None if
        nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but not including,
        # index 0 (the root object).
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data, then pop the stack through the most
        recent tag with this name and prefix."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a piece of text; endData() joins the pieces later."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For XML documents an XML declaration is put in front; it names
        the encoding only when eventual_encoding is not None.
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
*4882a593Smuzhiyun
# Short aliases for the BeautifulSoup class, to make imports easier to
# type: 'from bs4 import _soup' (or '_s').
_s = BeautifulSoup
_soup = BeautifulSoup
*4882a593Smuzhiyun
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn that this class is deprecated,
        and hand everything else to the BeautifulSoup constructor.

        Any `features` value supplied by the caller is overwritten.
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
*4882a593Smuzhiyun
*4882a593Smuzhiyun
class StopParsing(Exception):
    """Exception type defined by this module; it adds nothing to
    Exception and is not raised anywhere in this file."""
    pass
*4882a593Smuzhiyun
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no tree builder
    matches the requested features."""
    pass
*4882a593Smuzhiyun
*4882a593Smuzhiyun
# By default, act as an HTML pretty-printer: parse standard input and
# print the prettified document.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())