xref: /OK3568_Linux_fs/yocto/bitbake/lib/bs4/__init__.py (revision 4882a59341e53eb6f0b4789bf948001014eff981)
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
18*4882a593Smuzhiyun
# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

# Only the main entry point is exported by `from bs4 import *`.
__all__ = ['BeautifulSoup']
25*4882a593Smuzhiyun
26*4882a593Smuzhiyunimport os
27*4882a593Smuzhiyunimport re
28*4882a593Smuzhiyunimport warnings
29*4882a593Smuzhiyun
30*4882a593Smuzhiyunfrom .builder import builder_registry, ParserRejectedMarkup
31*4882a593Smuzhiyunfrom .dammit import UnicodeDammit
32*4882a593Smuzhiyunfrom .element import (
33*4882a593Smuzhiyun    CData,
34*4882a593Smuzhiyun    Comment,
35*4882a593Smuzhiyun    DEFAULT_OUTPUT_ENCODING,
36*4882a593Smuzhiyun    Declaration,
37*4882a593Smuzhiyun    Doctype,
38*4882a593Smuzhiyun    NavigableString,
39*4882a593Smuzhiyun    PageElement,
40*4882a593Smuzhiyun    ProcessingInstruction,
41*4882a593Smuzhiyun    ResultSet,
42*4882a593Smuzhiyun    SoupStrainer,
43*4882a593Smuzhiyun    Tag,
44*4882a593Smuzhiyun    )
45*4882a593Smuzhiyun
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# In this copy the two message strings are compared with `!=`, which is
# valid Python 3: the expression is evaluated and its result discarded,
# so the statement has no effect here.
# NOTE(review): presumably the unconverted Python 2 sources use an
# operator that is a syntax error on Python 3, so that the traceback
# displays these messages -- confirm against upstream.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49*4882a593Smuzhiyun
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, bytestring, or object with a read()
            method, holding the document to parse.
        :param features: A feature name or list of feature names used
            to look up a tree builder when `builder` is not given.
        :param builder: A tree builder instance to use directly.
        :param parse_only: A strainer consulted by endData() and
            handle_starttag() to decide what is added to the tree.
        :param from_encoding: Encoding passed to the builder's
            prepare_markup().
        :param exclude_encodings: Passed through to prepare_markup().
        :param kwargs: Only a fixed set of legacy argument names is
            tolerated (with a warning); anything else raises TypeError.
        :raises TypeError: On an unexpected keyword argument.
        :raises FeatureNotFound: If no tree builder matches `features`.
        """

        if 'convertEntities' in kwargs:
            # Remove the legacy argument like the other ignored ones
            # below; otherwise the leftover-kwargs check would turn
            # this warning into a TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            # Consume a renamed legacy keyword, warning about the rename.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            # Warn unless the caller named this exact parser.
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_not_markup(markup)

        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                # Try the next candidate produced by prepare_markup().
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _markup_for_display(markup):
        """Return `markup` as text suitable for a warning message.

        Bytestrings are decoded (replacing undecodable bytes) so the
        message does not show a b'...' repr.
        """
        if isinstance(markup, bytes):
            return markup.decode("utf8", "replace")
        return markup

    @classmethod
    def _warn_if_not_markup(cls, markup):
        """Warn if short `markup` looks like a filename or a URL.

        The markup itself is never modified; the caller parses it
        regardless of what is reported here.
        """
        if (isinstance(markup, str)
            and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            is_file = False
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % cls._markup_for_display(markup))

        # Compare like with like: a bytestring never equals a str
        # prefix, so each type needs its own literals.
        if isinstance(markup, bytes):
            space = b' '
            url_prefixes = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            url_prefixes = ("http:", "https:")
        else:
            return
        if markup.startswith(url_prefixes) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % cls._markup_for_display(markup))

    def __copy__(self):
        """Return a new soup built by re-parsing this one's output.

        The document is serialized as UTF-8 and parsed again with the
        same builder; the encoding is stated explicitly so the copy
        does not depend on encoding detection.
        """
        duplicate = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8')
        # The copy was parsed from our own UTF-8 output; keep reporting
        # the encoding recorded for the source document instead.
        duplicate.original_encoding = self.original_encoding
        return duplicate

    def __getstate__(self):
        """Return the instance dict for pickling, minus an unpicklable builder."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run the builder over self.markup and close anything left open."""
        # Start the builder from a clean state.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag, ready to parse."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Forget the last element of any previous (possibly rejected)
        # parse so the new tree is not linked to stale nodes.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Unsupported on the soup object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Unsupported on the soup object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag and return the new current tag.

        If the stack becomes empty, currentTag is left unchanged.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it current."""
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Flush buffered character data into the tree as one string node.

        :param containerClass: Class used to wrap the collected text.
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The element to add.
        :param parent: Element to append to; defaults to the current tag.
        :param most_recent_element: Element to treat as the one parsed
            just before `o`; defaults to the last element recorded.
        """
        # Use identity checks rather than truthiness: an element such
        # as an empty string is falsy but is still a real node.
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling is not None:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            #
            # `o` was just appended, so it is the last child. Searching
            # with contents.index(o) compares by equality and could
            # return an earlier, merely equal, node.
            index = len(parent.contents) - 1
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            next_element = parent.next_sibling
            next_sibling = None

            o.previous_element = previous_element
            if previous_element is not None:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element is not None:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            o.previous_sibling = previous_sibling
            if previous_sibling is not None:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call, or None if nothing
        was popped."""
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Walk from the innermost tag outward; index 0 (the root) is
        # never examined or popped.
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # Defensive check kept from the original; a constructor call
        # is not normally expected to produce None.
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data and close the most recent matching open tag."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Buffer a piece of character data until endData() is called."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        :param pretty_print: If true, output is indented starting at
            level 0; otherwise no indentation level is passed on.
        :param eventual_encoding: Encoding named in the XML declaration
            (omitted from it when None) and passed to Tag.decode().
        :param formatter: Passed through to Tag.decode().
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
441*4882a593Smuzhiyun
# Short aliases for the main class, to make imports easier to type,
# e.g. 'from bs4 import _soup'.
_s = BeautifulSoup
_soup = BeautifulSoup
445*4882a593Smuzhiyun
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML soup.

        Any `features` keyword supplied by the caller is overwritten
        with 'xml' before the arguments are passed on to
        BeautifulSoup.__init__().
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun
class StopParsing(Exception):
    """Exception type exported by this module; it adds nothing to Exception."""
459*4882a593Smuzhiyun
class FeatureNotFound(ValueError):
    """Raised when no tree builder matches the requested features.

    Subclasses ValueError, so callers catching ValueError also catch it.
    """
462*4882a593Smuzhiyun
463*4882a593Smuzhiyun
# By default, act as an HTML pretty-printer: read markup from standard
# input and print the prettified document to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())
468*4882a593Smuzhiyun    print(soup.prettify())
469