1*4882a593Smuzhiyun__license__ = "MIT" 2*4882a593Smuzhiyun 3*4882a593Smuzhiyunimport collections.abc 4*4882a593Smuzhiyunimport re 5*4882a593Smuzhiyunimport sys 6*4882a593Smuzhiyunimport warnings 7*4882a593Smuzhiyunfrom bs4.dammit import EntitySubstitution 8*4882a593Smuzhiyun 9*4882a593SmuzhiyunDEFAULT_OUTPUT_ENCODING = "utf-8" 10*4882a593SmuzhiyunPY3K = (sys.version_info[0] > 2) 11*4882a593Smuzhiyun 12*4882a593Smuzhiyunwhitespace_re = re.compile(r"\s+") 13*4882a593Smuzhiyun 14*4882a593Smuzhiyundef _alias(attr): 15*4882a593Smuzhiyun """Alias one attribute name to another for backward compatibility""" 16*4882a593Smuzhiyun @property 17*4882a593Smuzhiyun def alias(self): 18*4882a593Smuzhiyun return getattr(self, attr) 19*4882a593Smuzhiyun 20*4882a593Smuzhiyun @alias.setter 21*4882a593Smuzhiyun def alias(self): 22*4882a593Smuzhiyun return setattr(self, attr) 23*4882a593Smuzhiyun return alias 24*4882a593Smuzhiyun 25*4882a593Smuzhiyun 26*4882a593Smuzhiyunclass NamespacedAttribute(str): 27*4882a593Smuzhiyun 28*4882a593Smuzhiyun def __new__(cls, prefix, name, namespace=None): 29*4882a593Smuzhiyun if name is None: 30*4882a593Smuzhiyun obj = str.__new__(cls, prefix) 31*4882a593Smuzhiyun elif prefix is None: 32*4882a593Smuzhiyun # Not really namespaced. 33*4882a593Smuzhiyun obj = str.__new__(cls, name) 34*4882a593Smuzhiyun else: 35*4882a593Smuzhiyun obj = str.__new__(cls, prefix + ":" + name) 36*4882a593Smuzhiyun obj.prefix = prefix 37*4882a593Smuzhiyun obj.name = name 38*4882a593Smuzhiyun obj.namespace = namespace 39*4882a593Smuzhiyun return obj 40*4882a593Smuzhiyun 41*4882a593Smuzhiyunclass AttributeValueWithCharsetSubstitution(str): 42*4882a593Smuzhiyun """A stand-in object for a character encoding specified in HTML.""" 43*4882a593Smuzhiyun 44*4882a593Smuzhiyunclass CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 45*4882a593Smuzhiyun """A generic stand-in for the value of a meta tag's 'charset' attribute. 46*4882a593Smuzhiyun 47*4882a593Smuzhiyun When Beautiful Soup parses the markup '<meta charset="utf8">', the 48*4882a593Smuzhiyun value of the 'charset' attribute will be one of these objects. 49*4882a593Smuzhiyun """ 50*4882a593Smuzhiyun 51*4882a593Smuzhiyun def __new__(cls, original_value): 52*4882a593Smuzhiyun obj = str.__new__(cls, original_value) 53*4882a593Smuzhiyun obj.original_value = original_value 54*4882a593Smuzhiyun return obj 55*4882a593Smuzhiyun 56*4882a593Smuzhiyun def encode(self, encoding): 57*4882a593Smuzhiyun return encoding 58*4882a593Smuzhiyun 59*4882a593Smuzhiyun 60*4882a593Smuzhiyunclass ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): 61*4882a593Smuzhiyun """A generic stand-in for the value of a meta tag's 'content' attribute. 62*4882a593Smuzhiyun 63*4882a593Smuzhiyun When Beautiful Soup parses the markup: 64*4882a593Smuzhiyun <meta http-equiv="content-type" content="text/html; charset=utf8"> 65*4882a593Smuzhiyun 66*4882a593Smuzhiyun The value of the 'content' attribute will be one of these objects. 67*4882a593Smuzhiyun """ 68*4882a593Smuzhiyun 69*4882a593Smuzhiyun CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) 70*4882a593Smuzhiyun 71*4882a593Smuzhiyun def __new__(cls, original_value): 72*4882a593Smuzhiyun match = cls.CHARSET_RE.search(original_value) 73*4882a593Smuzhiyun if match is None: 74*4882a593Smuzhiyun # No substitution necessary. 75*4882a593Smuzhiyun return str.__new__(str, original_value) 76*4882a593Smuzhiyun 77*4882a593Smuzhiyun obj = str.__new__(cls, original_value) 78*4882a593Smuzhiyun obj.original_value = original_value 79*4882a593Smuzhiyun return obj 80*4882a593Smuzhiyun 81*4882a593Smuzhiyun def encode(self, encoding): 82*4882a593Smuzhiyun def rewrite(match): 83*4882a593Smuzhiyun return match.group(1) + encoding 84*4882a593Smuzhiyun return self.CHARSET_RE.sub(rewrite, self.original_value) 85*4882a593Smuzhiyun 86*4882a593Smuzhiyunclass HTMLAwareEntitySubstitution(EntitySubstitution): 87*4882a593Smuzhiyun 88*4882a593Smuzhiyun """Entity substitution rules that are aware of some HTML quirks. 89*4882a593Smuzhiyun 90*4882a593Smuzhiyun Specifically, the contents of <script> and <style> tags should not 91*4882a593Smuzhiyun undergo entity substitution. 92*4882a593Smuzhiyun 93*4882a593Smuzhiyun Incoming NavigableString objects are checked to see if they're the 94*4882a593Smuzhiyun direct children of a <script> or <style> tag. 95*4882a593Smuzhiyun """ 96*4882a593Smuzhiyun 97*4882a593Smuzhiyun cdata_containing_tags = set(["script", "style"]) 98*4882a593Smuzhiyun 99*4882a593Smuzhiyun preformatted_tags = set(["pre"]) 100*4882a593Smuzhiyun 101*4882a593Smuzhiyun @classmethod 102*4882a593Smuzhiyun def _substitute_if_appropriate(cls, ns, f): 103*4882a593Smuzhiyun if (isinstance(ns, NavigableString) 104*4882a593Smuzhiyun and ns.parent is not None 105*4882a593Smuzhiyun and ns.parent.name in cls.cdata_containing_tags): 106*4882a593Smuzhiyun # Do nothing. 107*4882a593Smuzhiyun return ns 108*4882a593Smuzhiyun # Substitute. 109*4882a593Smuzhiyun return f(ns) 110*4882a593Smuzhiyun 111*4882a593Smuzhiyun @classmethod 112*4882a593Smuzhiyun def substitute_html(cls, ns): 113*4882a593Smuzhiyun return cls._substitute_if_appropriate( 114*4882a593Smuzhiyun ns, EntitySubstitution.substitute_html) 115*4882a593Smuzhiyun 116*4882a593Smuzhiyun @classmethod 117*4882a593Smuzhiyun def substitute_xml(cls, ns): 118*4882a593Smuzhiyun return cls._substitute_if_appropriate( 119*4882a593Smuzhiyun ns, EntitySubstitution.substitute_xml) 120*4882a593Smuzhiyun 121*4882a593Smuzhiyunclass PageElement(object): 122*4882a593Smuzhiyun """Contains the navigational information for some part of the page 123*4882a593Smuzhiyun (either a tag or a piece of text)""" 124*4882a593Smuzhiyun 125*4882a593Smuzhiyun # There are five possible values for the "formatter" argument passed in 126*4882a593Smuzhiyun # to methods like encode() and prettify(): 127*4882a593Smuzhiyun # 128*4882a593Smuzhiyun # "html" - All Unicode characters with corresponding HTML entities 129*4882a593Smuzhiyun # are converted to those entities on output. 130*4882a593Smuzhiyun # "minimal" - Bare ampersands and angle brackets are converted to 131*4882a593Smuzhiyun # XML entities: & < > 132*4882a593Smuzhiyun # None - The null formatter. Unicode characters are never 133*4882a593Smuzhiyun # converted to entities. This is not recommended, but it's 134*4882a593Smuzhiyun # faster than "minimal". 135*4882a593Smuzhiyun # A function - This function will be called on every string that 136*4882a593Smuzhiyun # needs to undergo entity substitution. 137*4882a593Smuzhiyun # 138*4882a593Smuzhiyun 139*4882a593Smuzhiyun # In an HTML document, the default "html" and "minimal" functions 140*4882a593Smuzhiyun # will leave the contents of <script> and <style> tags alone. For 141*4882a593Smuzhiyun # an XML document, all tags will be given the same treatment. 142*4882a593Smuzhiyun 143*4882a593Smuzhiyun HTML_FORMATTERS = { 144*4882a593Smuzhiyun "html" : HTMLAwareEntitySubstitution.substitute_html, 145*4882a593Smuzhiyun "minimal" : HTMLAwareEntitySubstitution.substitute_xml, 146*4882a593Smuzhiyun None : None 147*4882a593Smuzhiyun } 148*4882a593Smuzhiyun 149*4882a593Smuzhiyun XML_FORMATTERS = { 150*4882a593Smuzhiyun "html" : EntitySubstitution.substitute_html, 151*4882a593Smuzhiyun "minimal" : EntitySubstitution.substitute_xml, 152*4882a593Smuzhiyun None : None 153*4882a593Smuzhiyun } 154*4882a593Smuzhiyun 155*4882a593Smuzhiyun def format_string(self, s, formatter='minimal'): 156*4882a593Smuzhiyun """Format the given string using the given formatter.""" 157*4882a593Smuzhiyun if not isinstance(formatter, collections.abc.Callable): 158*4882a593Smuzhiyun formatter = self._formatter_for_name(formatter) 159*4882a593Smuzhiyun if formatter is None: 160*4882a593Smuzhiyun output = s 161*4882a593Smuzhiyun else: 162*4882a593Smuzhiyun output = formatter(s) 163*4882a593Smuzhiyun return output 164*4882a593Smuzhiyun 165*4882a593Smuzhiyun @property 166*4882a593Smuzhiyun def _is_xml(self): 167*4882a593Smuzhiyun """Is this element part of an XML tree or an HTML tree? 168*4882a593Smuzhiyun 169*4882a593Smuzhiyun This is used when mapping a formatter name ("minimal") to an 170*4882a593Smuzhiyun appropriate function (one that performs entity-substitution on 171*4882a593Smuzhiyun the contents of <script> and <style> tags, or not). It's 172*4882a593Smuzhiyun inefficient, but it should be called very rarely. 173*4882a593Smuzhiyun """ 174*4882a593Smuzhiyun if self.parent is None: 175*4882a593Smuzhiyun # This is the top-level object. It should have .is_xml set 176*4882a593Smuzhiyun # from tree creation. If not, take a guess--BS is usually 177*4882a593Smuzhiyun # used on HTML markup. 178*4882a593Smuzhiyun return getattr(self, 'is_xml', False) 179*4882a593Smuzhiyun return self.parent._is_xml 180*4882a593Smuzhiyun 181*4882a593Smuzhiyun def _formatter_for_name(self, name): 182*4882a593Smuzhiyun "Look up a formatter function based on its name and the tree." 183*4882a593Smuzhiyun if self._is_xml: 184*4882a593Smuzhiyun return self.XML_FORMATTERS.get( 185*4882a593Smuzhiyun name, EntitySubstitution.substitute_xml) 186*4882a593Smuzhiyun else: 187*4882a593Smuzhiyun return self.HTML_FORMATTERS.get( 188*4882a593Smuzhiyun name, HTMLAwareEntitySubstitution.substitute_xml) 189*4882a593Smuzhiyun 190*4882a593Smuzhiyun def setup(self, parent=None, previous_element=None, next_element=None, 191*4882a593Smuzhiyun previous_sibling=None, next_sibling=None): 192*4882a593Smuzhiyun """Sets up the initial relations between this element and 193*4882a593Smuzhiyun other elements.""" 194*4882a593Smuzhiyun self.parent = parent 195*4882a593Smuzhiyun 196*4882a593Smuzhiyun self.previous_element = previous_element 197*4882a593Smuzhiyun if previous_element is not None: 198*4882a593Smuzhiyun self.previous_element.next_element = self 199*4882a593Smuzhiyun 200*4882a593Smuzhiyun self.next_element = next_element 201*4882a593Smuzhiyun if self.next_element: 202*4882a593Smuzhiyun self.next_element.previous_element = self 203*4882a593Smuzhiyun 204*4882a593Smuzhiyun self.next_sibling = next_sibling 205*4882a593Smuzhiyun if self.next_sibling: 206*4882a593Smuzhiyun self.next_sibling.previous_sibling = self 207*4882a593Smuzhiyun 208*4882a593Smuzhiyun if (not previous_sibling 209*4882a593Smuzhiyun and self.parent is not None and self.parent.contents): 210*4882a593Smuzhiyun previous_sibling = self.parent.contents[-1] 211*4882a593Smuzhiyun 212*4882a593Smuzhiyun self.previous_sibling = previous_sibling 213*4882a593Smuzhiyun if previous_sibling: 214*4882a593Smuzhiyun self.previous_sibling.next_sibling = self 215*4882a593Smuzhiyun 216*4882a593Smuzhiyun nextSibling = _alias("next_sibling") # BS3 217*4882a593Smuzhiyun previousSibling = _alias("previous_sibling") # BS3 218*4882a593Smuzhiyun 219*4882a593Smuzhiyun def replace_with(self, replace_with): 220*4882a593Smuzhiyun if not self.parent: 221*4882a593Smuzhiyun raise ValueError( 222*4882a593Smuzhiyun "Cannot replace one element with another when the" 223*4882a593Smuzhiyun "element to be replaced is not part of a tree.") 224*4882a593Smuzhiyun if replace_with is self: 225*4882a593Smuzhiyun return 226*4882a593Smuzhiyun if replace_with is self.parent: 227*4882a593Smuzhiyun raise ValueError("Cannot replace a Tag with its parent.") 228*4882a593Smuzhiyun old_parent = self.parent 229*4882a593Smuzhiyun my_index = self.parent.index(self) 230*4882a593Smuzhiyun self.extract() 231*4882a593Smuzhiyun old_parent.insert(my_index, replace_with) 232*4882a593Smuzhiyun return self 233*4882a593Smuzhiyun replaceWith = replace_with # BS3 234*4882a593Smuzhiyun 235*4882a593Smuzhiyun def unwrap(self): 236*4882a593Smuzhiyun my_parent = self.parent 237*4882a593Smuzhiyun if not self.parent: 238*4882a593Smuzhiyun raise ValueError( 239*4882a593Smuzhiyun "Cannot replace an element with its contents when that" 240*4882a593Smuzhiyun "element is not part of a tree.") 241*4882a593Smuzhiyun my_index = self.parent.index(self) 242*4882a593Smuzhiyun self.extract() 243*4882a593Smuzhiyun for child in reversed(self.contents[:]): 244*4882a593Smuzhiyun my_parent.insert(my_index, child) 245*4882a593Smuzhiyun return self 246*4882a593Smuzhiyun replace_with_children = unwrap 247*4882a593Smuzhiyun replaceWithChildren = unwrap # BS3 248*4882a593Smuzhiyun 249*4882a593Smuzhiyun def wrap(self, wrap_inside): 250*4882a593Smuzhiyun me = self.replace_with(wrap_inside) 251*4882a593Smuzhiyun wrap_inside.append(me) 252*4882a593Smuzhiyun return wrap_inside 253*4882a593Smuzhiyun 254*4882a593Smuzhiyun def extract(self): 255*4882a593Smuzhiyun """Destructively rips this element out of the tree.""" 256*4882a593Smuzhiyun if self.parent is not None: 257*4882a593Smuzhiyun del self.parent.contents[self.parent.index(self)] 258*4882a593Smuzhiyun 259*4882a593Smuzhiyun #Find the two elements that would be next to each other if 260*4882a593Smuzhiyun #this element (and any children) hadn't been parsed. Connect 261*4882a593Smuzhiyun #the two. 262*4882a593Smuzhiyun last_child = self._last_descendant() 263*4882a593Smuzhiyun next_element = last_child.next_element 264*4882a593Smuzhiyun 265*4882a593Smuzhiyun if (self.previous_element is not None and 266*4882a593Smuzhiyun self.previous_element is not next_element): 267*4882a593Smuzhiyun self.previous_element.next_element = next_element 268*4882a593Smuzhiyun if next_element is not None and next_element is not self.previous_element: 269*4882a593Smuzhiyun next_element.previous_element = self.previous_element 270*4882a593Smuzhiyun self.previous_element = None 271*4882a593Smuzhiyun last_child.next_element = None 272*4882a593Smuzhiyun 273*4882a593Smuzhiyun self.parent = None 274*4882a593Smuzhiyun if (self.previous_sibling is not None 275*4882a593Smuzhiyun and self.previous_sibling is not self.next_sibling): 276*4882a593Smuzhiyun self.previous_sibling.next_sibling = self.next_sibling 277*4882a593Smuzhiyun if (self.next_sibling is not None 278*4882a593Smuzhiyun and self.next_sibling is not self.previous_sibling): 279*4882a593Smuzhiyun self.next_sibling.previous_sibling = self.previous_sibling 280*4882a593Smuzhiyun self.previous_sibling = self.next_sibling = None 281*4882a593Smuzhiyun return self 282*4882a593Smuzhiyun 283*4882a593Smuzhiyun def _last_descendant(self, is_initialized=True, accept_self=True): 284*4882a593Smuzhiyun "Finds the last element beneath this object to be parsed." 285*4882a593Smuzhiyun if is_initialized and self.next_sibling: 286*4882a593Smuzhiyun last_child = self.next_sibling.previous_element 287*4882a593Smuzhiyun else: 288*4882a593Smuzhiyun last_child = self 289*4882a593Smuzhiyun while isinstance(last_child, Tag) and last_child.contents: 290*4882a593Smuzhiyun last_child = last_child.contents[-1] 291*4882a593Smuzhiyun if not accept_self and last_child is self: 292*4882a593Smuzhiyun last_child = None 293*4882a593Smuzhiyun return last_child 294*4882a593Smuzhiyun # BS3: Not part of the API! 295*4882a593Smuzhiyun _lastRecursiveChild = _last_descendant 296*4882a593Smuzhiyun 297*4882a593Smuzhiyun def insert(self, position, new_child): 298*4882a593Smuzhiyun if new_child is None: 299*4882a593Smuzhiyun raise ValueError("Cannot insert None into a tag.") 300*4882a593Smuzhiyun if new_child is self: 301*4882a593Smuzhiyun raise ValueError("Cannot insert a tag into itself.") 302*4882a593Smuzhiyun if (isinstance(new_child, str) 303*4882a593Smuzhiyun and not isinstance(new_child, NavigableString)): 304*4882a593Smuzhiyun new_child = NavigableString(new_child) 305*4882a593Smuzhiyun 306*4882a593Smuzhiyun position = min(position, len(self.contents)) 307*4882a593Smuzhiyun if hasattr(new_child, 'parent') and new_child.parent is not None: 308*4882a593Smuzhiyun # We're 'inserting' an element that's already one 309*4882a593Smuzhiyun # of this object's children. 310*4882a593Smuzhiyun if new_child.parent is self: 311*4882a593Smuzhiyun current_index = self.index(new_child) 312*4882a593Smuzhiyun if current_index < position: 313*4882a593Smuzhiyun # We're moving this element further down the list 314*4882a593Smuzhiyun # of this object's children. That means that when 315*4882a593Smuzhiyun # we extract this element, our target index will 316*4882a593Smuzhiyun # jump down one. 317*4882a593Smuzhiyun position -= 1 318*4882a593Smuzhiyun new_child.extract() 319*4882a593Smuzhiyun 320*4882a593Smuzhiyun new_child.parent = self 321*4882a593Smuzhiyun previous_child = None 322*4882a593Smuzhiyun if position == 0: 323*4882a593Smuzhiyun new_child.previous_sibling = None 324*4882a593Smuzhiyun new_child.previous_element = self 325*4882a593Smuzhiyun else: 326*4882a593Smuzhiyun previous_child = self.contents[position - 1] 327*4882a593Smuzhiyun new_child.previous_sibling = previous_child 328*4882a593Smuzhiyun new_child.previous_sibling.next_sibling = new_child 329*4882a593Smuzhiyun new_child.previous_element = previous_child._last_descendant(False) 330*4882a593Smuzhiyun if new_child.previous_element is not None: 331*4882a593Smuzhiyun new_child.previous_element.next_element = new_child 332*4882a593Smuzhiyun 333*4882a593Smuzhiyun new_childs_last_element = new_child._last_descendant(False) 334*4882a593Smuzhiyun 335*4882a593Smuzhiyun if position >= len(self.contents): 336*4882a593Smuzhiyun new_child.next_sibling = None 337*4882a593Smuzhiyun 338*4882a593Smuzhiyun parent = self 339*4882a593Smuzhiyun parents_next_sibling = None 340*4882a593Smuzhiyun while parents_next_sibling is None and parent is not None: 341*4882a593Smuzhiyun parents_next_sibling = parent.next_sibling 342*4882a593Smuzhiyun parent = parent.parent 343*4882a593Smuzhiyun if parents_next_sibling is not None: 344*4882a593Smuzhiyun # We found the element that comes next in the document. 345*4882a593Smuzhiyun break 346*4882a593Smuzhiyun if parents_next_sibling is not None: 347*4882a593Smuzhiyun new_childs_last_element.next_element = parents_next_sibling 348*4882a593Smuzhiyun else: 349*4882a593Smuzhiyun # The last element of this tag is the last element in 350*4882a593Smuzhiyun # the document. 351*4882a593Smuzhiyun new_childs_last_element.next_element = None 352*4882a593Smuzhiyun else: 353*4882a593Smuzhiyun next_child = self.contents[position] 354*4882a593Smuzhiyun new_child.next_sibling = next_child 355*4882a593Smuzhiyun if new_child.next_sibling is not None: 356*4882a593Smuzhiyun new_child.next_sibling.previous_sibling = new_child 357*4882a593Smuzhiyun new_childs_last_element.next_element = next_child 358*4882a593Smuzhiyun 359*4882a593Smuzhiyun if new_childs_last_element.next_element is not None: 360*4882a593Smuzhiyun new_childs_last_element.next_element.previous_element = new_childs_last_element 361*4882a593Smuzhiyun self.contents.insert(position, new_child) 362*4882a593Smuzhiyun 363*4882a593Smuzhiyun def append(self, tag): 364*4882a593Smuzhiyun """Appends the given tag to the contents of this tag.""" 365*4882a593Smuzhiyun self.insert(len(self.contents), tag) 366*4882a593Smuzhiyun 367*4882a593Smuzhiyun def insert_before(self, predecessor): 368*4882a593Smuzhiyun """Makes the given element the immediate predecessor of this one. 369*4882a593Smuzhiyun 370*4882a593Smuzhiyun The two elements will have the same parent, and the given element 371*4882a593Smuzhiyun will be immediately before this one. 372*4882a593Smuzhiyun """ 373*4882a593Smuzhiyun if self is predecessor: 374*4882a593Smuzhiyun raise ValueError("Can't insert an element before itself.") 375*4882a593Smuzhiyun parent = self.parent 376*4882a593Smuzhiyun if parent is None: 377*4882a593Smuzhiyun raise ValueError( 378*4882a593Smuzhiyun "Element has no parent, so 'before' has no meaning.") 379*4882a593Smuzhiyun # Extract first so that the index won't be screwed up if they 380*4882a593Smuzhiyun # are siblings. 381*4882a593Smuzhiyun if isinstance(predecessor, PageElement): 382*4882a593Smuzhiyun predecessor.extract() 383*4882a593Smuzhiyun index = parent.index(self) 384*4882a593Smuzhiyun parent.insert(index, predecessor) 385*4882a593Smuzhiyun 386*4882a593Smuzhiyun def insert_after(self, successor): 387*4882a593Smuzhiyun """Makes the given element the immediate successor of this one. 388*4882a593Smuzhiyun 389*4882a593Smuzhiyun The two elements will have the same parent, and the given element 390*4882a593Smuzhiyun will be immediately after this one. 391*4882a593Smuzhiyun """ 392*4882a593Smuzhiyun if self is successor: 393*4882a593Smuzhiyun raise ValueError("Can't insert an element after itself.") 394*4882a593Smuzhiyun parent = self.parent 395*4882a593Smuzhiyun if parent is None: 396*4882a593Smuzhiyun raise ValueError( 397*4882a593Smuzhiyun "Element has no parent, so 'after' has no meaning.") 398*4882a593Smuzhiyun # Extract first so that the index won't be screwed up if they 399*4882a593Smuzhiyun # are siblings. 400*4882a593Smuzhiyun if isinstance(successor, PageElement): 401*4882a593Smuzhiyun successor.extract() 402*4882a593Smuzhiyun index = parent.index(self) 403*4882a593Smuzhiyun parent.insert(index+1, successor) 404*4882a593Smuzhiyun 405*4882a593Smuzhiyun def find_next(self, name=None, attrs={}, text=None, **kwargs): 406*4882a593Smuzhiyun """Returns the first item that matches the given criteria and 407*4882a593Smuzhiyun appears after this Tag in the document.""" 408*4882a593Smuzhiyun return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 409*4882a593Smuzhiyun findNext = find_next # BS3 410*4882a593Smuzhiyun 411*4882a593Smuzhiyun def find_all_next(self, name=None, attrs={}, text=None, limit=None, 412*4882a593Smuzhiyun **kwargs): 413*4882a593Smuzhiyun """Returns all items that match the given criteria and appear 414*4882a593Smuzhiyun after this Tag in the document.""" 415*4882a593Smuzhiyun return self._find_all(name, attrs, text, limit, self.next_elements, 416*4882a593Smuzhiyun **kwargs) 417*4882a593Smuzhiyun findAllNext = find_all_next # BS3 418*4882a593Smuzhiyun 419*4882a593Smuzhiyun def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): 420*4882a593Smuzhiyun """Returns the closest sibling to this Tag that matches the 421*4882a593Smuzhiyun given criteria and appears after this Tag in the document.""" 422*4882a593Smuzhiyun return self._find_one(self.find_next_siblings, name, attrs, text, 423*4882a593Smuzhiyun **kwargs) 424*4882a593Smuzhiyun findNextSibling = find_next_sibling # BS3 425*4882a593Smuzhiyun 426*4882a593Smuzhiyun def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, 427*4882a593Smuzhiyun **kwargs): 428*4882a593Smuzhiyun """Returns the siblings of this Tag that match the given 429*4882a593Smuzhiyun criteria and appear after this Tag in the document.""" 430*4882a593Smuzhiyun return self._find_all(name, attrs, text, limit, 431*4882a593Smuzhiyun self.next_siblings, **kwargs) 432*4882a593Smuzhiyun findNextSiblings = find_next_siblings # BS3 433*4882a593Smuzhiyun fetchNextSiblings = find_next_siblings # BS2 434*4882a593Smuzhiyun 435*4882a593Smuzhiyun def find_previous(self, name=None, attrs={}, text=None, **kwargs): 436*4882a593Smuzhiyun """Returns the first item that matches the given criteria and 437*4882a593Smuzhiyun appears before this Tag in the document.""" 438*4882a593Smuzhiyun return self._find_one( 439*4882a593Smuzhiyun self.find_all_previous, name, attrs, text, **kwargs) 440*4882a593Smuzhiyun findPrevious = find_previous # BS3 441*4882a593Smuzhiyun 442*4882a593Smuzhiyun def find_all_previous(self, name=None, attrs={}, text=None, limit=None, 443*4882a593Smuzhiyun **kwargs): 444*4882a593Smuzhiyun """Returns all items that match the given criteria and appear 445*4882a593Smuzhiyun before this Tag in the document.""" 446*4882a593Smuzhiyun return self._find_all(name, attrs, text, limit, self.previous_elements, 447*4882a593Smuzhiyun **kwargs) 448*4882a593Smuzhiyun findAllPrevious = find_all_previous # BS3 449*4882a593Smuzhiyun fetchPrevious = find_all_previous # BS2 450*4882a593Smuzhiyun 451*4882a593Smuzhiyun def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): 452*4882a593Smuzhiyun """Returns the closest sibling to this Tag that matches the 453*4882a593Smuzhiyun given criteria and appears before this Tag in the document.""" 454*4882a593Smuzhiyun return self._find_one(self.find_previous_siblings, name, attrs, text, 455*4882a593Smuzhiyun **kwargs) 456*4882a593Smuzhiyun findPreviousSibling = find_previous_sibling # BS3 457*4882a593Smuzhiyun 458*4882a593Smuzhiyun def find_previous_siblings(self, name=None, attrs={}, text=None, 459*4882a593Smuzhiyun limit=None, **kwargs): 460*4882a593Smuzhiyun """Returns the siblings of this Tag that match the given 461*4882a593Smuzhiyun criteria and appear before this Tag in the document.""" 462*4882a593Smuzhiyun return self._find_all(name, attrs, text, limit, 463*4882a593Smuzhiyun self.previous_siblings, **kwargs) 464*4882a593Smuzhiyun findPreviousSiblings = find_previous_siblings # BS3 465*4882a593Smuzhiyun fetchPreviousSiblings = find_previous_siblings # BS2 466*4882a593Smuzhiyun 467*4882a593Smuzhiyun def find_parent(self, name=None, attrs={}, **kwargs): 468*4882a593Smuzhiyun """Returns the closest parent of this Tag that matches the given 469*4882a593Smuzhiyun criteria.""" 470*4882a593Smuzhiyun # NOTE: We can't use _find_one because findParents takes a different 471*4882a593Smuzhiyun # set of arguments. 472*4882a593Smuzhiyun r = None 473*4882a593Smuzhiyun l = self.find_parents(name, attrs, 1, **kwargs) 474*4882a593Smuzhiyun if l: 475*4882a593Smuzhiyun r = l[0] 476*4882a593Smuzhiyun return r 477*4882a593Smuzhiyun findParent = find_parent # BS3 478*4882a593Smuzhiyun 479*4882a593Smuzhiyun def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 480*4882a593Smuzhiyun """Returns the parents of this Tag that match the given 481*4882a593Smuzhiyun criteria.""" 482*4882a593Smuzhiyun 483*4882a593Smuzhiyun return self._find_all(name, attrs, None, limit, self.parents, 484*4882a593Smuzhiyun **kwargs) 485*4882a593Smuzhiyun findParents = find_parents # BS3 486*4882a593Smuzhiyun fetchParents = find_parents # BS2 487*4882a593Smuzhiyun 488*4882a593Smuzhiyun @property 489*4882a593Smuzhiyun def next(self): 490*4882a593Smuzhiyun return self.next_element 491*4882a593Smuzhiyun 492*4882a593Smuzhiyun @property 493*4882a593Smuzhiyun def previous(self): 494*4882a593Smuzhiyun return self.previous_element 495*4882a593Smuzhiyun 496*4882a593Smuzhiyun #These methods do the real heavy lifting. 497*4882a593Smuzhiyun 498*4882a593Smuzhiyun def _find_one(self, method, name, attrs, text, **kwargs): 499*4882a593Smuzhiyun r = None 500*4882a593Smuzhiyun l = method(name, attrs, text, 1, **kwargs) 501*4882a593Smuzhiyun if l: 502*4882a593Smuzhiyun r = l[0] 503*4882a593Smuzhiyun return r 504*4882a593Smuzhiyun 505*4882a593Smuzhiyun def _find_all(self, name, attrs, text, limit, generator, **kwargs): 506*4882a593Smuzhiyun "Iterates over a generator looking for things that match." 507*4882a593Smuzhiyun 508*4882a593Smuzhiyun if text is None and 'string' in kwargs: 509*4882a593Smuzhiyun text = kwargs['string'] 510*4882a593Smuzhiyun del kwargs['string'] 511*4882a593Smuzhiyun 512*4882a593Smuzhiyun if isinstance(name, SoupStrainer): 513*4882a593Smuzhiyun strainer = name 514*4882a593Smuzhiyun else: 515*4882a593Smuzhiyun strainer = SoupStrainer(name, attrs, text, **kwargs) 516*4882a593Smuzhiyun 517*4882a593Smuzhiyun if text is None and not limit and not attrs and not kwargs: 518*4882a593Smuzhiyun if name is True or name is None: 519*4882a593Smuzhiyun # Optimization to find all tags. 520*4882a593Smuzhiyun result = (element for element in generator 521*4882a593Smuzhiyun if isinstance(element, Tag)) 522*4882a593Smuzhiyun return ResultSet(strainer, result) 523*4882a593Smuzhiyun elif isinstance(name, str): 524*4882a593Smuzhiyun # Optimization to find all tags with a given name. 525*4882a593Smuzhiyun result = (element for element in generator 526*4882a593Smuzhiyun if isinstance(element, Tag) 527*4882a593Smuzhiyun and element.name == name) 528*4882a593Smuzhiyun return ResultSet(strainer, result) 529*4882a593Smuzhiyun results = ResultSet(strainer) 530*4882a593Smuzhiyun while True: 531*4882a593Smuzhiyun try: 532*4882a593Smuzhiyun i = next(generator) 533*4882a593Smuzhiyun except StopIteration: 534*4882a593Smuzhiyun break 535*4882a593Smuzhiyun if i: 536*4882a593Smuzhiyun found = strainer.search(i) 537*4882a593Smuzhiyun if found: 538*4882a593Smuzhiyun results.append(found) 539*4882a593Smuzhiyun if limit and len(results) >= limit: 540*4882a593Smuzhiyun break 541*4882a593Smuzhiyun return results 542*4882a593Smuzhiyun 543*4882a593Smuzhiyun #These generators can be used to navigate starting from both 544*4882a593Smuzhiyun #NavigableStrings and Tags. 545*4882a593Smuzhiyun @property 546*4882a593Smuzhiyun def next_elements(self): 547*4882a593Smuzhiyun i = self.next_element 548*4882a593Smuzhiyun while i is not None: 549*4882a593Smuzhiyun yield i 550*4882a593Smuzhiyun i = i.next_element 551*4882a593Smuzhiyun 552*4882a593Smuzhiyun @property 553*4882a593Smuzhiyun def next_siblings(self): 554*4882a593Smuzhiyun i = self.next_sibling 555*4882a593Smuzhiyun while i is not None: 556*4882a593Smuzhiyun yield i 557*4882a593Smuzhiyun i = i.next_sibling 558*4882a593Smuzhiyun 559*4882a593Smuzhiyun @property 560*4882a593Smuzhiyun def previous_elements(self): 561*4882a593Smuzhiyun i = self.previous_element 562*4882a593Smuzhiyun while i is not None: 563*4882a593Smuzhiyun yield i 564*4882a593Smuzhiyun i = i.previous_element 565*4882a593Smuzhiyun 566*4882a593Smuzhiyun @property 567*4882a593Smuzhiyun def previous_siblings(self): 568*4882a593Smuzhiyun i = self.previous_sibling 569*4882a593Smuzhiyun while i is not None: 570*4882a593Smuzhiyun yield i 571*4882a593Smuzhiyun i = i.previous_sibling 572*4882a593Smuzhiyun 573*4882a593Smuzhiyun @property 574*4882a593Smuzhiyun def parents(self): 575*4882a593Smuzhiyun i = self.parent 576*4882a593Smuzhiyun while i is not None: 577*4882a593Smuzhiyun yield i 578*4882a593Smuzhiyun i = i.parent 579*4882a593Smuzhiyun 580*4882a593Smuzhiyun # Methods for supporting CSS selectors. 581*4882a593Smuzhiyun 582*4882a593Smuzhiyun tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') 583*4882a593Smuzhiyun 584*4882a593Smuzhiyun # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 585*4882a593Smuzhiyun # \---------------------------/ \---/\-------------/ \-------/ 586*4882a593Smuzhiyun # | | | | 587*4882a593Smuzhiyun # | | | The value 588*4882a593Smuzhiyun # | | ~,|,^,$,* or = 589*4882a593Smuzhiyun # | Attribute 590*4882a593Smuzhiyun # Tag 591*4882a593Smuzhiyun attribselect_re = re.compile( 592*4882a593Smuzhiyun r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + 593*4882a593Smuzhiyun r'=?"?(?P<value>[^\]"]*)"?\]$' 594*4882a593Smuzhiyun ) 595*4882a593Smuzhiyun 596*4882a593Smuzhiyun def _attr_value_as_string(self, value, default=None): 597*4882a593Smuzhiyun """Force an attribute value into a string representation. 598*4882a593Smuzhiyun 599*4882a593Smuzhiyun A multi-valued attribute will be converted into a 600*4882a593Smuzhiyun space-separated stirng. 601*4882a593Smuzhiyun """ 602*4882a593Smuzhiyun value = self.get(value, default) 603*4882a593Smuzhiyun if isinstance(value, list) or isinstance(value, tuple): 604*4882a593Smuzhiyun value =" ".join(value) 605*4882a593Smuzhiyun return value 606*4882a593Smuzhiyun 607*4882a593Smuzhiyun def _tag_name_matches_and(self, function, tag_name): 608*4882a593Smuzhiyun if not tag_name: 609*4882a593Smuzhiyun return function 610*4882a593Smuzhiyun else: 611*4882a593Smuzhiyun def _match(tag): 612*4882a593Smuzhiyun return tag.name == tag_name and function(tag) 613*4882a593Smuzhiyun return _match 614*4882a593Smuzhiyun 615*4882a593Smuzhiyun def _attribute_checker(self, operator, attribute, value=''): 616*4882a593Smuzhiyun """Create a function that performs a CSS selector operation. 617*4882a593Smuzhiyun 618*4882a593Smuzhiyun Takes an operator, attribute and optional value. Returns a 619*4882a593Smuzhiyun function that will return True for elements that match that 620*4882a593Smuzhiyun combination. 621*4882a593Smuzhiyun """ 622*4882a593Smuzhiyun if operator == '=': 623*4882a593Smuzhiyun # string representation of `attribute` is equal to `value` 624*4882a593Smuzhiyun return lambda el: el._attr_value_as_string(attribute) == value 625*4882a593Smuzhiyun elif operator == '~': 626*4882a593Smuzhiyun # space-separated list representation of `attribute` 627*4882a593Smuzhiyun # contains `value` 628*4882a593Smuzhiyun def _includes_value(element): 629*4882a593Smuzhiyun attribute_value = element.get(attribute, []) 630*4882a593Smuzhiyun if not isinstance(attribute_value, list): 631*4882a593Smuzhiyun attribute_value = attribute_value.split() 632*4882a593Smuzhiyun return value in attribute_value 633*4882a593Smuzhiyun return _includes_value 634*4882a593Smuzhiyun elif operator == '^': 635*4882a593Smuzhiyun # string representation of `attribute` starts with `value` 636*4882a593Smuzhiyun return lambda el: el._attr_value_as_string( 637*4882a593Smuzhiyun attribute, '').startswith(value) 638*4882a593Smuzhiyun elif operator == '$': 639*4882a593Smuzhiyun # string represenation of `attribute` ends with `value` 640*4882a593Smuzhiyun return lambda el: el._attr_value_as_string( 641*4882a593Smuzhiyun attribute, '').endswith(value) 642*4882a593Smuzhiyun elif operator == '*': 643*4882a593Smuzhiyun # string representation of `attribute` contains `value` 644*4882a593Smuzhiyun return lambda el: value in el._attr_value_as_string(attribute, '') 645*4882a593Smuzhiyun elif operator == '|': 646*4882a593Smuzhiyun # string representation of `attribute` is either exactly 647*4882a593Smuzhiyun # `value` or starts with `value` and then a dash. 648*4882a593Smuzhiyun def _is_or_starts_with_dash(element): 649*4882a593Smuzhiyun attribute_value = element._attr_value_as_string(attribute, '') 650*4882a593Smuzhiyun return (attribute_value == value or attribute_value.startswith( 651*4882a593Smuzhiyun value + '-')) 652*4882a593Smuzhiyun return _is_or_starts_with_dash 653*4882a593Smuzhiyun else: 654*4882a593Smuzhiyun return lambda el: el.has_attr(attribute) 655*4882a593Smuzhiyun 656*4882a593Smuzhiyun # Old non-property versions of the generators, for backwards 657*4882a593Smuzhiyun # compatibility with BS3. 658*4882a593Smuzhiyun def nextGenerator(self): 659*4882a593Smuzhiyun return self.next_elements 660*4882a593Smuzhiyun 661*4882a593Smuzhiyun def nextSiblingGenerator(self): 662*4882a593Smuzhiyun return self.next_siblings 663*4882a593Smuzhiyun 664*4882a593Smuzhiyun def previousGenerator(self): 665*4882a593Smuzhiyun return self.previous_elements 666*4882a593Smuzhiyun 667*4882a593Smuzhiyun def previousSiblingGenerator(self): 668*4882a593Smuzhiyun return self.previous_siblings 669*4882a593Smuzhiyun 670*4882a593Smuzhiyun def parentGenerator(self): 671*4882a593Smuzhiyun return self.parents 672*4882a593Smuzhiyun 673*4882a593Smuzhiyun 674*4882a593Smuzhiyunclass NavigableString(str, PageElement): 675*4882a593Smuzhiyun 676*4882a593Smuzhiyun PREFIX = '' 677*4882a593Smuzhiyun SUFFIX = '' 678*4882a593Smuzhiyun 679*4882a593Smuzhiyun def __new__(cls, value): 680*4882a593Smuzhiyun """Create a new NavigableString. 681*4882a593Smuzhiyun 682*4882a593Smuzhiyun When unpickling a NavigableString, this method is called with 683*4882a593Smuzhiyun the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 684*4882a593Smuzhiyun passed in to the superclass's __new__ or the superclass won't know 685*4882a593Smuzhiyun how to handle non-ASCII characters. 686*4882a593Smuzhiyun """ 687*4882a593Smuzhiyun if isinstance(value, str): 688*4882a593Smuzhiyun u = str.__new__(cls, value) 689*4882a593Smuzhiyun else: 690*4882a593Smuzhiyun u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 691*4882a593Smuzhiyun u.setup() 692*4882a593Smuzhiyun return u 693*4882a593Smuzhiyun 694*4882a593Smuzhiyun def __copy__(self): 695*4882a593Smuzhiyun """A copy of a NavigableString has the same contents and class 696*4882a593Smuzhiyun as the original, but it is not connected to the parse tree. 697*4882a593Smuzhiyun """ 698*4882a593Smuzhiyun return type(self)(self) 699*4882a593Smuzhiyun 700*4882a593Smuzhiyun def __getnewargs__(self): 701*4882a593Smuzhiyun return (str(self),) 702*4882a593Smuzhiyun 703*4882a593Smuzhiyun def __getattr__(self, attr): 704*4882a593Smuzhiyun """text.string gives you text. This is for backwards 705*4882a593Smuzhiyun compatibility for Navigable*String, but for CData* it lets you 706*4882a593Smuzhiyun get the string without the CData wrapper.""" 707*4882a593Smuzhiyun if attr == 'string': 708*4882a593Smuzhiyun return self 709*4882a593Smuzhiyun else: 710*4882a593Smuzhiyun raise AttributeError( 711*4882a593Smuzhiyun "'%s' object has no attribute '%s'" % ( 712*4882a593Smuzhiyun self.__class__.__name__, attr)) 713*4882a593Smuzhiyun 714*4882a593Smuzhiyun def output_ready(self, formatter="minimal"): 715*4882a593Smuzhiyun output = self.format_string(self, formatter) 716*4882a593Smuzhiyun return self.PREFIX + output + self.SUFFIX 717*4882a593Smuzhiyun 718*4882a593Smuzhiyun @property 719*4882a593Smuzhiyun def name(self): 720*4882a593Smuzhiyun return None 721*4882a593Smuzhiyun 722*4882a593Smuzhiyun @name.setter 723*4882a593Smuzhiyun def name(self, name): 724*4882a593Smuzhiyun raise AttributeError("A NavigableString cannot be given a name.") 725*4882a593Smuzhiyun 726*4882a593Smuzhiyunclass PreformattedString(NavigableString): 727*4882a593Smuzhiyun """A NavigableString not subject to the normal formatting rules. 728*4882a593Smuzhiyun 729*4882a593Smuzhiyun The string will be passed into the formatter (to trigger side effects), 730*4882a593Smuzhiyun but the return value will be ignored. 731*4882a593Smuzhiyun """ 732*4882a593Smuzhiyun 733*4882a593Smuzhiyun def output_ready(self, formatter="minimal"): 734*4882a593Smuzhiyun """CData strings are passed into the formatter. 735*4882a593Smuzhiyun But the return value is ignored.""" 736*4882a593Smuzhiyun self.format_string(self, formatter) 737*4882a593Smuzhiyun return self.PREFIX + self + self.SUFFIX 738*4882a593Smuzhiyun 739*4882a593Smuzhiyunclass CData(PreformattedString): 740*4882a593Smuzhiyun 741*4882a593Smuzhiyun PREFIX = '<![CDATA[' 742*4882a593Smuzhiyun SUFFIX = ']]>' 743*4882a593Smuzhiyun 744*4882a593Smuzhiyunclass ProcessingInstruction(PreformattedString): 745*4882a593Smuzhiyun 746*4882a593Smuzhiyun PREFIX = '<?' 747*4882a593Smuzhiyun SUFFIX = '>' 748*4882a593Smuzhiyun 749*4882a593Smuzhiyunclass Comment(PreformattedString): 750*4882a593Smuzhiyun 751*4882a593Smuzhiyun PREFIX = '<!--' 752*4882a593Smuzhiyun SUFFIX = '-->' 753*4882a593Smuzhiyun 754*4882a593Smuzhiyun 755*4882a593Smuzhiyunclass Declaration(PreformattedString): 756*4882a593Smuzhiyun PREFIX = '<?' 757*4882a593Smuzhiyun SUFFIX = '?>' 758*4882a593Smuzhiyun 759*4882a593Smuzhiyun 760*4882a593Smuzhiyunclass Doctype(PreformattedString): 761*4882a593Smuzhiyun 762*4882a593Smuzhiyun @classmethod 763*4882a593Smuzhiyun def for_name_and_ids(cls, name, pub_id, system_id): 764*4882a593Smuzhiyun value = name or '' 765*4882a593Smuzhiyun if pub_id is not None: 766*4882a593Smuzhiyun value += ' PUBLIC "%s"' % pub_id 767*4882a593Smuzhiyun if system_id is not None: 768*4882a593Smuzhiyun value += ' "%s"' % system_id 769*4882a593Smuzhiyun elif system_id is not None: 770*4882a593Smuzhiyun value += ' SYSTEM "%s"' % system_id 771*4882a593Smuzhiyun 772*4882a593Smuzhiyun return Doctype(value) 773*4882a593Smuzhiyun 774*4882a593Smuzhiyun PREFIX = '<!DOCTYPE ' 775*4882a593Smuzhiyun SUFFIX = '>\n' 776*4882a593Smuzhiyun 777*4882a593Smuzhiyun 778*4882a593Smuzhiyunclass Tag(PageElement): 779*4882a593Smuzhiyun 780*4882a593Smuzhiyun """Represents a found HTML tag with its attributes and contents.""" 781*4882a593Smuzhiyun 782*4882a593Smuzhiyun def __init__(self, parser=None, builder=None, name=None, namespace=None, 783*4882a593Smuzhiyun prefix=None, attrs=None, parent=None, previous=None): 784*4882a593Smuzhiyun "Basic constructor." 785*4882a593Smuzhiyun 786*4882a593Smuzhiyun if parser is None: 787*4882a593Smuzhiyun self.parser_class = None 788*4882a593Smuzhiyun else: 789*4882a593Smuzhiyun # We don't actually store the parser object: that lets extracted 790*4882a593Smuzhiyun # chunks be garbage-collected. 791*4882a593Smuzhiyun self.parser_class = parser.__class__ 792*4882a593Smuzhiyun if name is None: 793*4882a593Smuzhiyun raise ValueError("No value provided for new tag's name.") 794*4882a593Smuzhiyun self.name = name 795*4882a593Smuzhiyun self.namespace = namespace 796*4882a593Smuzhiyun self.prefix = prefix 797*4882a593Smuzhiyun if attrs is None: 798*4882a593Smuzhiyun attrs = {} 799*4882a593Smuzhiyun elif attrs: 800*4882a593Smuzhiyun if builder is not None and builder.cdata_list_attributes: 801*4882a593Smuzhiyun attrs = builder._replace_cdata_list_attribute_values( 802*4882a593Smuzhiyun self.name, attrs) 803*4882a593Smuzhiyun else: 804*4882a593Smuzhiyun attrs = dict(attrs) 805*4882a593Smuzhiyun else: 806*4882a593Smuzhiyun attrs = dict(attrs) 807*4882a593Smuzhiyun self.attrs = attrs 808*4882a593Smuzhiyun self.contents = [] 809*4882a593Smuzhiyun self.setup(parent, previous) 810*4882a593Smuzhiyun self.hidden = False 811*4882a593Smuzhiyun 812*4882a593Smuzhiyun # Set up any substitutions, such as the charset in a META tag. 813*4882a593Smuzhiyun if builder is not None: 814*4882a593Smuzhiyun builder.set_up_substitutions(self) 815*4882a593Smuzhiyun self.can_be_empty_element = builder.can_be_empty_element(name) 816*4882a593Smuzhiyun else: 817*4882a593Smuzhiyun self.can_be_empty_element = False 818*4882a593Smuzhiyun 819*4882a593Smuzhiyun parserClass = _alias("parser_class") # BS3 820*4882a593Smuzhiyun 821*4882a593Smuzhiyun def __copy__(self): 822*4882a593Smuzhiyun """A copy of a Tag is a new Tag, unconnected to the parse tree. 823*4882a593Smuzhiyun Its contents are a copy of the old Tag's contents. 824*4882a593Smuzhiyun """ 825*4882a593Smuzhiyun clone = type(self)(None, self.builder, self.name, self.namespace, 826*4882a593Smuzhiyun self.nsprefix, self.attrs) 827*4882a593Smuzhiyun for attr in ('can_be_empty_element', 'hidden'): 828*4882a593Smuzhiyun setattr(clone, attr, getattr(self, attr)) 829*4882a593Smuzhiyun for child in self.contents: 830*4882a593Smuzhiyun clone.append(child.__copy__()) 831*4882a593Smuzhiyun return clone 832*4882a593Smuzhiyun 833*4882a593Smuzhiyun @property 834*4882a593Smuzhiyun def is_empty_element(self): 835*4882a593Smuzhiyun """Is this tag an empty-element tag? (aka a self-closing tag) 836*4882a593Smuzhiyun 837*4882a593Smuzhiyun A tag that has contents is never an empty-element tag. 838*4882a593Smuzhiyun 839*4882a593Smuzhiyun A tag that has no contents may or may not be an empty-element 840*4882a593Smuzhiyun tag. It depends on the builder used to create the tag. If the 841*4882a593Smuzhiyun builder has a designated list of empty-element tags, then only 842*4882a593Smuzhiyun a tag whose name shows up in that list is considered an 843*4882a593Smuzhiyun empty-element tag. 844*4882a593Smuzhiyun 845*4882a593Smuzhiyun If the builder has no designated list of empty-element tags, 846*4882a593Smuzhiyun then any tag with no contents is an empty-element tag. 847*4882a593Smuzhiyun """ 848*4882a593Smuzhiyun return len(self.contents) == 0 and self.can_be_empty_element 849*4882a593Smuzhiyun isSelfClosing = is_empty_element # BS3 850*4882a593Smuzhiyun 851*4882a593Smuzhiyun @property 852*4882a593Smuzhiyun def string(self): 853*4882a593Smuzhiyun """Convenience property to get the single string within this tag. 854*4882a593Smuzhiyun 855*4882a593Smuzhiyun :Return: If this tag has a single string child, return value 856*4882a593Smuzhiyun is that string. If this tag has no children, or more than one 857*4882a593Smuzhiyun child, return value is None. If this tag has one child tag, 858*4882a593Smuzhiyun return value is the 'string' attribute of the child tag, 859*4882a593Smuzhiyun recursively. 860*4882a593Smuzhiyun """ 861*4882a593Smuzhiyun if len(self.contents) != 1: 862*4882a593Smuzhiyun return None 863*4882a593Smuzhiyun child = self.contents[0] 864*4882a593Smuzhiyun if isinstance(child, NavigableString): 865*4882a593Smuzhiyun return child 866*4882a593Smuzhiyun return child.string 867*4882a593Smuzhiyun 868*4882a593Smuzhiyun @string.setter 869*4882a593Smuzhiyun def string(self, string): 870*4882a593Smuzhiyun self.clear() 871*4882a593Smuzhiyun self.append(string.__class__(string)) 872*4882a593Smuzhiyun 873*4882a593Smuzhiyun def _all_strings(self, strip=False, types=(NavigableString, CData)): 874*4882a593Smuzhiyun """Yield all strings of certain classes, possibly stripping them. 875*4882a593Smuzhiyun 876*4882a593Smuzhiyun By default, yields only NavigableString and CData objects. So 877*4882a593Smuzhiyun no comments, processing instructions, etc. 878*4882a593Smuzhiyun """ 879*4882a593Smuzhiyun for descendant in self.descendants: 880*4882a593Smuzhiyun if ( 881*4882a593Smuzhiyun (types is None and not isinstance(descendant, NavigableString)) 882*4882a593Smuzhiyun or 883*4882a593Smuzhiyun (types is not None and type(descendant) not in types)): 884*4882a593Smuzhiyun continue 885*4882a593Smuzhiyun if strip: 886*4882a593Smuzhiyun descendant = descendant.strip() 887*4882a593Smuzhiyun if len(descendant) == 0: 888*4882a593Smuzhiyun continue 889*4882a593Smuzhiyun yield descendant 890*4882a593Smuzhiyun 891*4882a593Smuzhiyun strings = property(_all_strings) 892*4882a593Smuzhiyun 893*4882a593Smuzhiyun @property 894*4882a593Smuzhiyun def stripped_strings(self): 895*4882a593Smuzhiyun for string in self._all_strings(True): 896*4882a593Smuzhiyun yield string 897*4882a593Smuzhiyun 898*4882a593Smuzhiyun def get_text(self, separator="", strip=False, 899*4882a593Smuzhiyun types=(NavigableString, CData)): 900*4882a593Smuzhiyun """ 901*4882a593Smuzhiyun Get all child strings, concatenated using the given separator. 902*4882a593Smuzhiyun """ 903*4882a593Smuzhiyun return separator.join([s for s in self._all_strings( 904*4882a593Smuzhiyun strip, types=types)]) 905*4882a593Smuzhiyun getText = get_text 906*4882a593Smuzhiyun text = property(get_text) 907*4882a593Smuzhiyun 908*4882a593Smuzhiyun def decompose(self): 909*4882a593Smuzhiyun """Recursively destroys the contents of this tree.""" 910*4882a593Smuzhiyun self.extract() 911*4882a593Smuzhiyun i = self 912*4882a593Smuzhiyun while i is not None: 913*4882a593Smuzhiyun next = i.next_element 914*4882a593Smuzhiyun i.__dict__.clear() 915*4882a593Smuzhiyun i.contents = [] 916*4882a593Smuzhiyun i = next 917*4882a593Smuzhiyun 918*4882a593Smuzhiyun def clear(self, decompose=False): 919*4882a593Smuzhiyun """ 920*4882a593Smuzhiyun Extract all children. If decompose is True, decompose instead. 921*4882a593Smuzhiyun """ 922*4882a593Smuzhiyun if decompose: 923*4882a593Smuzhiyun for element in self.contents[:]: 924*4882a593Smuzhiyun if isinstance(element, Tag): 925*4882a593Smuzhiyun element.decompose() 926*4882a593Smuzhiyun else: 927*4882a593Smuzhiyun element.extract() 928*4882a593Smuzhiyun else: 929*4882a593Smuzhiyun for element in self.contents[:]: 930*4882a593Smuzhiyun element.extract() 931*4882a593Smuzhiyun 932*4882a593Smuzhiyun def index(self, element): 933*4882a593Smuzhiyun """ 934*4882a593Smuzhiyun Find the index of a child by identity, not value. Avoids issues with 935*4882a593Smuzhiyun tag.contents.index(element) getting the index of equal elements. 936*4882a593Smuzhiyun """ 937*4882a593Smuzhiyun for i, child in enumerate(self.contents): 938*4882a593Smuzhiyun if child is element: 939*4882a593Smuzhiyun return i 940*4882a593Smuzhiyun raise ValueError("Tag.index: element not in tag") 941*4882a593Smuzhiyun 942*4882a593Smuzhiyun def get(self, key, default=None): 943*4882a593Smuzhiyun """Returns the value of the 'key' attribute for the tag, or 944*4882a593Smuzhiyun the value given for 'default' if it doesn't have that 945*4882a593Smuzhiyun attribute.""" 946*4882a593Smuzhiyun return self.attrs.get(key, default) 947*4882a593Smuzhiyun 948*4882a593Smuzhiyun def has_attr(self, key): 949*4882a593Smuzhiyun return key in self.attrs 950*4882a593Smuzhiyun 951*4882a593Smuzhiyun def __hash__(self): 952*4882a593Smuzhiyun return str(self).__hash__() 953*4882a593Smuzhiyun 954*4882a593Smuzhiyun def __getitem__(self, key): 955*4882a593Smuzhiyun """tag[key] returns the value of the 'key' attribute for the tag, 956*4882a593Smuzhiyun and throws an exception if it's not there.""" 957*4882a593Smuzhiyun return self.attrs[key] 958*4882a593Smuzhiyun 959*4882a593Smuzhiyun def __iter__(self): 960*4882a593Smuzhiyun "Iterating over a tag iterates over its contents." 961*4882a593Smuzhiyun return iter(self.contents) 962*4882a593Smuzhiyun 963*4882a593Smuzhiyun def __len__(self): 964*4882a593Smuzhiyun "The length of a tag is the length of its list of contents." 965*4882a593Smuzhiyun return len(self.contents) 966*4882a593Smuzhiyun 967*4882a593Smuzhiyun def __contains__(self, x): 968*4882a593Smuzhiyun return x in self.contents 969*4882a593Smuzhiyun 970*4882a593Smuzhiyun def __bool__(self): 971*4882a593Smuzhiyun "A tag is non-None even if it has no contents." 972*4882a593Smuzhiyun return True 973*4882a593Smuzhiyun 974*4882a593Smuzhiyun def __setitem__(self, key, value): 975*4882a593Smuzhiyun """Setting tag[key] sets the value of the 'key' attribute for the 976*4882a593Smuzhiyun tag.""" 977*4882a593Smuzhiyun self.attrs[key] = value 978*4882a593Smuzhiyun 979*4882a593Smuzhiyun def __delitem__(self, key): 980*4882a593Smuzhiyun "Deleting tag[key] deletes all 'key' attributes for the tag." 981*4882a593Smuzhiyun self.attrs.pop(key, None) 982*4882a593Smuzhiyun 983*4882a593Smuzhiyun def __call__(self, *args, **kwargs): 984*4882a593Smuzhiyun """Calling a tag like a function is the same as calling its 985*4882a593Smuzhiyun find_all() method. Eg. tag('a') returns a list of all the A tags 986*4882a593Smuzhiyun found within this tag.""" 987*4882a593Smuzhiyun return self.find_all(*args, **kwargs) 988*4882a593Smuzhiyun 989*4882a593Smuzhiyun def __getattr__(self, tag): 990*4882a593Smuzhiyun #print "Getattr %s.%s" % (self.__class__, tag) 991*4882a593Smuzhiyun if len(tag) > 3 and tag.endswith('Tag'): 992*4882a593Smuzhiyun # BS3: soup.aTag -> "soup.find("a") 993*4882a593Smuzhiyun tag_name = tag[:-3] 994*4882a593Smuzhiyun warnings.warn( 995*4882a593Smuzhiyun '.%sTag is deprecated, use .find("%s") instead.' % ( 996*4882a593Smuzhiyun tag_name, tag_name)) 997*4882a593Smuzhiyun return self.find(tag_name) 998*4882a593Smuzhiyun # We special case contents to avoid recursion. 999*4882a593Smuzhiyun elif not tag.startswith("__") and not tag=="contents": 1000*4882a593Smuzhiyun return self.find(tag) 1001*4882a593Smuzhiyun raise AttributeError( 1002*4882a593Smuzhiyun "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1003*4882a593Smuzhiyun 1004*4882a593Smuzhiyun def __eq__(self, other): 1005*4882a593Smuzhiyun """Returns true iff this tag has the same name, the same attributes, 1006*4882a593Smuzhiyun and the same contents (recursively) as the given tag.""" 1007*4882a593Smuzhiyun if self is other: 1008*4882a593Smuzhiyun return True 1009*4882a593Smuzhiyun if (not hasattr(other, 'name') or 1010*4882a593Smuzhiyun not hasattr(other, 'attrs') or 1011*4882a593Smuzhiyun not hasattr(other, 'contents') or 1012*4882a593Smuzhiyun self.name != other.name or 1013*4882a593Smuzhiyun self.attrs != other.attrs or 1014*4882a593Smuzhiyun len(self) != len(other)): 1015*4882a593Smuzhiyun return False 1016*4882a593Smuzhiyun for i, my_child in enumerate(self.contents): 1017*4882a593Smuzhiyun if my_child != other.contents[i]: 1018*4882a593Smuzhiyun return False 1019*4882a593Smuzhiyun return True 1020*4882a593Smuzhiyun 1021*4882a593Smuzhiyun def __ne__(self, other): 1022*4882a593Smuzhiyun """Returns true iff this tag is not identical to the other tag, 1023*4882a593Smuzhiyun as defined in __eq__.""" 1024*4882a593Smuzhiyun return not self == other 1025*4882a593Smuzhiyun 1026*4882a593Smuzhiyun def __repr__(self, encoding="unicode-escape"): 1027*4882a593Smuzhiyun """Renders this tag as a string.""" 1028*4882a593Smuzhiyun if PY3K: 1029*4882a593Smuzhiyun # "The return value must be a string object", i.e. Unicode 1030*4882a593Smuzhiyun return self.decode() 1031*4882a593Smuzhiyun else: 1032*4882a593Smuzhiyun # "The return value must be a string object", i.e. a bytestring. 1033*4882a593Smuzhiyun # By convention, the return value of __repr__ should also be 1034*4882a593Smuzhiyun # an ASCII string. 1035*4882a593Smuzhiyun return self.encode(encoding) 1036*4882a593Smuzhiyun 1037*4882a593Smuzhiyun def __unicode__(self): 1038*4882a593Smuzhiyun return self.decode() 1039*4882a593Smuzhiyun 1040*4882a593Smuzhiyun def __str__(self): 1041*4882a593Smuzhiyun if PY3K: 1042*4882a593Smuzhiyun return self.decode() 1043*4882a593Smuzhiyun else: 1044*4882a593Smuzhiyun return self.encode() 1045*4882a593Smuzhiyun 1046*4882a593Smuzhiyun if PY3K: 1047*4882a593Smuzhiyun __str__ = __repr__ = __unicode__ 1048*4882a593Smuzhiyun 1049*4882a593Smuzhiyun def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1050*4882a593Smuzhiyun indent_level=None, formatter="minimal", 1051*4882a593Smuzhiyun errors="xmlcharrefreplace"): 1052*4882a593Smuzhiyun # Turn the data structure into Unicode, then encode the 1053*4882a593Smuzhiyun # Unicode. 1054*4882a593Smuzhiyun u = self.decode(indent_level, encoding, formatter) 1055*4882a593Smuzhiyun return u.encode(encoding, errors) 1056*4882a593Smuzhiyun 1057*4882a593Smuzhiyun def _should_pretty_print(self, indent_level): 1058*4882a593Smuzhiyun """Should this tag be pretty-printed?""" 1059*4882a593Smuzhiyun return ( 1060*4882a593Smuzhiyun indent_level is not None and 1061*4882a593Smuzhiyun (self.name not in HTMLAwareEntitySubstitution.preformatted_tags 1062*4882a593Smuzhiyun or self._is_xml)) 1063*4882a593Smuzhiyun 1064*4882a593Smuzhiyun def decode(self, indent_level=None, 1065*4882a593Smuzhiyun eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1066*4882a593Smuzhiyun formatter="minimal"): 1067*4882a593Smuzhiyun """Returns a Unicode representation of this tag and its contents. 1068*4882a593Smuzhiyun 1069*4882a593Smuzhiyun :param eventual_encoding: The tag is destined to be 1070*4882a593Smuzhiyun encoded into this encoding. This method is _not_ 1071*4882a593Smuzhiyun responsible for performing that encoding. This information 1072*4882a593Smuzhiyun is passed in so that it can be substituted in if the 1073*4882a593Smuzhiyun document contains a <META> tag that mentions the document's 1074*4882a593Smuzhiyun encoding. 1075*4882a593Smuzhiyun """ 1076*4882a593Smuzhiyun 1077*4882a593Smuzhiyun # First off, turn a string formatter into a function. This 1078*4882a593Smuzhiyun # will stop the lookup from happening over and over again. 1079*4882a593Smuzhiyun if not isinstance(formatter, collections.abc.Callable): 1080*4882a593Smuzhiyun formatter = self._formatter_for_name(formatter) 1081*4882a593Smuzhiyun 1082*4882a593Smuzhiyun attrs = [] 1083*4882a593Smuzhiyun if self.attrs: 1084*4882a593Smuzhiyun for key, val in sorted(self.attrs.items()): 1085*4882a593Smuzhiyun if val is None: 1086*4882a593Smuzhiyun decoded = key 1087*4882a593Smuzhiyun else: 1088*4882a593Smuzhiyun if isinstance(val, list) or isinstance(val, tuple): 1089*4882a593Smuzhiyun val = ' '.join(val) 1090*4882a593Smuzhiyun elif not isinstance(val, str): 1091*4882a593Smuzhiyun val = str(val) 1092*4882a593Smuzhiyun elif ( 1093*4882a593Smuzhiyun isinstance(val, AttributeValueWithCharsetSubstitution) 1094*4882a593Smuzhiyun and eventual_encoding is not None): 1095*4882a593Smuzhiyun val = val.encode(eventual_encoding) 1096*4882a593Smuzhiyun 1097*4882a593Smuzhiyun text = self.format_string(val, formatter) 1098*4882a593Smuzhiyun decoded = ( 1099*4882a593Smuzhiyun str(key) + '=' 1100*4882a593Smuzhiyun + EntitySubstitution.quoted_attribute_value(text)) 1101*4882a593Smuzhiyun attrs.append(decoded) 1102*4882a593Smuzhiyun close = '' 1103*4882a593Smuzhiyun closeTag = '' 1104*4882a593Smuzhiyun 1105*4882a593Smuzhiyun prefix = '' 1106*4882a593Smuzhiyun if self.prefix: 1107*4882a593Smuzhiyun prefix = self.prefix + ":" 1108*4882a593Smuzhiyun 1109*4882a593Smuzhiyun if self.is_empty_element: 1110*4882a593Smuzhiyun close = '/' 1111*4882a593Smuzhiyun else: 1112*4882a593Smuzhiyun closeTag = '</%s%s>' % (prefix, self.name) 1113*4882a593Smuzhiyun 1114*4882a593Smuzhiyun pretty_print = self._should_pretty_print(indent_level) 1115*4882a593Smuzhiyun space = '' 1116*4882a593Smuzhiyun indent_space = '' 1117*4882a593Smuzhiyun if indent_level is not None: 1118*4882a593Smuzhiyun indent_space = (' ' * (indent_level - 1)) 1119*4882a593Smuzhiyun if pretty_print: 1120*4882a593Smuzhiyun space = indent_space 1121*4882a593Smuzhiyun indent_contents = indent_level + 1 1122*4882a593Smuzhiyun else: 1123*4882a593Smuzhiyun indent_contents = None 1124*4882a593Smuzhiyun contents = self.decode_contents( 1125*4882a593Smuzhiyun indent_contents, eventual_encoding, formatter) 1126*4882a593Smuzhiyun 1127*4882a593Smuzhiyun if self.hidden: 1128*4882a593Smuzhiyun # This is the 'document root' object. 1129*4882a593Smuzhiyun s = contents 1130*4882a593Smuzhiyun else: 1131*4882a593Smuzhiyun s = [] 1132*4882a593Smuzhiyun attribute_string = '' 1133*4882a593Smuzhiyun if attrs: 1134*4882a593Smuzhiyun attribute_string = ' ' + ' '.join(attrs) 1135*4882a593Smuzhiyun if indent_level is not None: 1136*4882a593Smuzhiyun # Even if this particular tag is not pretty-printed, 1137*4882a593Smuzhiyun # we should indent up to the start of the tag. 1138*4882a593Smuzhiyun s.append(indent_space) 1139*4882a593Smuzhiyun s.append('<%s%s%s%s>' % ( 1140*4882a593Smuzhiyun prefix, self.name, attribute_string, close)) 1141*4882a593Smuzhiyun if pretty_print: 1142*4882a593Smuzhiyun s.append("\n") 1143*4882a593Smuzhiyun s.append(contents) 1144*4882a593Smuzhiyun if pretty_print and contents and contents[-1] != "\n": 1145*4882a593Smuzhiyun s.append("\n") 1146*4882a593Smuzhiyun if pretty_print and closeTag: 1147*4882a593Smuzhiyun s.append(space) 1148*4882a593Smuzhiyun s.append(closeTag) 1149*4882a593Smuzhiyun if indent_level is not None and closeTag and self.next_sibling: 1150*4882a593Smuzhiyun # Even if this particular tag is not pretty-printed, 1151*4882a593Smuzhiyun # we're now done with the tag, and we should add a 1152*4882a593Smuzhiyun # newline if appropriate. 1153*4882a593Smuzhiyun s.append("\n") 1154*4882a593Smuzhiyun s = ''.join(s) 1155*4882a593Smuzhiyun return s 1156*4882a593Smuzhiyun 1157*4882a593Smuzhiyun def prettify(self, encoding=None, formatter="minimal"): 1158*4882a593Smuzhiyun if encoding is None: 1159*4882a593Smuzhiyun return self.decode(True, formatter=formatter) 1160*4882a593Smuzhiyun else: 1161*4882a593Smuzhiyun return self.encode(encoding, True, formatter=formatter) 1162*4882a593Smuzhiyun 1163*4882a593Smuzhiyun def decode_contents(self, indent_level=None, 1164*4882a593Smuzhiyun eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1165*4882a593Smuzhiyun formatter="minimal"): 1166*4882a593Smuzhiyun """Renders the contents of this tag as a Unicode string. 1167*4882a593Smuzhiyun 1168*4882a593Smuzhiyun :param indent_level: Each line of the rendering will be 1169*4882a593Smuzhiyun indented this many spaces. 1170*4882a593Smuzhiyun 1171*4882a593Smuzhiyun :param eventual_encoding: The tag is destined to be 1172*4882a593Smuzhiyun encoded into this encoding. This method is _not_ 1173*4882a593Smuzhiyun responsible for performing that encoding. This information 1174*4882a593Smuzhiyun is passed in so that it can be substituted in if the 1175*4882a593Smuzhiyun document contains a <META> tag that mentions the document's 1176*4882a593Smuzhiyun encoding. 1177*4882a593Smuzhiyun 1178*4882a593Smuzhiyun :param formatter: The output formatter responsible for converting 1179*4882a593Smuzhiyun entities to Unicode characters. 1180*4882a593Smuzhiyun """ 1181*4882a593Smuzhiyun # First off, turn a string formatter into a function. This 1182*4882a593Smuzhiyun # will stop the lookup from happening over and over again. 1183*4882a593Smuzhiyun if not isinstance(formatter, collections.abc.Callable): 1184*4882a593Smuzhiyun formatter = self._formatter_for_name(formatter) 1185*4882a593Smuzhiyun 1186*4882a593Smuzhiyun pretty_print = (indent_level is not None) 1187*4882a593Smuzhiyun s = [] 1188*4882a593Smuzhiyun for c in self: 1189*4882a593Smuzhiyun text = None 1190*4882a593Smuzhiyun if isinstance(c, NavigableString): 1191*4882a593Smuzhiyun text = c.output_ready(formatter) 1192*4882a593Smuzhiyun elif isinstance(c, Tag): 1193*4882a593Smuzhiyun s.append(c.decode(indent_level, eventual_encoding, 1194*4882a593Smuzhiyun formatter)) 1195*4882a593Smuzhiyun if text and indent_level and not self.name == 'pre': 1196*4882a593Smuzhiyun text = text.strip() 1197*4882a593Smuzhiyun if text: 1198*4882a593Smuzhiyun if pretty_print and not self.name == 'pre': 1199*4882a593Smuzhiyun s.append(" " * (indent_level - 1)) 1200*4882a593Smuzhiyun s.append(text) 1201*4882a593Smuzhiyun if pretty_print and not self.name == 'pre': 1202*4882a593Smuzhiyun s.append("\n") 1203*4882a593Smuzhiyun return ''.join(s) 1204*4882a593Smuzhiyun 1205*4882a593Smuzhiyun def encode_contents( 1206*4882a593Smuzhiyun self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1207*4882a593Smuzhiyun formatter="minimal"): 1208*4882a593Smuzhiyun """Renders the contents of this tag as a bytestring. 1209*4882a593Smuzhiyun 1210*4882a593Smuzhiyun :param indent_level: Each line of the rendering will be 1211*4882a593Smuzhiyun indented this many spaces. 1212*4882a593Smuzhiyun 1213*4882a593Smuzhiyun :param eventual_encoding: The bytestring will be in this encoding. 1214*4882a593Smuzhiyun 1215*4882a593Smuzhiyun :param formatter: The output formatter responsible for converting 1216*4882a593Smuzhiyun entities to Unicode characters. 1217*4882a593Smuzhiyun """ 1218*4882a593Smuzhiyun 1219*4882a593Smuzhiyun contents = self.decode_contents(indent_level, encoding, formatter) 1220*4882a593Smuzhiyun return contents.encode(encoding) 1221*4882a593Smuzhiyun 1222*4882a593Smuzhiyun # Old method for BS3 compatibility 1223*4882a593Smuzhiyun def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1224*4882a593Smuzhiyun prettyPrint=False, indentLevel=0): 1225*4882a593Smuzhiyun if not prettyPrint: 1226*4882a593Smuzhiyun indentLevel = None 1227*4882a593Smuzhiyun return self.encode_contents( 1228*4882a593Smuzhiyun indent_level=indentLevel, encoding=encoding) 1229*4882a593Smuzhiyun 1230*4882a593Smuzhiyun #Soup methods 1231*4882a593Smuzhiyun 1232*4882a593Smuzhiyun def find(self, name=None, attrs={}, recursive=True, text=None, 1233*4882a593Smuzhiyun **kwargs): 1234*4882a593Smuzhiyun """Return only the first child of this Tag matching the given 1235*4882a593Smuzhiyun criteria.""" 1236*4882a593Smuzhiyun r = None 1237*4882a593Smuzhiyun l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 1238*4882a593Smuzhiyun if l: 1239*4882a593Smuzhiyun r = l[0] 1240*4882a593Smuzhiyun return r 1241*4882a593Smuzhiyun findChild = find 1242*4882a593Smuzhiyun 1243*4882a593Smuzhiyun def find_all(self, name=None, attrs={}, recursive=True, text=None, 1244*4882a593Smuzhiyun limit=None, **kwargs): 1245*4882a593Smuzhiyun """Extracts a list of Tag objects that match the given 1246*4882a593Smuzhiyun criteria. You can specify the name of the Tag and any 1247*4882a593Smuzhiyun attributes you want the Tag to have. 1248*4882a593Smuzhiyun 1249*4882a593Smuzhiyun The value of a key-value pair in the 'attrs' map can be a 1250*4882a593Smuzhiyun string, a list of strings, a regular expression object, or a 1251*4882a593Smuzhiyun callable that takes a string and returns whether or not the 1252*4882a593Smuzhiyun string matches for some custom definition of 'matches'. The 1253*4882a593Smuzhiyun same is true of the tag name.""" 1254*4882a593Smuzhiyun 1255*4882a593Smuzhiyun generator = self.descendants 1256*4882a593Smuzhiyun if not recursive: 1257*4882a593Smuzhiyun generator = self.children 1258*4882a593Smuzhiyun return self._find_all(name, attrs, text, limit, generator, **kwargs) 1259*4882a593Smuzhiyun findAll = find_all # BS3 1260*4882a593Smuzhiyun findChildren = find_all # BS2 1261*4882a593Smuzhiyun 1262*4882a593Smuzhiyun #Generator methods 1263*4882a593Smuzhiyun @property 1264*4882a593Smuzhiyun def children(self): 1265*4882a593Smuzhiyun # return iter() to make the purpose of the method clear 1266*4882a593Smuzhiyun return iter(self.contents) # XXX This seems to be untested. 1267*4882a593Smuzhiyun 1268*4882a593Smuzhiyun @property 1269*4882a593Smuzhiyun def descendants(self): 1270*4882a593Smuzhiyun if not len(self.contents): 1271*4882a593Smuzhiyun return 1272*4882a593Smuzhiyun stopNode = self._last_descendant().next_element 1273*4882a593Smuzhiyun current = self.contents[0] 1274*4882a593Smuzhiyun while current is not stopNode: 1275*4882a593Smuzhiyun yield current 1276*4882a593Smuzhiyun current = current.next_element 1277*4882a593Smuzhiyun 1278*4882a593Smuzhiyun # CSS selector code 1279*4882a593Smuzhiyun 1280*4882a593Smuzhiyun _selector_combinators = ['>', '+', '~'] 1281*4882a593Smuzhiyun _select_debug = False 1282*4882a593Smuzhiyun def select_one(self, selector): 1283*4882a593Smuzhiyun """Perform a CSS selection operation on the current element.""" 1284*4882a593Smuzhiyun value = self.select(selector, limit=1) 1285*4882a593Smuzhiyun if value: 1286*4882a593Smuzhiyun return value[0] 1287*4882a593Smuzhiyun return None 1288*4882a593Smuzhiyun 1289*4882a593Smuzhiyun def select(self, selector, _candidate_generator=None, limit=None): 1290*4882a593Smuzhiyun """Perform a CSS selection operation on the current element.""" 1291*4882a593Smuzhiyun 1292*4882a593Smuzhiyun # Handle grouping selectors if ',' exists, ie: p,a 1293*4882a593Smuzhiyun if ',' in selector: 1294*4882a593Smuzhiyun context = [] 1295*4882a593Smuzhiyun for partial_selector in selector.split(','): 1296*4882a593Smuzhiyun partial_selector = partial_selector.strip() 1297*4882a593Smuzhiyun if partial_selector == '': 1298*4882a593Smuzhiyun raise ValueError('Invalid group selection syntax: %s' % selector) 1299*4882a593Smuzhiyun candidates = self.select(partial_selector, limit=limit) 1300*4882a593Smuzhiyun for candidate in candidates: 1301*4882a593Smuzhiyun if candidate not in context: 1302*4882a593Smuzhiyun context.append(candidate) 1303*4882a593Smuzhiyun 1304*4882a593Smuzhiyun if limit and len(context) >= limit: 1305*4882a593Smuzhiyun break 1306*4882a593Smuzhiyun return context 1307*4882a593Smuzhiyun 1308*4882a593Smuzhiyun tokens = selector.split() 1309*4882a593Smuzhiyun current_context = [self] 1310*4882a593Smuzhiyun 1311*4882a593Smuzhiyun if tokens[-1] in self._selector_combinators: 1312*4882a593Smuzhiyun raise ValueError( 1313*4882a593Smuzhiyun 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1314*4882a593Smuzhiyun 1315*4882a593Smuzhiyun if self._select_debug: 1316*4882a593Smuzhiyun print('Running CSS selector "%s"' % selector) 1317*4882a593Smuzhiyun 1318*4882a593Smuzhiyun for index, token in enumerate(tokens): 1319*4882a593Smuzhiyun new_context = [] 1320*4882a593Smuzhiyun new_context_ids = set([]) 1321*4882a593Smuzhiyun 1322*4882a593Smuzhiyun if tokens[index-1] in self._selector_combinators: 1323*4882a593Smuzhiyun # This token was consumed by the previous combinator. Skip it. 1324*4882a593Smuzhiyun if self._select_debug: 1325*4882a593Smuzhiyun print(' Token was consumed by the previous combinator.') 1326*4882a593Smuzhiyun continue 1327*4882a593Smuzhiyun 1328*4882a593Smuzhiyun if self._select_debug: 1329*4882a593Smuzhiyun print(' Considering token "%s"' % token) 1330*4882a593Smuzhiyun recursive_candidate_generator = None 1331*4882a593Smuzhiyun tag_name = None 1332*4882a593Smuzhiyun 1333*4882a593Smuzhiyun # Each operation corresponds to a checker function, a rule 1334*4882a593Smuzhiyun # for determining whether a candidate matches the 1335*4882a593Smuzhiyun # selector. Candidates are generated by the active 1336*4882a593Smuzhiyun # iterator. 1337*4882a593Smuzhiyun checker = None 1338*4882a593Smuzhiyun 1339*4882a593Smuzhiyun m = self.attribselect_re.match(token) 1340*4882a593Smuzhiyun if m is not None: 1341*4882a593Smuzhiyun # Attribute selector 1342*4882a593Smuzhiyun tag_name, attribute, operator, value = m.groups() 1343*4882a593Smuzhiyun checker = self._attribute_checker(operator, attribute, value) 1344*4882a593Smuzhiyun 1345*4882a593Smuzhiyun elif '#' in token: 1346*4882a593Smuzhiyun # ID selector 1347*4882a593Smuzhiyun tag_name, tag_id = token.split('#', 1) 1348*4882a593Smuzhiyun def id_matches(tag): 1349*4882a593Smuzhiyun return tag.get('id', None) == tag_id 1350*4882a593Smuzhiyun checker = id_matches 1351*4882a593Smuzhiyun 1352*4882a593Smuzhiyun elif '.' in token: 1353*4882a593Smuzhiyun # Class selector 1354*4882a593Smuzhiyun tag_name, klass = token.split('.', 1) 1355*4882a593Smuzhiyun classes = set(klass.split('.')) 1356*4882a593Smuzhiyun def classes_match(candidate): 1357*4882a593Smuzhiyun return classes.issubset(candidate.get('class', [])) 1358*4882a593Smuzhiyun checker = classes_match 1359*4882a593Smuzhiyun 1360*4882a593Smuzhiyun elif ':' in token: 1361*4882a593Smuzhiyun # Pseudo-class 1362*4882a593Smuzhiyun tag_name, pseudo = token.split(':', 1) 1363*4882a593Smuzhiyun if tag_name == '': 1364*4882a593Smuzhiyun raise ValueError( 1365*4882a593Smuzhiyun "A pseudo-class must be prefixed with a tag name.") 1366*4882a593Smuzhiyun pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1367*4882a593Smuzhiyun found = [] 1368*4882a593Smuzhiyun if pseudo_attributes is None: 1369*4882a593Smuzhiyun pseudo_type = pseudo 1370*4882a593Smuzhiyun pseudo_value = None 1371*4882a593Smuzhiyun else: 1372*4882a593Smuzhiyun pseudo_type, pseudo_value = pseudo_attributes.groups() 1373*4882a593Smuzhiyun if pseudo_type == 'nth-of-type': 1374*4882a593Smuzhiyun try: 1375*4882a593Smuzhiyun pseudo_value = int(pseudo_value) 1376*4882a593Smuzhiyun except: 1377*4882a593Smuzhiyun raise NotImplementedError( 1378*4882a593Smuzhiyun 'Only numeric values are currently supported for the nth-of-type pseudo-class.') 1379*4882a593Smuzhiyun if pseudo_value < 1: 1380*4882a593Smuzhiyun raise ValueError( 1381*4882a593Smuzhiyun 'nth-of-type pseudo-class value must be at least 1.') 1382*4882a593Smuzhiyun class Counter(object): 1383*4882a593Smuzhiyun def __init__(self, destination): 1384*4882a593Smuzhiyun self.count = 0 1385*4882a593Smuzhiyun self.destination = destination 1386*4882a593Smuzhiyun 1387*4882a593Smuzhiyun def nth_child_of_type(self, tag): 1388*4882a593Smuzhiyun self.count += 1 1389*4882a593Smuzhiyun if self.count == self.destination: 1390*4882a593Smuzhiyun return True 1391*4882a593Smuzhiyun if self.count > self.destination: 1392*4882a593Smuzhiyun # Stop the generator that's sending us 1393*4882a593Smuzhiyun # these things. 1394*4882a593Smuzhiyun raise StopIteration() 1395*4882a593Smuzhiyun return False 1396*4882a593Smuzhiyun checker = Counter(pseudo_value).nth_child_of_type 1397*4882a593Smuzhiyun else: 1398*4882a593Smuzhiyun raise NotImplementedError( 1399*4882a593Smuzhiyun 'Only the following pseudo-classes are implemented: nth-of-type.') 1400*4882a593Smuzhiyun 1401*4882a593Smuzhiyun elif token == '*': 1402*4882a593Smuzhiyun # Star selector -- matches everything 1403*4882a593Smuzhiyun pass 1404*4882a593Smuzhiyun elif token == '>': 1405*4882a593Smuzhiyun # Run the next token as a CSS selector against the 1406*4882a593Smuzhiyun # direct children of each tag in the current context. 1407*4882a593Smuzhiyun recursive_candidate_generator = lambda tag: tag.children 1408*4882a593Smuzhiyun elif token == '~': 1409*4882a593Smuzhiyun # Run the next token as a CSS selector against the 1410*4882a593Smuzhiyun # siblings of each tag in the current context. 1411*4882a593Smuzhiyun recursive_candidate_generator = lambda tag: tag.next_siblings 1412*4882a593Smuzhiyun elif token == '+': 1413*4882a593Smuzhiyun # For each tag in the current context, run the next 1414*4882a593Smuzhiyun # token as a CSS selector against the tag's next 1415*4882a593Smuzhiyun # sibling that's a tag. 1416*4882a593Smuzhiyun def next_tag_sibling(tag): 1417*4882a593Smuzhiyun yield tag.find_next_sibling(True) 1418*4882a593Smuzhiyun recursive_candidate_generator = next_tag_sibling 1419*4882a593Smuzhiyun 1420*4882a593Smuzhiyun elif self.tag_name_re.match(token): 1421*4882a593Smuzhiyun # Just a tag name. 1422*4882a593Smuzhiyun tag_name = token 1423*4882a593Smuzhiyun else: 1424*4882a593Smuzhiyun raise ValueError( 1425*4882a593Smuzhiyun 'Unsupported or invalid CSS selector: "%s"' % token) 1426*4882a593Smuzhiyun if recursive_candidate_generator: 1427*4882a593Smuzhiyun # This happens when the selector looks like "> foo". 1428*4882a593Smuzhiyun # 1429*4882a593Smuzhiyun # The generator calls select() recursively on every 1430*4882a593Smuzhiyun # member of the current context, passing in a different 1431*4882a593Smuzhiyun # candidate generator and a different selector. 1432*4882a593Smuzhiyun # 1433*4882a593Smuzhiyun # In the case of "> foo", the candidate generator is 1434*4882a593Smuzhiyun # one that yields a tag's direct children (">"), and 1435*4882a593Smuzhiyun # the selector is "foo". 1436*4882a593Smuzhiyun next_token = tokens[index+1] 1437*4882a593Smuzhiyun def recursive_select(tag): 1438*4882a593Smuzhiyun if self._select_debug: 1439*4882a593Smuzhiyun print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) 1440*4882a593Smuzhiyun print('-' * 40) 1441*4882a593Smuzhiyun for i in tag.select(next_token, recursive_candidate_generator): 1442*4882a593Smuzhiyun if self._select_debug: 1443*4882a593Smuzhiyun print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) 1444*4882a593Smuzhiyun yield i 1445*4882a593Smuzhiyun if self._select_debug: 1446*4882a593Smuzhiyun print('-' * 40) 1447*4882a593Smuzhiyun _use_candidate_generator = recursive_select 1448*4882a593Smuzhiyun elif _candidate_generator is None: 1449*4882a593Smuzhiyun # By default, a tag's candidates are all of its 1450*4882a593Smuzhiyun # children. If tag_name is defined, only yield tags 1451*4882a593Smuzhiyun # with that name. 1452*4882a593Smuzhiyun if self._select_debug: 1453*4882a593Smuzhiyun if tag_name: 1454*4882a593Smuzhiyun check = "[any]" 1455*4882a593Smuzhiyun else: 1456*4882a593Smuzhiyun check = tag_name 1457*4882a593Smuzhiyun print(' Default candidate generator, tag name="%s"' % check) 1458*4882a593Smuzhiyun if self._select_debug: 1459*4882a593Smuzhiyun # This is redundant with later code, but it stops 1460*4882a593Smuzhiyun # a bunch of bogus tags from cluttering up the 1461*4882a593Smuzhiyun # debug log. 1462*4882a593Smuzhiyun def default_candidate_generator(tag): 1463*4882a593Smuzhiyun for child in tag.descendants: 1464*4882a593Smuzhiyun if not isinstance(child, Tag): 1465*4882a593Smuzhiyun continue 1466*4882a593Smuzhiyun if tag_name and not child.name == tag_name: 1467*4882a593Smuzhiyun continue 1468*4882a593Smuzhiyun yield child 1469*4882a593Smuzhiyun _use_candidate_generator = default_candidate_generator 1470*4882a593Smuzhiyun else: 1471*4882a593Smuzhiyun _use_candidate_generator = lambda tag: tag.descendants 1472*4882a593Smuzhiyun else: 1473*4882a593Smuzhiyun _use_candidate_generator = _candidate_generator 1474*4882a593Smuzhiyun 1475*4882a593Smuzhiyun count = 0 1476*4882a593Smuzhiyun for tag in current_context: 1477*4882a593Smuzhiyun if self._select_debug: 1478*4882a593Smuzhiyun print(" Running candidate generator on %s %s" % ( 1479*4882a593Smuzhiyun tag.name, repr(tag.attrs))) 1480*4882a593Smuzhiyun for candidate in _use_candidate_generator(tag): 1481*4882a593Smuzhiyun if not isinstance(candidate, Tag): 1482*4882a593Smuzhiyun continue 1483*4882a593Smuzhiyun if tag_name and candidate.name != tag_name: 1484*4882a593Smuzhiyun continue 1485*4882a593Smuzhiyun if checker is not None: 1486*4882a593Smuzhiyun try: 1487*4882a593Smuzhiyun result = checker(candidate) 1488*4882a593Smuzhiyun except StopIteration: 1489*4882a593Smuzhiyun # The checker has decided we should no longer 1490*4882a593Smuzhiyun # run the generator. 1491*4882a593Smuzhiyun break 1492*4882a593Smuzhiyun if checker is None or result: 1493*4882a593Smuzhiyun if self._select_debug: 1494*4882a593Smuzhiyun print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) 1495*4882a593Smuzhiyun if id(candidate) not in new_context_ids: 1496*4882a593Smuzhiyun # If a tag matches a selector more than once, 1497*4882a593Smuzhiyun # don't include it in the context more than once. 1498*4882a593Smuzhiyun new_context.append(candidate) 1499*4882a593Smuzhiyun new_context_ids.add(id(candidate)) 1500*4882a593Smuzhiyun if limit and len(new_context) >= limit: 1501*4882a593Smuzhiyun break 1502*4882a593Smuzhiyun elif self._select_debug: 1503*4882a593Smuzhiyun print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) 1504*4882a593Smuzhiyun 1505*4882a593Smuzhiyun 1506*4882a593Smuzhiyun current_context = new_context 1507*4882a593Smuzhiyun 1508*4882a593Smuzhiyun if self._select_debug: 1509*4882a593Smuzhiyun print("Final verdict:") 1510*4882a593Smuzhiyun for i in current_context: 1511*4882a593Smuzhiyun print(" %s %s" % (i.name, i.attrs)) 1512*4882a593Smuzhiyun return current_context 1513*4882a593Smuzhiyun 1514*4882a593Smuzhiyun # Old names for backwards compatibility 1515*4882a593Smuzhiyun def childGenerator(self): 1516*4882a593Smuzhiyun return self.children 1517*4882a593Smuzhiyun 1518*4882a593Smuzhiyun def recursiveChildGenerator(self): 1519*4882a593Smuzhiyun return self.descendants 1520*4882a593Smuzhiyun 1521*4882a593Smuzhiyun def has_key(self, key): 1522*4882a593Smuzhiyun """This was kind of misleading because has_key() (attributes) 1523*4882a593Smuzhiyun was different from __in__ (contents). has_key() is gone in 1524*4882a593Smuzhiyun Python 3, anyway.""" 1525*4882a593Smuzhiyun warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 1526*4882a593Smuzhiyun key)) 1527*4882a593Smuzhiyun return self.has_attr(key) 1528*4882a593Smuzhiyun 1529*4882a593Smuzhiyun# Next, a couple classes to represent queries and their results. 1530*4882a593Smuzhiyunclass SoupStrainer(object): 1531*4882a593Smuzhiyun """Encapsulates a number of ways of matching a markup element (tag or 1532*4882a593Smuzhiyun text).""" 1533*4882a593Smuzhiyun 1534*4882a593Smuzhiyun def __init__(self, name=None, attrs={}, text=None, **kwargs): 1535*4882a593Smuzhiyun self.name = self._normalize_search_value(name) 1536*4882a593Smuzhiyun if not isinstance(attrs, dict): 1537*4882a593Smuzhiyun # Treat a non-dict value for attrs as a search for the 'class' 1538*4882a593Smuzhiyun # attribute. 1539*4882a593Smuzhiyun kwargs['class'] = attrs 1540*4882a593Smuzhiyun attrs = None 1541*4882a593Smuzhiyun 1542*4882a593Smuzhiyun if 'class_' in kwargs: 1543*4882a593Smuzhiyun # Treat class_="foo" as a search for the 'class' 1544*4882a593Smuzhiyun # attribute, overriding any non-dict value for attrs. 1545*4882a593Smuzhiyun kwargs['class'] = kwargs['class_'] 1546*4882a593Smuzhiyun del kwargs['class_'] 1547*4882a593Smuzhiyun 1548*4882a593Smuzhiyun if kwargs: 1549*4882a593Smuzhiyun if attrs: 1550*4882a593Smuzhiyun attrs = attrs.copy() 1551*4882a593Smuzhiyun attrs.update(kwargs) 1552*4882a593Smuzhiyun else: 1553*4882a593Smuzhiyun attrs = kwargs 1554*4882a593Smuzhiyun normalized_attrs = {} 1555*4882a593Smuzhiyun for key, value in list(attrs.items()): 1556*4882a593Smuzhiyun normalized_attrs[key] = self._normalize_search_value(value) 1557*4882a593Smuzhiyun 1558*4882a593Smuzhiyun self.attrs = normalized_attrs 1559*4882a593Smuzhiyun self.text = self._normalize_search_value(text) 1560*4882a593Smuzhiyun 1561*4882a593Smuzhiyun def _normalize_search_value(self, value): 1562*4882a593Smuzhiyun # Leave it alone if it's a Unicode string, a callable, a 1563*4882a593Smuzhiyun # regular expression, a boolean, or None. 1564*4882a593Smuzhiyun if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') 1565*4882a593Smuzhiyun or isinstance(value, bool) or value is None): 1566*4882a593Smuzhiyun return value 1567*4882a593Smuzhiyun 1568*4882a593Smuzhiyun # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 1569*4882a593Smuzhiyun if isinstance(value, bytes): 1570*4882a593Smuzhiyun return value.decode("utf8") 1571*4882a593Smuzhiyun 1572*4882a593Smuzhiyun # If it's listlike, convert it into a list of strings. 1573*4882a593Smuzhiyun if hasattr(value, '__iter__'): 1574*4882a593Smuzhiyun new_value = [] 1575*4882a593Smuzhiyun for v in value: 1576*4882a593Smuzhiyun if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1577*4882a593Smuzhiyun and not isinstance(v, str)): 1578*4882a593Smuzhiyun # This is almost certainly the user's mistake. In the 1579*4882a593Smuzhiyun # interests of avoiding infinite loops, we'll let 1580*4882a593Smuzhiyun # it through as-is rather than doing a recursive call. 1581*4882a593Smuzhiyun new_value.append(v) 1582*4882a593Smuzhiyun else: 1583*4882a593Smuzhiyun new_value.append(self._normalize_search_value(v)) 1584*4882a593Smuzhiyun return new_value 1585*4882a593Smuzhiyun 1586*4882a593Smuzhiyun # Otherwise, convert it into a Unicode string. 1587*4882a593Smuzhiyun # The unicode(str()) thing is so this will do the same thing on Python 2 1588*4882a593Smuzhiyun # and Python 3. 1589*4882a593Smuzhiyun return str(str(value)) 1590*4882a593Smuzhiyun 1591*4882a593Smuzhiyun def __str__(self): 1592*4882a593Smuzhiyun if self.text: 1593*4882a593Smuzhiyun return self.text 1594*4882a593Smuzhiyun else: 1595*4882a593Smuzhiyun return "%s|%s" % (self.name, self.attrs) 1596*4882a593Smuzhiyun 1597*4882a593Smuzhiyun def search_tag(self, markup_name=None, markup_attrs={}): 1598*4882a593Smuzhiyun found = None 1599*4882a593Smuzhiyun markup = None 1600*4882a593Smuzhiyun if isinstance(markup_name, Tag): 1601*4882a593Smuzhiyun markup = markup_name 1602*4882a593Smuzhiyun markup_attrs = markup 1603*4882a593Smuzhiyun call_function_with_tag_data = ( 1604*4882a593Smuzhiyun isinstance(self.name, collections.abc.Callable) 1605*4882a593Smuzhiyun and not isinstance(markup_name, Tag)) 1606*4882a593Smuzhiyun 1607*4882a593Smuzhiyun if ((not self.name) 1608*4882a593Smuzhiyun or call_function_with_tag_data 1609*4882a593Smuzhiyun or (markup and self._matches(markup, self.name)) 1610*4882a593Smuzhiyun or (not markup and self._matches(markup_name, self.name))): 1611*4882a593Smuzhiyun if call_function_with_tag_data: 1612*4882a593Smuzhiyun match = self.name(markup_name, markup_attrs) 1613*4882a593Smuzhiyun else: 1614*4882a593Smuzhiyun match = True 1615*4882a593Smuzhiyun markup_attr_map = None 1616*4882a593Smuzhiyun for attr, match_against in list(self.attrs.items()): 1617*4882a593Smuzhiyun if not markup_attr_map: 1618*4882a593Smuzhiyun if hasattr(markup_attrs, 'get'): 1619*4882a593Smuzhiyun markup_attr_map = markup_attrs 1620*4882a593Smuzhiyun else: 1621*4882a593Smuzhiyun markup_attr_map = {} 1622*4882a593Smuzhiyun for k, v in markup_attrs: 1623*4882a593Smuzhiyun markup_attr_map[k] = v 1624*4882a593Smuzhiyun attr_value = markup_attr_map.get(attr) 1625*4882a593Smuzhiyun if not self._matches(attr_value, match_against): 1626*4882a593Smuzhiyun match = False 1627*4882a593Smuzhiyun break 1628*4882a593Smuzhiyun if match: 1629*4882a593Smuzhiyun if markup: 1630*4882a593Smuzhiyun found = markup 1631*4882a593Smuzhiyun else: 1632*4882a593Smuzhiyun found = markup_name 1633*4882a593Smuzhiyun if found and self.text and not self._matches(found.string, self.text): 1634*4882a593Smuzhiyun found = None 1635*4882a593Smuzhiyun return found 1636*4882a593Smuzhiyun searchTag = search_tag 1637*4882a593Smuzhiyun 1638*4882a593Smuzhiyun def search(self, markup): 1639*4882a593Smuzhiyun # print 'looking for %s in %s' % (self, markup) 1640*4882a593Smuzhiyun found = None 1641*4882a593Smuzhiyun # If given a list of items, scan it for a text element that 1642*4882a593Smuzhiyun # matches. 1643*4882a593Smuzhiyun if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): 1644*4882a593Smuzhiyun for element in markup: 1645*4882a593Smuzhiyun if isinstance(element, NavigableString) \ 1646*4882a593Smuzhiyun and self.search(element): 1647*4882a593Smuzhiyun found = element 1648*4882a593Smuzhiyun break 1649*4882a593Smuzhiyun # If it's a Tag, make sure its name or attributes match. 1650*4882a593Smuzhiyun # Don't bother with Tags if we're searching for text. 1651*4882a593Smuzhiyun elif isinstance(markup, Tag): 1652*4882a593Smuzhiyun if not self.text or self.name or self.attrs: 1653*4882a593Smuzhiyun found = self.search_tag(markup) 1654*4882a593Smuzhiyun # If it's text, make sure the text matches. 1655*4882a593Smuzhiyun elif isinstance(markup, NavigableString) or \ 1656*4882a593Smuzhiyun isinstance(markup, str): 1657*4882a593Smuzhiyun if not self.name and not self.attrs and self._matches(markup, self.text): 1658*4882a593Smuzhiyun found = markup 1659*4882a593Smuzhiyun else: 1660*4882a593Smuzhiyun raise Exception( 1661*4882a593Smuzhiyun "I don't know how to match against a %s" % markup.__class__) 1662*4882a593Smuzhiyun return found 1663*4882a593Smuzhiyun 1664*4882a593Smuzhiyun def _matches(self, markup, match_against): 1665*4882a593Smuzhiyun # print u"Matching %s against %s" % (markup, match_against) 1666*4882a593Smuzhiyun result = False 1667*4882a593Smuzhiyun if isinstance(markup, list) or isinstance(markup, tuple): 1668*4882a593Smuzhiyun # This should only happen when searching a multi-valued attribute 1669*4882a593Smuzhiyun # like 'class'. 1670*4882a593Smuzhiyun if (isinstance(match_against, str) 1671*4882a593Smuzhiyun and ' ' in match_against): 1672*4882a593Smuzhiyun # A bit of a special case. If they try to match "foo 1673*4882a593Smuzhiyun # bar" on a multivalue attribute's value, only accept 1674*4882a593Smuzhiyun # the literal value "foo bar" 1675*4882a593Smuzhiyun # 1676*4882a593Smuzhiyun # XXX This is going to be pretty slow because we keep 1677*4882a593Smuzhiyun # splitting match_against. But it shouldn't come up 1678*4882a593Smuzhiyun # too often. 1679*4882a593Smuzhiyun return (whitespace_re.split(match_against) == markup) 1680*4882a593Smuzhiyun else: 1681*4882a593Smuzhiyun for item in markup: 1682*4882a593Smuzhiyun if self._matches(item, match_against): 1683*4882a593Smuzhiyun return True 1684*4882a593Smuzhiyun return False 1685*4882a593Smuzhiyun 1686*4882a593Smuzhiyun if match_against is True: 1687*4882a593Smuzhiyun # True matches any non-None value. 1688*4882a593Smuzhiyun return markup is not None 1689*4882a593Smuzhiyun 1690*4882a593Smuzhiyun if isinstance(match_against, collections.abc.Callable): 1691*4882a593Smuzhiyun return match_against(markup) 1692*4882a593Smuzhiyun 1693*4882a593Smuzhiyun # Custom callables take the tag as an argument, but all 1694*4882a593Smuzhiyun # other ways of matching match the tag name as a string. 1695*4882a593Smuzhiyun if isinstance(markup, Tag): 1696*4882a593Smuzhiyun markup = markup.name 1697*4882a593Smuzhiyun 1698*4882a593Smuzhiyun # Ensure that `markup` is either a Unicode string, or None. 1699*4882a593Smuzhiyun markup = self._normalize_search_value(markup) 1700*4882a593Smuzhiyun 1701*4882a593Smuzhiyun if markup is None: 1702*4882a593Smuzhiyun # None matches None, False, an empty string, an empty list, and so on. 1703*4882a593Smuzhiyun return not match_against 1704*4882a593Smuzhiyun 1705*4882a593Smuzhiyun if isinstance(match_against, str): 1706*4882a593Smuzhiyun # Exact string match 1707*4882a593Smuzhiyun return markup == match_against 1708*4882a593Smuzhiyun 1709*4882a593Smuzhiyun if hasattr(match_against, 'match'): 1710*4882a593Smuzhiyun # Regexp match 1711*4882a593Smuzhiyun return match_against.search(markup) 1712*4882a593Smuzhiyun 1713*4882a593Smuzhiyun if hasattr(match_against, '__iter__'): 1714*4882a593Smuzhiyun # The markup must be an exact match against something 1715*4882a593Smuzhiyun # in the iterable. 1716*4882a593Smuzhiyun return markup in match_against 1717*4882a593Smuzhiyun 1718*4882a593Smuzhiyun 1719*4882a593Smuzhiyunclass ResultSet(list): 1720*4882a593Smuzhiyun """A ResultSet is just a list that keeps track of the SoupStrainer 1721*4882a593Smuzhiyun that created it.""" 1722*4882a593Smuzhiyun def __init__(self, source, result=()): 1723*4882a593Smuzhiyun super(ResultSet, self).__init__(result) 1724*4882a593Smuzhiyun self.source = source 1725