Lines Matching full:encoding
7 XML or HTML to reflect a new encoding; that's the tree builder's job.
23 return cchardet.detect(s)['encoding']
31 return chardet.detect(s)['encoding']
40 r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
177 This differs from data.encode(encoding, 'xmlcharrefreplace')
197 2. An encoding declared within the bytestring itself, either in an
202 3. An encoding detected through textual analysis by chardet,
221 def _usable(self, encoding, tried): argument
222 if encoding is not None:
223 encoding = encoding.lower()
224 if encoding in self.exclude_encodings:
226 if encoding not in tried:
227 tried.add(encoding)
240 # that indicated its encoding?
244 # Look within the document for an XML or HTML encoding
253 # encoding.
266 """If a byte-order mark is present, strip it and return the encoding it implies."""
267 encoding = None
270 return data, encoding
273 encoding = 'utf-16be'
277 encoding = 'utf-16le'
280 encoding = 'utf-8'
283 encoding = 'utf-32be'
286 encoding = 'utf-32le'
288 return data, encoding
292 """Given a document, tries to find its declared encoding.
294 An XML encoding is declared at the beginning of the document.
296 An HTML encoding is declared in a <meta> tag, hopefully near the
317 """A class for detecting the encoding of a *ML document and
318 converting it to a Unicode string. If the source encoding is
352 # The encoding detector may have stripped a byte-order mark.
357 for encoding in self.detector.encodings:
359 u = self._convert_from(encoding)
367 for encoding in self.detector.encodings:
368 if encoding != "ascii":
369 u = self._convert_from(encoding, "replace")
407 # Convert smart quotes to HTML if coming from an encoding
425 #print "Correct encoding: %s" % proposed
428 def _to_unicode(self, data, encoding, errors="strict"): argument
429 '''Given a string and its encoding, decodes the string into Unicode.
430 %encoding is a string recognized by encodings.aliases'''
431 return str(data, encoding, errors)
773 """Fix characters from one encoding embedded in some other encoding.
793 "UTF-8 is the only currently supported main encoding.")