本文整理匯總了Python中lxml.html.HTMLParser方法的典型用法代碼示例。如果您正苦於以下問題:Python html.HTMLParser方法的具體用法?Python html.HTMLParser怎麽用?Python html.HTMLParser使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類lxml.html的用法示例。
在下文中一共展示了html.HTMLParser方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: parse_rsc_html
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_rsc_html(htmlstring):
    """Parse messy RSC HTML and wrap orphan text in ``p.otherpara`` elements.

    The encoding is detected with ``UnicodeDammit`` and the markup is parsed with a
    recovering ``HTMLParser``. Inside the element with id ``wrapper``, non-blank text
    that trails a block element is moved into a new ``<p class="otherpara">``. The
    following siblings are appended to that paragraph until a block element, a child
    whose id starts with ``sect``, or the last child is reached, at which point the
    paragraph is inserted before that child.

    :param htmlstring: Raw HTML markup.
    :returns: Root element of the parsed and repaired document.
    :raises UnicodeDecodeError: If ``UnicodeDammit`` produced no unicode markup.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError requires (encoding, object, start, end, reason); passing a
        # single message raises TypeError instead of the intended exception.
        tried = ', '.join(str(enc) for enc in getattr(converted, 'tried_encodings', ()))
        raw = htmlstring if isinstance(htmlstring, bytes) else b''
        raise UnicodeDecodeError(
            converted.original_encoding or 'unknown',
            raw,
            0,
            len(raw),
            'Failed to detect encoding, tried [%s]' % tried,
        )
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                # The run of orphan content ends here: place the paragraph before this child.
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            # 'class' is a Python keyword, so the attribute is passed via dict unpacking.
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
示例2: parse_html_string
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_html_string(s):
    """Parse *s* as a full HTML document with a UTF-8 ``HTMLParser``.

    Returns whatever ``lxml.html.document_fromstring`` produces for *s*.
    """
    from lxml import html as lxml_html

    parser_for_utf8 = lxml_html.HTMLParser(encoding='utf-8')
    return lxml_html.document_fromstring(s, parser=parser_for_utf8)
示例3: check_same_tpl
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def check_same_tpl(html_a, html_b):
    """Verify that two HTML pages have the same structure.

    Both pages are converted with ``str`` and parsed with blank text removed, then
    compared with ``elements_equal``.

    Returns *html_a* unchanged when the structures match; raises ``Exception``
    otherwise.
    """
    parsed_pages = [
        fromstring(str(page), parser=HTMLParser(remove_blank_text=True))
        for page in (html_a, html_b)
    ]
    if elements_equal(*parsed_pages):
        return html_a
    raise Exception("The two templates do not contain the same thing!")
示例4: __init__
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def __init__(self, str):
    """Parse the HTML string *str* into ``self._tree``.

    If parsing raises ``XMLSyntaxError`` or ``ParserError``, ``self._tree`` falls back
    to an empty ``div`` element. The failure is logged unless *str* is falsy or the
    error is a ``ParserError`` whose first argument is 'Document is empty'.
    """
    try:
        recovering_parser = html.HTMLParser(recover=True)
        self._tree = html.fromstring(str, parser=recovering_parser)
    except (XMLSyntaxError, ParserError) as e:
        # An empty document is an expected outcome, not worth a log entry.
        if str and not (isinstance(e, ParserError) and e.args[0] == 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        self._tree = html.Element('div')
示例5: __init__
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def __init__(self, *args, **kwargs):
    """Set up renderer options, then defer to the parent initializer.

    The ``nofollow`` (default True) and ``texoid`` (default False) keyword arguments
    are removed from *kwargs* before the remaining arguments are passed on. A
    ``TexoidRenderer`` is created only when ``texoid`` is truthy.
    """
    self.nofollow = kwargs.pop('nofollow', True)
    wants_texoid = kwargs.pop('texoid', False)
    if wants_texoid:
        self.texoid = TexoidRenderer()
    else:
        self.texoid = None
    self.parser = HTMLParser()
    super(AwesomeRenderer, self).__init__(*args, **kwargs)
示例6: fragments_to_tree
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def fragments_to_tree(fragment):
    """Parse an HTML *fragment* and collect the result under a new ``div`` element.

    If parsing raises ``XMLSyntaxError`` or ``ParserError``, the empty ``div`` is
    returned. The failure is logged unless *fragment* is falsy or the error is a
    ``ParserError`` whose first argument is 'Document is empty'.
    """
    container = html.Element('div')
    try:
        pieces = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as e:
        if fragment and not (isinstance(e, ParserError) and e.args[0] == 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        return container

    # A leading string item becomes the text of the container; the rest are children.
    if pieces and isinstance(pieces[0], str):
        container.text, pieces = pieces[0], pieces[1:]
    container.extend(pieces)
    return container
示例7: parse_html
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_html(fileobj, encoding):
    """
    Parse the file object *fileobj* and return the result of ``parse``.

    *encoding* is forwarded to the ``HTMLParser``, which is also configured to
    remove blank text.
    """
    parser_options = dict(encoding=encoding, remove_blank_text=True)
    html_parser = HTMLParser(**parser_options)
    return parse(fileobj, html_parser)
示例8: parse
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse(src):
    """ Returns an element tree created by `LXML <http://lxml.de/>`_.

    Objects without a ``read`` attribute are returned unchanged. An ``IOError``
    during parsing is logged as a warning, and a tree left without a root gets
    ``UNPARSEABLE`` as its root.

    :param src: A readable object such as a :class:`wex.response.Response`.
    """
    if not hasattr(src, 'read'):
        return src

    tree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Some URLs contain characters that lxml does not accept
        # (e.g. "http:/foo.com/bar?this=array[]"), in which case lxml
        # quotes the whole URL. Rather than detect that case, the URL is
        # always quoted here and unquoted again in the `base_url` function.
        if src.url:
            quoted_base_url = quote_base_url(src.url)
        else:
            quoted_base_url = src.url

        parsed = False
        while not parsed:
            try:
                # `unicode_fp` is a Unicode stream. The lxml FAQ calls this inefficient
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but it seems to work fine as long as the parser is told to use 'utf-8'.
                unicode_fp = replace_invalid_ncr(stream)
                utf8_parser = HTMLParser(encoding='utf-8')
                tree.parse(unicode_fp, parser=utf8_parser, base_url=quoted_base_url)
                parsed = True
            except UnicodeDecodeError:
                # Retry the whole parse with the stream's next encoding.
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)

    if tree.getroot() is None:
        tree._setroot(UNPARSEABLE)

    return tree
示例9: pre_parse
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def pre_parse(self):
    """Feed the response to a target parser in chunks and return ``target.encodings``.

    The response is rewound and read in ``PRE_PARSE_CHUNK_SIZE`` pieces while the
    ``HTMLEncodings`` target stays truthy, until the response is exhausted or at
    least ``MAX_PRE_PARSE_BYTES`` bytes have been fed. If ``self.bom`` is still None
    when a chunk arrives, the chunk is checked for a byte-order mark: ``self.bom`` is
    set to it (or to ``b''``) and a ``('bom', encoding)`` entry is appended to the
    target's encodings.
    """
    content_type_header = self.response.headers.get('content-type', '')
    target = HTMLEncodings(content_type_header)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    bytes_fed = 0

    self.response.seek(0)
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            # End of input: finish the parse, ignoring syntax errors.
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest candidate prefix first (4, then 3, then 2 bytes).
            for width in (4, 3, 2):
                if chunk[:width] in BOM_ENC:
                    self.bom = chunk[:width]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break

        parser.feed(chunk)
        bytes_fed += len(chunk)
        if bytes_fed >= MAX_PRE_PARSE_BYTES:
            break

    return target.encodings
示例10: create_html_parser
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def create_html_parser(headers):
    """Build an ``HTMLParser`` whose encoding comes from the content charset of *headers*.

    A charset that the codec registry resolves to 'iso8859-1' is replaced with
    'windows-1252'. Charsets unknown to the registry are passed through unchanged.
    """
    charset = headers.get_content_charset()
    if charset:
        try:
            canonical_name = codecs.lookup(charset).name
        except LookupError:
            canonical_name = None
        if canonical_name == 'iso8859-1':
            charset = 'windows-1252'
    # if charset is not specified in the Content-Type, this will be
    # None ; encoding=None produces default (ISO 8859-1) behavior.
    return HTMLParser(encoding=charset)
示例11: from_text
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
    """Parse *text* and wrap the resulting root element in a new instance of *cls*.

    *parser* is instantiated with ``recover=True`` and the encoding chosen by
    ``cls._get_encoding(text, encoding)``. When *base_url* is truthy and the root
    supports it, links are made absolute.
    """
    log.debug('Parsing {} with {}'.format(fmt, parser))
    resolved_encoding = cls._get_encoding(text, encoding)
    parser_instance = parser(recover=True, encoding=resolved_encoding)
    root = fromstring(text, parser=parser_instance, base_url=base_url)
    can_absolutize = base_url and hasattr(root, 'make_links_absolute')
    if can_absolutize:
        root.make_links_absolute()
    return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
示例12: from_html_text
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
    """Call ``cls.from_text`` with the HTML parser, CSS-HTML translator and 'html' format."""
    html_settings = dict(parser=HTMLParser, translator=CssHTMLTranslator, fmt='html')
    return cls.from_text(
        text,
        base_url=base_url,
        namespaces=namespaces,
        encoding=encoding,
        **html_settings
    )
示例13: from_response
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
    """Call ``cls.from_text`` with the content, URL and encoding taken from *response*."""
    body = response.content
    source_url = response.url
    return cls.from_text(
        body,
        source_url,
        parser,
        translator,
        fmt,
        namespaces=namespaces,
        encoding=response.encoding,
    )
示例14: from_html
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_html(cls, response, namespaces=None):
    """Call ``cls.from_response`` with the HTML parser, CSS-HTML translator and 'html' format."""
    html_settings = dict(parser=HTMLParser, translator=CssHTMLTranslator, fmt='html')
    return cls.from_response(response, namespaces=namespaces, **html_settings)
示例15: replace_img_url
# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def replace_img_url(self, content):
    """Rewrite every ``<img src>`` in *content* to a pic4.zhimg.com URL.

    *content* is converted with ``str`` and parsed as an HTML document. A ``src`` of
    the form ``<id>.<ext>`` becomes ``https://pic4.zhimg.com/<id>_b.<ext>``, where
    ``<ext>`` is the text after the last dot. Images with no ``src``, or with a
    ``src`` that contains no dot, are left untouched instead of raising
    ``ValueError``.

    :param content: HTML markup (anything ``str()`` turns into markup).
    :returns: The serialized document as a ``str``.
    """
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(str(content), parser=utf8_parser)
    for img in tree.xpath("//img"):
        src = img.get('src')
        if not src:
            # No source to rewrite; str(None) would otherwise be split as 'None'.
            continue
        # Split on the last dot only, so ids that contain dots still unpack cleanly.
        pic_id, dot, pic_type = src.rpartition('.')
        if not dot:
            continue
        img.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
    return etree.tostring(tree, encoding=str)