本文整理匯總了Python中html5lib.HTMLParser方法的典型用法代碼示例。如果您正苦於以下問題:Python html5lib.HTMLParser方法的具體用法?Python html5lib.HTMLParser怎麽用?Python html5lib.HTMLParser使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類html5lib的用法示例。
在下文中一共展示了html5lib.HTMLParser方法的11個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: test_with_serializer
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def test_with_serializer():
    """Parse an SVG fragment, walk it, and check the serialized markup.

    Runs the html5lib parser, the ``etree`` tree walker and
    ``HTMLSerializer`` back to back so the pieces are exercised together
    rather than in isolation.
    """
    fragment = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>'
    )
    walk = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(
        quote_attr_values='always',
        alphabetical_attributes=True,
    )
    rendered = serializer.render(walk(fragment))

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer.
    # Once that is fixed, the expected markup below should be updated.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert rendered == expected
示例2: __init__
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def __init__(self, namespaceHTMLElements, soup=None,
             store_line_numbers=True, **kwargs):
    """Set up the tree builder around a soup object.

    :param namespaceHTMLElements: passed straight through to the parent
        class initializer.
    :param soup: object to build into. When it is falsy, an empty
        ``BeautifulSoup`` backed by ``"html.parser"`` is created instead.
    :param store_line_numbers: remembered on the instance and, when a new
        soup has to be created, forwarded to ``BeautifulSoup``.
    :param kwargs: extra keyword arguments, used only when a new soup has
        to be created.
    """
    if not soup:
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        soup = BeautifulSoup(
            "", "html.parser", store_line_numbers=store_line_numbers,
            **kwargs
        )
    self.soup = soup

    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    # Filled in later with an html5lib.html5parser.HTMLParser object,
    # which is what lets us track the current line number.
    self.parser = None
    self.store_line_numbers = store_line_numbers
示例3: feed
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
    """Parse ``markup`` with html5lib and record the detected encoding.

    A warning is issued when the soup carries a ``parse_only`` value,
    because this tree builder always parses the whole document. After
    parsing, ``original_encoding`` is set on the resulting document:
    ``None`` for ``unicode`` input, otherwise the name of the encoding the
    tokenizer's stream settled on.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    markup_is_text = isinstance(markup, unicode)

    # Only byte input gets an encoding hint; the keyword name depends on
    # which html5lib API generation is installed.
    parse_options = {}
    if not markup_is_text:
        option_name = 'override_encoding' if new_html5lib else 'encoding'
        parse_options[option_name] = self.user_specified_encoding
    doc = parser.parse(markup, **parse_options)

    if markup_is_text:
        # Special case: html5lib sets charEncoding to UTF-8 whenever it
        # is handed Unicode input, which would be misleading here.
        doc.original_encoding = None
        return

    detected = parser.tokenizer.stream.charEncoding[0]
    if not isinstance(detected, basestring):
        # In 0.99999999 and up, the encoding is an html5lib Encoding
        # object. Other tree builders expose a plain string, so use
        # its name for compatibility.
        detected = detected.name
    doc.original_encoding = detected
示例4: htmlparser_trace
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def htmlparser_trace(data):
    """Feed ``data`` to a freshly created ``AnnouncingParser``.

    The parser is expected to print each HTMLParser event as it occurs
    (that behaviour lives in ``AnnouncingParser``, defined outside this
    snippet). This shows how HTMLParser handles a document when no
    Beautiful Soup code is involved.
    """
    AnnouncingParser().feed(data)
示例5: validate_content
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def validate_content(testcase, data, page_descr="unknown page"):
    """
    Validate data as HTML5.

    testcase should be a unittest.TestCase object (or similar); its
    ``fail`` method is called when the parser reports errors.
    data is the markup to validate.
    page_descr should be a human-readable description of the page being
    tested; it is included in the failure message.

    When validation fails, the offending markup is also written to
    ``tmp-validation.html`` in the current directory for inspection.
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parser.parse(data)
    if not parser.errors:
        return

    # The dump file is opened in binary mode, so text input has to be
    # encoded first; otherwise the write would raise and hide the real
    # validation failure.
    if isinstance(data, (bytes, bytearray)):
        payload = data
    else:
        payload = data.encode("utf-8")

    # The context manager closes the file even if the write fails.
    with open("tmp-validation.html", "wb") as fh:
        fh.write(payload)
    testcase.fail("Invalid HTML5 produced in %s:\n %s" % (page_descr, str(parser.errors)))
示例6: feed
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
    """Parse ``markup`` with html5lib and record the detected encoding.

    A warning is issued when the soup carries a ``parse_only`` value,
    because this tree builder always parses the whole document. The
    user-specified encoding is always passed to the parser as
    ``encoding``.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Record the character encoding detected by the tokenizer. Unicode
    # input is special-cased because html5lib sets charEncoding to UTF-8
    # whenever it is given Unicode.
    doc.original_encoding = (
        None if isinstance(markup, unicode)
        else parser.tokenizer.stream.charEncoding[0]
    )
示例7: clean
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned markup.

    This is a security-focused helper: its only job is to remove
    malicious content from a string so the result can be shown as content
    in a web page. It is not meant for transforming content destined for
    non-web-page contexts.

    :arg text: the text to clean; a falsy value yields ``''``
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments
    """
    if not text:
        return ''

    unicode_text = force_unicode(text)

    # The sanitizer is configured through class attributes, so a
    # throwaway subclass carries this call's settings.
    class _ConfiguredSanitizer(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=_ConfiguredSanitizer)
    fragment = parser.parseFragment(unicode_text)
    return _render(fragment)
示例8: getspoiler
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def getspoiler(mtgsid):
    """Return the parsed spoiler page for set ``mtgsid``.

    A local file named ``<mtgsid>.html`` is used when it exists; otherwise
    the page is downloaded from ``MTGS_URL % mtgsid``. For downloads, the
    charset announced by the response is passed to the parser as the
    encoding; for local files no encoding is given.

    :param mtgsid: integer set id, used for both the file name and the URL.
    :return: the document produced by ``html5lib.HTMLParser.parse``, built
        with ``namespaceHTMLElements=False``.
    """
    try:
        fp = open("%d.html" % mtgsid, "rb")
    except FileNotFoundError:
        print("Downloading set %d..." % mtgsid)
        fp = urllib.request.urlopen(MTGS_URL % mtgsid)
        enc = fp.info().get_content_charset()
    else:
        enc = None

    # Close the file or HTTP response once parsing is done, including
    # when the parser raises; previously the handle was never closed.
    with fp:
        parser = html5lib.HTMLParser(namespaceHTMLElements=False)
        return parser.parse(fp, encoding=enc)
示例9: _validate_html5
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def _validate_html5(self, response_data):
    """Run ``response_data`` through an html5lib parser in strict mode.

    The parse result is discarded; nothing is returned.
    NOTE(review): strict mode presumably raises on the first parse error,
    which is what would make this a validation step. Confirm against the
    html5lib documentation.
    """
    html5lib.HTMLParser(strict=True).parse(response_data)
示例10: feed
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
    """Parse ``markup`` with html5lib and record the detected encoding.

    A warning is issued when the soup carries a ``parse_only`` value,
    because this tree builder always parses the whole document. After
    parsing, ``original_encoding`` is set on the resulting document:
    ``None`` for ``str`` input, otherwise the name of the encoding the
    tokenizer's stream settled on.
    """
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    got_text = isinstance(markup, str)
    if got_text:
        # Text input needs no encoding hint.
        doc = parser.parse(markup)
    elif new_html5lib:
        doc = parser.parse(
            markup, override_encoding=self.user_specified_encoding
        )
    else:
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Record the character encoding detected by the tokenizer. Text input
    # is special-cased (left as None) because html5lib sets charEncoding
    # to UTF-8 whenever it is given Unicode.
    encoding = None
    if not got_text:
        encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib Encoding
            # object. Other tree builders expose a plain string, so use
            # its name for compatibility.
            encoding = encoding.name
    doc.original_encoding = encoding
示例11: benchmark_parsers
# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def benchmark_parsers(num_elements=100000):
    """Time several parsers against the same generated document.

    A document is produced with ``rdoc(num_elements)`` and parsed once by
    Beautiful Soup with each configured backend, then once by raw lxml and
    once by raw html5lib. Wall-clock timings are printed; nothing is
    returned. A Beautiful Soup backend that raises is reported, with its
    traceback, and skipped.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for backend in ("lxml", ["lxml", "html"], "html5lib", "html.parser"):
        try:
            started = time.time()
            # Bound to a name so the parsed tree stays alive until the
            # timing has been taken.
            _soup = BeautifulSoup(data, backend)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % backend)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (backend, finished - started))

    # Imported here, as in the original, so the import only happens once
    # this part of the benchmark is reached.
    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished - started))

    import html5lib
    raw_parser = html5lib.HTMLParser()
    started = time.time()
    raw_parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished - started))