Python html5lib.HTMLParser方法代码示例

本文整理汇总了Python中html5lib.HTMLParser方法的典型用法代码示例。如果您正苦于以下问题：Python html5lib.HTMLParser方法的具体用法？Python html5lib.HTMLParser怎么用？Python html5lib.HTMLParser使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块html5lib的用法示例。

在下文中一共展示了html5lib.HTMLParser方法的11个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_with_serializer

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def test_with_serializer():
    """Run parse -> tree walk -> serialize and check the attribute ordering.

    The fragment is parsed with html5lib, walked with the ``etree`` tree
    walker and rendered by an ``HTMLSerializer`` configured with
    ``alphabetical_attributes=True``; the rendered markup is compared with
    a fixed expected string.
    """
    fragment = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>'
    )
    tree_walker = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(
        quote_attr_values='always',
        alphabetical_attributes=True,
    )
    rendered = serializer.render(tree_walker(fragment))

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert rendered == expected

开发者ID:morpheus65535，项目名称:bazarr，代码行数:18，代码来源:test_alphabeticalattributes.py

示例2: __init__

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Initialise the tree builder around ``soup`` or a fresh empty soup.

        :param namespaceHTMLElements: forwarded unchanged to the parent
            class initialiser.
        :param soup: object to build into; when falsy, an empty
            ``BeautifulSoup`` is created instead.
        :param store_line_numbers: stored on the instance and also passed
            to the ``BeautifulSoup`` created when ``soup`` is not given.
        :param kwargs: extra keyword arguments, used only when a new
            ``BeautifulSoup`` has to be created.
        """
        if not soup:
            # Imported here rather than at module level, as in the original.
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        self.store_line_numbers = store_line_numbers
        # Placeholder: this will be set later to an
        # html5lib.html5parser.HTMLParser object, which we can use to track
        # the current line number.
        self.parser = None

开发者ID:Tautulli，项目名称:Tautulli，代码行数:20，代码来源:_html5lib.py

示例3: feed

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def feed(self, markup):
        """Parse ``markup`` with html5lib and record the detected encoding.

        A warning is issued when ``self.soup.parse_only`` is set, because
        this tree builder parses the whole document regardless. After
        parsing, ``original_encoding`` on the resulting document is set to
        ``None`` for ``unicode`` input, or to the name of the encoding the
        tokenizer's stream reports for any other input.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        is_text = isinstance(markup, unicode)
        parse_kwargs = {}
        if not is_text:
            # The keyword carrying the user-specified encoding depends on
            # which html5lib API is in use (signalled by new_html5lib).
            encoding_key = 'override_encoding' if new_html5lib else 'encoding'
            parse_kwargs[encoding_key] = self.user_specified_encoding
        doc = parser.parse(markup, **parse_kwargs)

        if is_text:
            # Special case: html5lib sets charEncoding to UTF-8 when it is
            # handed Unicode input, so there is no real detected encoding.
            doc.original_encoding = None
            return

        # Use the character encoding detected by the tokenizer.
        detected = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(detected, basestring):
            # In 0.99999999 and up, the encoding is an html5lib Encoding
            # object. Store its name so the value is a string, for
            # compatibility with other tree builders.
            detected = detected.name
        doc.original_encoding = detected

开发者ID:MarcelloLins，项目名称:ServerlessCrawler-VancouverRealState，代码行数:28，代码来源:_html5lib.py

示例4: htmlparser_trace

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def htmlparser_trace(data):
    """Show the HTMLParser events produced while parsing ``data``.

    The markup is fed to a fresh ``AnnouncingParser``, so you can see how
    HTMLParser handles a document without any Beautiful Soup code
    running.
    """
    AnnouncingParser().feed(data)

开发者ID:MarcelloLins，项目名称:ServerlessCrawler-VancouverRealState，代码行数:10，代码来源:diagnose.py

示例5: validate_content

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def validate_content(testcase, data, page_descr="unknown page"):
    """
    Validate data as HTML5.

    testcase should be a unittest.TestCase object (or similar).
    page_descr should be a human-readable description of the page being tested.

    If the parser records any errors, data is written to
    "tmp-validation.html" in the current working directory and the test is
    failed with a message listing those errors.
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parser.parse(data)
    if parser.errors:
        # Keep a copy of the offending markup for inspection. The file is
        # opened in binary mode, so data is expected to be bytes-like on
        # Python 3. The with-block closes the handle even if write() raises.
        with open("tmp-validation.html", "wb") as fh:
            fh.write(data)
        testcase.fail("Invalid HTML5 produced in %s:\n  %s" % (page_descr, str(parser.errors)))

开发者ID:sfu-fas，项目名称:coursys，代码行数:16，代码来源:testing.py

示例6: feed

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def feed(self, markup):
        """Parse ``markup`` with html5lib and record the detected encoding.

        A warning is issued when ``self.soup.parse_only`` is set, because
        this tree builder parses the whole document regardless. The
        user-specified encoding is always passed to the parser through the
        ``encoding`` keyword.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        if isinstance(markup, unicode):
            # Special case: html5lib sets charEncoding to UTF-8 when it is
            # handed Unicode input, so report no detected encoding.
            detected = None
        else:
            # Use the character encoding detected by the tokenizer.
            detected = parser.tokenizer.stream.charEncoding[0]
        doc.original_encoding = detected

开发者ID:MayOneUS，项目名称:pledgeservice，代码行数:15，代码来源:_html5lib.py

示例7: clean

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned markup.

    This is a security-focused function: its only job is to remove
    malicious content from a string so that the result can be displayed
    as content in a web page. It is not designed for transforming content
    that will be used in non-web-page contexts.

    :arg text: the text to clean; a falsy value yields ``''``
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    # Per-call sanitizer subclass carrying this call's whitelists and
    # flags; it is handed to html5lib as the tokenizer class.
    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    fragment = html5lib.HTMLParser(tokenizer=s).parseFragment(text)
    return _render(fragment)

开发者ID:ryfeus，项目名称:lambda-packs，代码行数:43，代码来源:__init__.py

示例8: getspoiler

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def getspoiler(mtgsid):
	"""Parse the spoiler page for set ``mtgsid`` and return the parsed tree.

	A local file named "<mtgsid>.html" is used when it exists; otherwise
	the page is downloaded from ``MTGS_URL % mtgsid`` and the charset
	declared by the response is passed to the parser as the encoding.
	The underlying file or HTTP response is closed once parsing ends.
	"""
	try:
		fp = open("%d.html" % mtgsid, "rb")
		# No declared charset for the local copy.
		enc = None
	except FileNotFoundError:
		print("Downloading set %d..." % mtgsid)
		fp = urllib.request.urlopen(MTGS_URL % mtgsid)
		enc = fp.info().get_content_charset()
	# Both file objects and urlopen() responses are context managers, so
	# the handle is released even when parsing raises.
	with fp:
		parser = html5lib.HTMLParser(namespaceHTMLElements=False)
		return parser.parse(fp, encoding=enc)

开发者ID:mrphlip，项目名称:lrrbot，代码行数:12，代码来源:scrape_salvation.py

示例9: _validate_html5

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def _validate_html5(self, response_data):
        """Parse ``response_data`` with an html5lib parser in strict mode.

        The parse result is discarded; the call is made only for the
        parser's strict-mode behaviour (html5lib documents ``strict=True``
        as raising on parse errors).
        """
        html5lib.HTMLParser(strict=True).parse(response_data)

开发者ID:bwaldvogel，项目名称:openmoves，代码行数:5，代码来源:test_openmoves.py

示例10: feed

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def feed(self, markup):
        """Parse ``markup`` with html5lib and record the detected encoding.

        A warning is issued when ``self.soup.parse_only`` is set, because
        this tree builder parses the whole document regardless. After
        parsing, ``original_encoding`` on the resulting document is set to
        ``None`` for ``str`` input, or to the name of the encoding the
        tokenizer's stream reports for any other input.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        markup_is_text = isinstance(markup, str)
        if markup_is_text:
            # No encoding hint is passed for text input.
            extra_kwargs = {}
        elif new_html5lib:
            extra_kwargs = {'override_encoding': self.user_specified_encoding}
        else:
            extra_kwargs = {'encoding': self.user_specified_encoding}
        doc = parser.parse(markup, **extra_kwargs)

        if markup_is_text:
            # Special case: html5lib sets charEncoding to UTF-8 when it is
            # handed Unicode input, so report no detected encoding.
            doc.original_encoding = None
        else:
            # Use the character encoding detected by the tokenizer.
            detected = parser.tokenizer.stream.charEncoding[0]
            # In 0.99999999 and up, the encoding is an html5lib Encoding
            # object. Store its name so the value is a string, for
            # compatibility with other tree builders.
            doc.original_encoding = (
                detected if isinstance(detected, str) else detected.name
            )

开发者ID:the-ethan-hunt，项目名称:B.E.N.J.I.，代码行数:28，代码来源:_html5lib.py

示例11: benchmark_parsers

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import HTMLParser [as 别名]
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates a document with ``rdoc(num_elements)``, then prints the
    wall-clock time taken to parse it by Beautiful Soup with each listed
    parser, by lxml directly, and by html5lib directly. A Beautiful Soup
    parser that raises is reported with its traceback and skipped.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            # The result stays bound so that releasing the parsed tree is
            # not counted inside the timed region.
            soup = BeautifulSoup(data, parser)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, finished - started))

    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished - started))

    import html5lib
    parser = html5lib.HTMLParser()
    started = time.time()
    parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished - started))

开发者ID:the-ethan-hunt，项目名称:B.E.N.J.I.，代码行数:33，代码来源:diagnose.py

注：本文中的html5lib.HTMLParser方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。