Python html5lib.HTMLParser方法代碼示例

本文整理匯總了Python中html5lib.HTMLParser方法的典型用法代碼示例。如果您正苦於以下問題：Python html5lib.HTMLParser方法的具體用法？Python html5lib.HTMLParser怎麼用？Python html5lib.HTMLParser使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊html5lib的用法示例。

在下文中一共展示了html5lib.HTMLParser方法的11個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_with_serializer

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def test_with_serializer():
    """Verify the filter works when driven through parse, walk and serialize."""
    fragment = html5lib.HTMLParser().parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>'
    )
    tree_walker = html5lib.getTreeWalker('etree')
    serializer = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )
    rendered = serializer.render(tree_walker(fragment))

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    expected = '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    assert rendered == expected

開發者ID:morpheus65535，項目名稱:bazarr，代碼行數:18，代碼來源:test_alphabeticalattributes.py

示例2: __init__

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the tree builder around an existing or freshly made soup.

        :param namespaceHTMLElements: forwarded unchanged to the parent
            class initializer.
        :param soup: object to build into; when falsy, an empty
            BeautifulSoup is created instead.
        :param store_line_numbers: saved on the instance and also passed
            to the BeautifulSoup constructor when one has to be created.
        :param kwargs: extra keyword arguments for that BeautifulSoup
            constructor; unused when ``soup`` is supplied.
        """
        if not soup:
            # Imported lazily, only on the path that needs it.
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # Filled in later with an html5lib.html5parser.HTMLParser object,
        # which is then used to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

開發者ID:Tautulli，項目名稱:Tautulli，代碼行數:20，代碼來源:_html5lib.py

示例3: feed

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
        """Parse *markup* with html5lib and record the source encoding.

        Warns when ``self.soup.parse_only`` is set, since it is ignored here.
        For non-unicode input the user-specified encoding is passed to the
        parser under the keyword selected by ``new_html5lib``. The parsed
        document's ``original_encoding`` is then set: ``None`` for unicode
        input, otherwise the encoding name reported by the tokenizer.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        markup_is_text = isinstance(markup, unicode)

        parse_options = {}
        if not markup_is_text:
            # The keyword name depends on which html5lib API is in use.
            option_name = 'override_encoding' if new_html5lib else 'encoding'
            parse_options[option_name] = self.user_specified_encoding
        doc = parser.parse(markup, **parse_options)

        # Record the character encoding detected by the tokenizer.
        if not markup_is_text:
            detected = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(detected, basestring):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. A plain string is used instead for
                # compatibility with other tree builders.
                detected = detected.name
            doc.original_encoding = detected
        else:
            # Special case: html5lib sets charEncoding to UTF-8 when it
            # is handed Unicode input, which would be misleading here.
            doc.original_encoding = None

開發者ID:MarcelloLins，項目名稱:ServerlessCrawler-VancouverRealState，代碼行數:28，代碼來源:_html5lib.py

示例4: htmlparser_trace

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def htmlparser_trace(data):
    """Show the HTMLParser events produced while parsing *data*.

    The markup is fed to a fresh AnnouncingParser, which makes it possible
    to see how HTMLParser handles a document with no Beautiful Soup code
    involved.
    """
    AnnouncingParser().feed(data)

開發者ID:MarcelloLins，項目名稱:ServerlessCrawler-VancouverRealState，代碼行數:10，代碼來源:diagnose.py

示例5: validate_content

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def validate_content(testcase, data, page_descr="unknown page"):
    """
    Validate data as HTML5.

    testcase should be a unittest.TestCase object (or similar).
    data is the markup to check; on failure it is written unchanged to a
    file opened in binary mode.
    page_descr should be a human-readable description of the page being tested.

    When the parser reports errors, the markup is dumped to
    "tmp-validation.html" in the current directory and testcase.fail() is
    called with the error list.
    """
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parser.parse(data)
    if parser.errors:
        # Keep a copy of the offending markup for inspection. The context
        # manager closes the handle even if the write itself raises.
        with open("tmp-validation.html", "wb") as fh:
            fh.write(data)
        testcase.fail("Invalid HTML5 produced in %s:\n  %s" % (page_descr, str(parser.errors)))

開發者ID:sfu-fas，項目名稱:coursys，代碼行數:16，代碼來源:testing.py

示例6: feed

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
        """Parse *markup* with html5lib and record the source encoding.

        Warns when ``self.soup.parse_only`` is set, since it is ignored here.
        The user-specified encoding is always passed to the parser. The
        parsed document's ``original_encoding`` becomes ``None`` for unicode
        input and the tokenizer's detected encoding otherwise.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Record the character encoding detected by the tokenizer.
        # Unicode input is special-cased because html5lib sets
        # charEncoding to UTF-8 whenever it is handed Unicode.
        doc.original_encoding = (
            None
            if isinstance(markup, unicode)
            else parser.tokenizer.stream.charEncoding[0]
        )

開發者ID:MayOneUS，項目名稱:pledgeservice，代碼行數:15，代碼來源:_html5lib.py

示例7: clean

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned markup

    This is a security-focused function: its only job is to remove
    malicious content from a string so the result can be shown as content
    in a web page. It is not meant for transforming content destined for
    non-web-page contexts.

    Falsy input (for example ``''`` or ``None``) short-circuits to ``''``.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    unicode_text = force_unicode(text)

    # Per-call sanitizer subclass: the caller's settings are attached as
    # class attributes and the class itself is handed to the parser.
    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    fragment = html5lib.HTMLParser(tokenizer=s).parseFragment(unicode_text)
    return _render(fragment)

開發者ID:ryfeus，項目名稱:lambda-packs，代碼行數:43，代碼來源:__init__.py

示例8: getspoiler

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def getspoiler(mtgsid):
	"""Parse the spoiler page for set *mtgsid* and return the html5lib tree.

	A local file named "<mtgsid>.html" is used when it exists. Otherwise the
	page is fetched from ``MTGS_URL % mtgsid`` and the charset announced in
	the response headers is passed to the parser as the encoding.

	:param mtgsid: integer set id; formatted with ``%d``.
	:returns: whatever ``html5lib.HTMLParser.parse`` returns for the page.
	"""
	try:
		fp = open("%d.html" % mtgsid, "rb")
		# No header information for a local file; let the parser decide.
		enc = None
	except FileNotFoundError:
		print("Downloading set %d..." % mtgsid)
		fp = urllib.request.urlopen(MTGS_URL % mtgsid)
		enc = fp.info().get_content_charset()
	parser = html5lib.HTMLParser(namespaceHTMLElements=False)
	# Both the local file and the HTTP response are context managers, so
	# the handle is released once parsing finishes, even if parse() raises.
	with fp:
		return parser.parse(fp, encoding=enc)

開發者ID:mrphlip，項目名稱:lrrbot，代碼行數:12，代碼來源:scrape_salvation.py

示例9: _validate_html5

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def _validate_html5(self, response_data):
        """Run *response_data* through an html5lib parser built with strict=True.

        Nothing is returned.
        NOTE(review): strict mode presumably turns parse errors into
        exceptions -- confirm against the html5lib documentation.
        """
        html5lib.HTMLParser(strict=True).parse(response_data)

開發者ID:bwaldvogel，項目名稱:openmoves，代碼行數:5，代碼來源:test_openmoves.py

示例10: feed

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def feed(self, markup):
        """Parse *markup* with html5lib and record the source encoding.

        Warns when ``self.soup.parse_only`` is set, since it is ignored here.
        For non-``str`` input the user-specified encoding is passed to the
        parser under the keyword selected by ``new_html5lib``. The parsed
        document's ``original_encoding`` is then set: ``None`` for ``str``
        input, otherwise the encoding name reported by the tokenizer.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)

        markup_is_text = isinstance(markup, str)

        parse_options = {}
        if not markup_is_text:
            # The keyword name depends on which html5lib API is in use.
            option_name = 'override_encoding' if new_html5lib else 'encoding'
            parse_options[option_name] = self.user_specified_encoding
        doc = parser.parse(markup, **parse_options)

        # Record the character encoding detected by the tokenizer.
        if not markup_is_text:
            detected = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(detected, str):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. A plain string is used instead for
                # compatibility with other tree builders.
                detected = detected.name
            doc.original_encoding = detected
        else:
            # Special case: html5lib sets charEncoding to UTF-8 when it
            # is handed Unicode input, which would be misleading here.
            doc.original_encoding = None

開發者ID:the-ethan-hunt，項目名稱:B.E.N.J.I.，代碼行數:28，代碼來源:_html5lib.py

示例11: benchmark_parsers

# 需要導入模塊: import html5lib [as 別名]
# 或者: from html5lib import HTMLParser [as 別名]
def benchmark_parsers(num_elements=100000):
    """Run a very basic head-to-head timing of the available parsers.

    A document is generated with ``rdoc(num_elements)`` and parsed through
    Beautiful Soup with each parser in turn, then with lxml and html5lib
    directly. Elapsed wall-clock times are printed; nothing is returned.
    A Beautiful Soup parser that raises is reported with its traceback and
    skipped, while the two raw runs are not guarded.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            # The result stays bound so the tree is still alive when the
            # stop timestamp is taken.
            soup = BeautifulSoup(data, parser)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, finished - started))

    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished - started))

    import html5lib
    parser = html5lib.HTMLParser()
    started = time.time()
    parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished - started))

開發者ID:the-ethan-hunt，項目名稱:B.E.N.J.I.，代碼行數:33，代碼來源:diagnose.py

注：本文中的html5lib.HTMLParser方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。