當前位置: 首頁>>代碼示例>>Python>>正文


Python html.HTMLParser方法代碼示例

本文整理匯總了Python中lxml.html.HTMLParser方法的典型用法代碼示例。如果您正苦於以下問題:Python html.HTMLParser方法的具體用法?Python html.HTMLParser怎麽用?Python html.HTMLParser使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在lxml.html的用法示例。


在下文中一共展示了html.HTMLParser方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: parse_rsc_html

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_rsc_html(htmlstring):
    """Parse messy RSC HTML and wrap orphan text in ``p.otherpara`` elements.

    The encoding is detected with ``UnicodeDammit`` and the markup is parsed
    with a recovering ``HTMLParser``. Inside the element with id ``wrapper``,
    non-blank text that trails a block element is moved into a new
    ``<p class="otherpara">``; following siblings are appended to that
    paragraph until a block element, an element whose id starts with
    ``sect``, or the last child is reached.

    :param htmlstring: Raw HTML markup.
    :returns: Root element of the parsed, repaired tree.
    :raises UnicodeDecodeError: If no encoding could be detected.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError requires (encoding, object, start, end, reason);
        # building it from a single string raises TypeError instead.
        # NOTE(review): ``tried_encodings`` is assumed to be a list of names or
        # (name, errors) tuples - confirm against the installed bs4 version.
        attempted = ', '.join(
            str(enc[0] if isinstance(enc, tuple) else enc)
            for enc in getattr(converted, 'tried_encodings', ())
        )
        raise UnicodeDecodeError(
            str(converted.original_encoding or 'unknown'),
            b'',
            0,
            1,
            'Failed to detect encoding, tried [%s]' % attempted
        )
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # A paragraph is being collected: close it at a structural boundary,
            # otherwise keep absorbing inline siblings into it.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            # Orphan text after a block element starts a new paragraph.
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
開發者ID:mcs07,項目名稱:ChemDataExtractor,代碼行數:22,代碼來源:rsc.py

示例2: parse_html_string

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_html_string(s):
    """Parse *s* as a complete HTML document with a UTF-8 ``HTMLParser``.

    :param s: HTML markup to parse.
    :returns: The tree produced by ``lxml.html.document_fromstring``.
    """
    from lxml import html

    return html.document_fromstring(s, parser=html.HTMLParser(encoding='utf-8'))
開發者ID:booktype,項目名稱:python-ooxml,代碼行數:9,代碼來源:importer.py

示例3: check_same_tpl

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def check_same_tpl(html_a, html_b):
    """Verify that two HTML pages have the same structure.

    Each page is converted with ``str`` and parsed by its own ``HTMLParser``
    with blank text removed; the two trees are compared with
    ``elements_equal``.

    :returns: ``html_a`` unchanged when the structures match.
    :raises Exception: When the structures differ.
    """
    tree_a, tree_b = [
        fromstring(str(page), parser=HTMLParser(remove_blank_text=True))
        for page in (html_a, html_b)
    ]
    if elements_equal(tree_a, tree_b):
        return html_a
    raise Exception("The two templates do not contain the same thing!")
開發者ID:UCL-INGI,項目名稱:INGInious,代碼行數:11,代碼來源:jinja_migration.py

示例4: __init__

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def __init__(self, str):
        """Parse *str* into ``self._tree``, falling back to an empty ``div``.

        Parse failures are logged unless the input is empty or the failure is
        lxml's ``ParserError`` with the message 'Document is empty'.
        """
        try:
            tree = html.fromstring(str, parser=html.HTMLParser(recover=True))
        except (XMLSyntaxError, ParserError) as e:
            # An empty document is an expected condition, not worth a log entry.
            if str and not (isinstance(e, ParserError) and e.args[0] == 'Document is empty'):
                logger.exception('Failed to parse HTML string')
            tree = html.Element('div')
        self._tree = tree
開發者ID:DMOJ,項目名稱:online-judge,代碼行數:9,代碼來源:lxml_tree.py

示例5: __init__

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def __init__(self, *args, **kwargs):
        """Set up renderer options, then delegate to the parent initialiser.

        The ``nofollow`` (default ``True``) and ``texoid`` (default ``False``)
        keyword arguments are consumed here; everything else is forwarded.
        """
        self.nofollow = kwargs.pop('nofollow', True)
        # Only build a TexoidRenderer when the caller asked for it.
        if kwargs.pop('texoid', False):
            self.texoid = TexoidRenderer()
        else:
            self.texoid = None
        self.parser = HTMLParser()
        super(AwesomeRenderer, self).__init__(*args, **kwargs)
開發者ID:DMOJ,項目名稱:online-judge,代碼行數:7,代碼來源:__init__.py

示例6: fragments_to_tree

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def fragments_to_tree(fragment):
    """Parse an HTML *fragment* and return its nodes wrapped in a ``div``.

    If parsing fails, an empty ``div`` is returned. The failure is logged
    unless the fragment is empty or the error is a ``ParserError`` with the
    message 'Document is empty'.
    """
    container = html.Element('div')
    try:
        pieces = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as e:
        if fragment and not (isinstance(e, ParserError) and e.args[0] == 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        return container

    # A leading string item is text, not an element: store it as the
    # container's text and extend with the remaining items only.
    if pieces and isinstance(pieces[0], str):
        container.text, pieces = pieces[0], pieces[1:]
    container.extend(pieces)
    return container
開發者ID:DMOJ,項目名稱:online-judge,代碼行數:16,代碼來源:__init__.py

示例7: parse_html

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse_html(fileobj, encoding='utf-8'):
    """
    Given a file object *fileobj*, get an ElementTree instance.

    The *encoding* is handed to the ``HTMLParser`` and defaults to utf-8
    when the caller does not supply one. The parser is created with
    ``remove_blank_text=True``.

    :param fileobj: Source passed to ``parse``.
    :param encoding: Encoding name given to the parser.
    :returns: Whatever ``parse`` returns for *fileobj* and the parser.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser)
開發者ID:datalib,項目名稱:libextract,代碼行數:9,代碼來源:core.py

示例8: parse

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def parse(src):
    """ Returns an element tree created by `LXML <http://lxml.de/>`_.

       Objects without a ``read`` attribute are returned unchanged.

       :param src: A readable object such as a :class:`wex.response.Response`.
    """

    if not hasattr(src, 'read'):
        return src

    tree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # URLs may contain characters lxml rejects
        # (e.g. "http:/foo.com/bar?this=array[]"), in which case lxml quotes
        # the whole URL. Rather than detect that, always quote here and
        # unquote in the `base_url` function.
        quoted_base_url = quote_base_url(src.url) if src.url else src.url
        while True:
            try:
                # `fp` is a Unicode stream. The lxml FAQ calls this inefficient
                # (http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode)
                # but it seems fine as long as the parser is told 'utf-8'.
                fp = replace_invalid_ncr(stream)
                tree.parse(fp, parser=HTMLParser(encoding='utf-8'), base_url=quoted_base_url)
            except UnicodeDecodeError:
                # Decoding failed: switch the stream's encoding and retry.
                stream.next_encoding()
            else:
                break
    except IOError as exc:
        logging.getLogger(__name__).warning("IOError parsing %s (%s)", src.url, exc)

    # Nothing was parsed: install the UNPARSEABLE placeholder as root.
    if tree.getroot() is None:
        tree._setroot(UNPARSEABLE)

    return tree
開發者ID:eBay,項目名稱:wextracto,代碼行數:40,代碼來源:etree.py

示例9: pre_parse

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def pre_parse(self):
    """Feed the start of the response to a parser and collect encodings.

    The response is read from offset 0 in ``PRE_PARSE_CHUNK_SIZE`` chunks and
    fed to an ``HTMLParser`` whose target is an ``HTMLEncodings`` built from
    the ``content-type`` header. Reading stops when the target becomes falsy,
    the response is exhausted, or ``MAX_PRE_PARSE_BYTES`` have been fed.
    If ``self.bom`` is ``None`` it is set from the first chunk.

    :returns: ``target.encodings``.
    """

    content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    bytes_fed = 0

    self.response.seek(0)
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            # End of input: closing may raise on incomplete markup; ignore it.
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest candidate prefix first.
            for size in (4, 3, 2):
                prefix = chunk[:size]
                if prefix in BOM_ENC:
                    self.bom = prefix
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break

        parser.feed(chunk)
        bytes_fed += len(chunk)
        if bytes_fed >= MAX_PRE_PARSE_BYTES:
            break

    return target.encodings
開發者ID:eBay,項目名稱:wextracto,代碼行數:36,代碼來源:htmlstream.py

示例10: create_html_parser

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def create_html_parser(headers):
    """Build an ``HTMLParser`` using the charset declared in *headers*.

    A charset that resolves to the ``iso8859-1`` codec is replaced by
    ``windows-1252``; unknown charset names are passed through unchanged.
    """

    charset = headers.get_content_charset()
    if charset:
        try:
            resolves_to_latin1 = codecs.lookup(charset).name == 'iso8859-1'
        except LookupError:
            resolves_to_latin1 = False
        if resolves_to_latin1:
            charset = 'windows-1252'

    # With no charset in the Content-Type this is None, and
    # encoding=None produces default (ISO 8859-1) behavior.
    return HTMLParser(encoding=charset)
開發者ID:eBay,項目名稱:wextracto,代碼行數:14,代碼來源:form.py

示例11: from_text

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
        """Parse *text* and wrap the resulting root in a new ``cls`` instance.

        The parser is instantiated with ``recover=True`` and the encoding
        returned by ``cls._get_encoding(text, encoding)``. When *base_url* is
        truthy and the root supports it, links are made absolute.
        """
        log.debug('Parsing {} with {}'.format(fmt, parser))
        resolved_encoding = cls._get_encoding(text, encoding)
        parser_instance = parser(recover=True, encoding=resolved_encoding)
        root = fromstring(text, parser=parser_instance, base_url=base_url)
        if base_url:
            if hasattr(root, 'make_links_absolute'):
                root.make_links_absolute()
        return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
開發者ID:mcs07,項目名稱:ChemDataExtractor,代碼行數:8,代碼來源:selector.py

示例12: from_html_text

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
        """Call ``from_text`` with the HTML parser, CSS-HTML translator and ``fmt='html'``."""
        html_options = dict(parser=HTMLParser, translator=CssHTMLTranslator, fmt='html')
        return cls.from_text(
            text,
            base_url=base_url,
            namespaces=namespaces,
            encoding=encoding,
            **html_options
        )
開發者ID:mcs07,項目名稱:ChemDataExtractor,代碼行數:4,代碼來源:selector.py

示例13: from_response

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
        """Call ``from_text`` with the content, URL and encoding of *response*."""
        return cls.from_text(
            response.content,
            base_url=response.url,
            parser=parser,
            translator=translator,
            fmt=fmt,
            namespaces=namespaces,
            encoding=response.encoding,
        )
開發者ID:mcs07,項目名稱:ChemDataExtractor,代碼行數:4,代碼來源:selector.py

示例14: from_html

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def from_html(cls, response, namespaces=None):
        """Call ``from_response`` with the HTML parser, CSS-HTML translator and ``fmt='html'``."""
        html_options = dict(parser=HTMLParser, translator=CssHTMLTranslator, fmt='html')
        return cls.from_response(response, namespaces=namespaces, **html_options)
開發者ID:mcs07,項目名稱:ChemDataExtractor,代碼行數:4,代碼來源:selector.py

示例15: replace_img_url

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HTMLParser [as 別名]
def replace_img_url(self, content):
        """Rewrite ``<img>`` sources in *content* to ``pic4.zhimg.com`` ``_b`` URLs.

        *content* is converted with ``str`` and parsed as a full HTML document.
        A source of the form ``<id>.<ext>`` becomes
        ``https://pic4.zhimg.com/<id>_b.<ext>``; the extension is whatever
        follows the last dot. Images are left untouched when they have no
        ``src``, when the ``src`` has no dot, or when it already contains a
        URL scheme.

        :param content: HTML markup (any object accepted by ``str``).
        :returns: The serialised document as text.
        """
        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(str(content), parser=utf8_parser)

        for img in tree.xpath("//img"):
            src = img.get('src')
            # A missing src used to become the string 'None' and then fail to unpack.
            if not src or '://' in src:
                continue
            # Split on the last dot only, so ids containing dots no longer
            # raise ValueError on unpacking.
            pic_id, dot, pic_type = src.rpartition('.')
            if not dot:
                continue
            img.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
        return etree.tostring(tree, encoding=str)
開發者ID:knarfeh,項目名稱:zhihu2ebook,代碼行數:12,代碼來源:column.py


注:本文中的lxml.html.HTMLParser方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。