当前位置: 首页>>代码示例>>Python>>正文


Python html.HTMLParser方法代码示例

本文整理汇总了Python中lxml.html.HTMLParser类的典型用法代码示例。如果您正苦于以下问题:Python html.HTMLParser类的具体用法?Python html.HTMLParser怎么用?Python html.HTMLParser使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块lxml.html的用法示例。


在下文中一共展示了html.HTMLParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse_rsc_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def parse_rsc_html(htmlstring):
    """Parse messy RSC HTML and wrap orphan text in ``p.otherpara`` elements.

    The markup is passed through ``UnicodeDammit`` to detect its encoding and
    parsed with a recovering ``HTMLParser``. The children of the element with
    id ``wrapper`` are then walked: non-blank text trailing a block element is
    moved into a new ``<p class="otherpara">``, and following siblings are
    appended to that paragraph until a block element, an element whose id
    starts with ``sect``, or the last child is reached.

    :param htmlstring: The raw HTML document.
    :returns: The root element of the parsed and repaired tree.
    :raises UnicodeDecodeError: If no encoding could be detected.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError requires (encoding, object, start, end, reason);
        # calling it with a single string raises TypeError instead.
        # NOTE(review): assumes ``tried_encodings`` holds encoding names or
        # (name, errors) tuples - confirm against the installed bs4 version.
        tried = ', '.join(
            str(entry[0] if isinstance(entry, tuple) else entry)
            for entry in getattr(converted, 'tried_encodings', None) or ()
        )
        raw = htmlstring if isinstance(htmlstring, bytes) else b''
        raise UnicodeDecodeError(
            'unknown', raw, 0, len(raw),
            'Failed to detect encoding, tried [%s]' % tried
        )
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    wrapper = root.get_element_by_id('wrapper')
    newp = None
    for child in wrapper:
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                # The run of orphan content ends here: place the paragraph
                # in front of the element that terminated it.
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    if newp is not None:
        # Text trailing the final block element was moved into a paragraph
        # that nothing attached; append it so the text is not dropped.
        wrapper.append(newp)
    return root
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:22,代码来源:rsc.py

示例2: parse_html_string

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def parse_html_string(s):
    """Parse *s* as a complete HTML document using a UTF-8 lxml parser.

    :param s: The HTML markup to parse.
    :returns: Whatever ``lxml.html.document_fromstring`` returns for *s*.
    """
    from lxml import html

    return html.document_fromstring(s, parser=html.HTMLParser(encoding='utf-8'))
开发者ID:booktype,项目名称:python-ooxml,代码行数:9,代码来源:importer.py

示例3: check_same_tpl

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def check_same_tpl(html_a, html_b):
    """Verify that two HTML pages have the same structure.

    Both pages are converted with ``str`` and parsed with a parser that
    removes blank text; the resulting trees are compared with
    ``elements_equal``.

    :param html_a: First HTML page.
    :param html_b: Second HTML page.
    :returns: *html_a*, unchanged, when the structures match.
    :raises Exception: If the two structures differ.
    """
    trees = [
        fromstring(str(page), parser=HTMLParser(remove_blank_text=True))
        for page in (html_a, html_b)
    ]
    if elements_equal(*trees):
        return html_a
    raise Exception("The two templates do not contain the same thing!")
开发者ID:UCL-INGI,项目名称:INGInious,代码行数:11,代码来源:jinja_migration.py

示例4: __init__

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def __init__(self, str):
        """Parse *str* into ``self._tree``, falling back to an empty ``div``.

        :param str: The HTML markup to parse (the name shadows the builtin
            but is part of the existing signature).
        """
        recovering_parser = html.HTMLParser(recover=True)
        try:
            self._tree = html.fromstring(str, parser=recovering_parser)
        except (XMLSyntaxError, ParserError) as err:
            # An empty document is not worth logging; any other failure on
            # non-empty input is.
            if str and not (isinstance(err, ParserError) and err.args[0] == 'Document is empty'):
                logger.exception('Failed to parse HTML string')
            self._tree = html.Element('div')
开发者ID:DMOJ,项目名称:online-judge,代码行数:9,代码来源:lxml_tree.py

示例5: __init__

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def __init__(self, *args, **kwargs):
        """Set up the renderer, consuming its own keyword options.

        ``nofollow`` (default ``True``) and ``texoid`` (default ``False``)
        are removed from *kwargs*; everything else is forwarded to the
        parent initializer.
        """
        self.nofollow = kwargs.pop('nofollow', True)
        if kwargs.pop('texoid', False):
            self.texoid = TexoidRenderer()
        else:
            self.texoid = None
        self.parser = HTMLParser()
        super(AwesomeRenderer, self).__init__(*args, **kwargs)
开发者ID:DMOJ,项目名称:online-judge,代码行数:7,代码来源:__init__.py

示例6: fragments_to_tree

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def fragments_to_tree(fragment):
    """Wrap the parsed pieces of an HTML fragment in a ``div`` element.

    :param fragment: HTML fragment markup.
    :returns: A ``div`` element holding the parsed content, or an empty
        ``div`` when parsing fails.
    """
    container = html.Element('div')
    try:
        pieces = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as err:
        # Stay quiet about empty documents; log every other failure on
        # non-empty input.
        if fragment and not (isinstance(err, ParserError) and err.args[0] == 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        return container

    # A leading string item is text, not an element; it becomes the
    # container's own text.
    if pieces and isinstance(pieces[0], str):
        container.text, pieces = pieces[0], pieces[1:]
    container.extend(pieces)
    return container
开发者ID:DMOJ,项目名称:online-judge,代码行数:16,代码来源:__init__.py

示例7: parse_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def parse_html(fileobj, encoding):
    """Parse the HTML in *fileobj* and return the resulting tree.

    :param fileobj: File object handed to ``parse``.
    :param encoding: Encoding given to the ``HTMLParser``; blank text is
        removed during parsing.
    :returns: The value returned by ``parse`` (an ElementTree instance
        according to the original documentation).
    """
    return parse(fileobj, HTMLParser(remove_blank_text=True, encoding=encoding))
开发者ID:datalib,项目名称:libextract,代码行数:9,代码来源:core.py

示例8: parse

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def parse(src):
    """ Returns an element tree created by `LXML <http://lxml.de/>`_.

       Objects without a ``read`` attribute are returned unchanged. If
       parsing leaves the tree without a root, ``UNPARSEABLE`` is installed
       as the root.

       :param src: A readable object such as a :class:`wex.response.Response`.
    """
    if not hasattr(src, 'read'):
        return src

    tree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # lxml quotes a whole URL when it contains characters it does not
        # accept (e.g. "http:/foo.com/bar?this=array[]"). Rather than
        # detecting that case, the URL is always quoted here and unquoted
        # again in the `base_url` function.
        if src.url:
            quoted_base_url = quote_base_url(src.url)
        else:
            quoted_base_url = src.url
        while True:
            try:
                # `unicode_fp` is a Unicode stream. The lxml FAQ calls this
                # inefficient:
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but it appears to work fine when the parser is told to
                # use 'utf-8'.
                unicode_fp = replace_invalid_ncr(stream)
                tree.parse(unicode_fp, parser=HTMLParser(encoding='utf-8'), base_url=quoted_base_url)
            except UnicodeDecodeError:
                # Decoding failed: move the stream to its next encoding and
                # try again.
                stream.next_encoding()
            else:
                break
    except IOError as exc:
        logging.getLogger(__name__).warning("IOError parsing %s (%s)", src.url, exc)

    if tree.getroot() is None:
        tree._setroot(UNPARSEABLE)

    return tree
开发者ID:eBay,项目名称:wextracto,代码行数:40,代码来源:etree.py

示例9: pre_parse

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def pre_parse(self):
    """Feed the start of the response to a parser to collect encodings.

    The response is rewound and read in chunks of ``PRE_PARSE_CHUNK_SIZE``
    into an ``HTMLParser`` whose target is an ``HTMLEncodings`` object
    built from the ``content-type`` header. On the first chunk a byte
    order mark is looked up in ``BOM_ENC`` and recorded in ``self.bom``.

    :returns: ``target.encodings``, the list held by the ``HTMLEncodings``
        target (a ``('bom', ...)`` entry is appended here when a BOM is
        found).
    """

    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0

    self.response.seek(0)
    # NOTE(review): the loop runs while `target` is truthy; presumably the
    # target turns falsy once it needs no more input - confirm in
    # HTMLEncodings.
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            # End of input: close the parser, ignoring syntax errors.
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            # Only reached while no BOM decision has been made; the chunk
            # must be long enough to hold the longest (4-byte) prefix.
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try prefixes of length 4, 3, then 2 - longest first.
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break

        parser.feed(chunk)
        total_bytes += len(chunk)
        # Stop once the pre-parse byte budget has been reached.
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break

    return target.encodings
开发者ID:eBay,项目名称:wextracto,代码行数:36,代码来源:htmlstream.py

示例10: create_html_parser

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def create_html_parser(headers):
    """Build an ``HTMLParser`` for the charset declared in *headers*.

    A charset that resolves to the ``iso8859-1`` codec is replaced with
    ``windows-1252``; charsets unknown to ``codecs`` are passed through
    unchanged.

    :param headers: Object providing ``get_content_charset()``.
    :returns: An ``HTMLParser`` configured with the chosen encoding.
    """
    charset = headers.get_content_charset()
    if charset:
        try:
            codec_name = codecs.lookup(charset).name
        except LookupError:
            codec_name = None
        if codec_name == 'iso8859-1':
            charset = 'windows-1252'

    # With no charset in the Content-Type this is None, and encoding=None
    # produces the default (ISO 8859-1) behavior.
    return HTMLParser(encoding=charset)
开发者ID:eBay,项目名称:wextracto,代码行数:14,代码来源:form.py

示例11: from_text

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
        """Build an instance of *cls* by parsing *text*.

        :param text: Markup to parse.
        :param base_url: Base URL passed to ``fromstring``; when truthy,
            links are made absolute if the root supports it.
        :param parser: Parser class, instantiated with ``recover=True`` and
            the encoding from ``cls._get_encoding``.
        :param translator: Forwarded to the constructor.
        :param fmt: Forwarded to the constructor.
        :param namespaces: Forwarded to the constructor.
        :param encoding: Encoding hint passed to ``cls._get_encoding``.
        :returns: A new instance of *cls* wrapping the parsed root.
        """
        log.debug('Parsing {} with {}'.format(fmt, parser))
        resolved_encoding = cls._get_encoding(text, encoding)
        parser_instance = parser(recover=True, encoding=resolved_encoding)
        root = fromstring(text, parser=parser_instance, base_url=base_url)
        if base_url:
            if hasattr(root, 'make_links_absolute'):
                root.make_links_absolute()
        return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:8,代码来源:selector.py

示例12: from_html_text

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
        """Build an instance from HTML text via ``from_text`` with the HTML
        parser, the CSS-to-HTML translator and the ``'html'`` format.
        """
        return cls.from_text(
            text,
            base_url=base_url,
            parser=HTMLParser,
            translator=CssHTMLTranslator,
            fmt='html',
            namespaces=namespaces,
            encoding=encoding,
        )
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:4,代码来源:selector.py

示例13: from_response

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
        """Build an instance from a response via ``from_text``.

        The response's ``content``, ``url`` and ``encoding`` attributes are
        used as the text, base URL and encoding respectively.
        """
        body = response.content
        source_url = response.url
        return cls.from_text(
            body,
            source_url,
            parser,
            translator,
            fmt,
            namespaces=namespaces,
            encoding=response.encoding,
        )
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:4,代码来源:selector.py

示例14: from_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def from_html(cls, response, namespaces=None):
        """Build an instance from an HTML response via ``from_response`` with
        the HTML parser, the CSS-to-HTML translator and the ``'html'`` format.
        """
        return cls.from_response(
            response,
            parser=HTMLParser,
            translator=CssHTMLTranslator,
            fmt='html',
            namespaces=namespaces,
        )
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:4,代码来源:selector.py

示例15: replace_img_url

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HTMLParser [as 别名]
def replace_img_url(self, content):
        """Rewrite every ``<img>`` source in *content* to a zhimg.com URL.

        A source of the form ``<id>.<type>`` becomes
        ``https://pic4.zhimg.com/<id>_b.<type>``. Images with no ``src``
        attribute, or whose ``src`` contains no dot, are left untouched
        instead of raising ``ValueError``.

        :param content: HTML content; converted with ``str`` before parsing.
        :returns: The serialized document as a string.
        """
        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(str(content), parser=utf8_parser)

        for _pic_link in tree.xpath("//img"):
            src = _pic_link.get('src')
            if not src:
                # Missing or empty src: nothing to rewrite.
                continue
            # Split on the last dot so an id that itself contains dots does
            # not break the two-value unpacking.
            pic_id, dot, pic_type = str(src).rpartition('.')
            if not dot:
                # No extension to separate from the id.
                continue
            _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
        replaced_content = etree.tostring(tree, encoding=str)
        return replaced_content
开发者ID:knarfeh,项目名称:zhihu2ebook,代码行数:12,代码来源:column.py


注:本文中的lxml.html.HTMLParser方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。