Python html.parse方法代码示例

本文整理汇总了Python中lxml.html.parse方法的典型用法代码示例。如果您正苦于以下问题：Python html.parse方法的具体用法？Python html.parse怎么用？Python html.parse使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.html的用法示例。

在下文中一共展示了html.parse方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _parse_tables

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
    """
    Collect every table found in an already-parsed DOM.

    This implementation is a stub: it only raises
    ``AbstractMethodError(self)``. The sections below describe the
    contract stated for the method.

    Parameters
    ----------
    doc : node-like
        The DOM from which to parse the table element.
    match : str or regular expression
        The text to search for in the DOM tree.
    attrs : dict
        A dictionary of table attributes that can be used to disambiguate
        multiple tables on a page.

    Returns
    -------
    list of node-like
        HTML <table> elements to be parsed into raw data.

    Raises
    ------
    ValueError
        If `match` does not match any text in the document.
    AbstractMethodError
        Always, in this stub implementation.
    """
    raise AbstractMethodError(self)

开发者ID:Frank-qlu，项目名称:recruit，代码行数:27，代码来源:html.py

示例2: _build_doc

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _build_doc(self):
    """
    Produce the tree-like object used to iterate over the DOM.

    This implementation is a stub: it only raises
    ``AbstractMethodError(self)``.

    Returns
    -------
    node-like
        The DOM from which to parse the table element.

    Raises
    ------
    AbstractMethodError
        Always, in this stub implementation.
    """
    raise AbstractMethodError(self)

开发者ID:Frank-qlu，项目名称:recruit，代码行数:12，代码来源:html.py

示例3: _parse

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    """
    Try each parser flavor in turn and convert the found tables to frames.

    The validated ``flavor`` sequence is walked in order. For each entry a
    parser is built from ``io``, the compiled ``match`` pattern, ``attrs``,
    ``encoding`` and ``displayed_only``; the first parser whose
    ``parse_tables()`` succeeds wins. Every resulting table is passed to
    ``_data_to_frame`` together with ``**kwargs``; tables that raise
    ``EmptyDataError`` are skipped.

    Returns
    -------
    list
        One ``_data_to_frame`` result per non-empty table.

    Raises
    ------
    ValueError
        If a parser failed and ``io`` reports that it is not seekable, so
        it cannot be rewound for the next flavor.
    Exception
        If every flavor failed, the last caught exception is handed to
        ``raise_with_traceback``.
    """
    flavor = _validate_flavor(flavor)
    # re.compile also accepts an already compiled pattern
    pattern = re.compile(match)

    # Keep the exception in a separate name: Python 3 unbinds the
    # ``except ... as`` variable when the handler exits.
    last_error = None
    for candidate in flavor:
        parser_cls = _parser_dispatch(candidate)
        table_parser = parser_cls(io, pattern, attrs, encoding,
                                  displayed_only)

        try:
            tables = table_parser.parse_tables()
        except Exception as exc:
            # For io-like input, rewind before the next flavor is tried.
            if hasattr(io, 'seekable'):
                if io.seekable():
                    io.seek(0)
                elif not io.seekable():
                    # Cannot rewind: tell the user instead of retrying.
                    raise ValueError('The flavor {} failed to parse your input. '
                                     'Since you passed a non-rewindable file '
                                     'object, we can\'t rewind it to try '
                                     'another parser. Try read_html() with a '
                                     'different flavor.'.format(candidate))

            last_error = exc
        else:
            break
    else:
        raise_with_traceback(last_error)

    frames = []
    for table in tables:
        try:
            frame = _data_to_frame(data=table, **kwargs)
        except EmptyDataError:  # empty table
            continue
        frames.append(frame)
    return frames

开发者ID:Frank-qlu，项目名称:recruit，代码行数:40，代码来源:html.py

示例4: fetch_through_redirects

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def fetch_through_redirects(url):
    """
    Fetch ``url`` and keep following ``<meta>`` redirects found in the page.

    Each round issues a streaming GET, parses the raw body with
    ``html.parse`` and closes the response. The elements returned by
    ``META_XPATH(tree)`` are then scanned for a ``content`` attribute of
    the form ``0; url='...'``; the first match becomes the next URL to
    fetch. When no element matches, the loop ends.

    Returns the last response (already closed) and its parsed tree.

    Raises ``Not200`` when a response status code is not 200.
    """
    tree = None
    while True:
        resp = requests.get(
            url,
            verify=certifi.where(),
            headers={"User-Agent": USER_AGENT},
            timeout=10,
            stream=True,
        )
        try:
            if resp.status_code != 200:
                raise Not200(resp.status_code)
            # Convince urllib3 to decode gzipped pages.
            resp.raw.decode_content = True
            tree = html.parse(resp.raw)
        finally:
            resp.close()

        # Check for sneaky <meta> redirects.
        redirect_target = None
        for meta in META_XPATH(tree):
            # A <meta> element without a ``content`` attribute yields None,
            # which re.match would reject with TypeError; treat it as
            # "no redirect here" instead.
            content = meta.get("content") or ""
            m = re.match(r"0;\s*url=['\"](.+?)['\"]", content)
            if m is not None:
                redirect_target = m.group(1)
                break

        if redirect_target is None:
            return resp, tree
        url = redirect_target

开发者ID:benjaminp，项目名称:httpswatch，代码行数:31，代码来源:check_https.py

示例5: main

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def main():
    """
    Write the mangapanda alphabetical listing to ``mangalist.csv``.

    The page is parsed and queried for the link texts and hrefs under
    ``ul.series_alpha``; each pair becomes one ``"name", url`` row after a
    header row. Double quotes are stripped from the names.
    """
    # Download and query first: opening the file in "w" mode truncates it,
    # so a failed fetch must not be allowed to wipe an existing list.
    tree = parse("http://www.mangapanda.com/alphabetical")
    manga_name_list = tree.xpath("//ul[@class='series_alpha']/li/a/text()")
    manga_url_list = tree.xpath("//ul[@class='series_alpha']/li/a/@href")

    with open("mangalist.csv", "w") as f:
        f.write("\"Manga Name\", URL\n")

        # Walk both result lists in lockstep instead of indexing by position.
        for manga_name, manga_url in zip(manga_name_list, manga_url_list):
            f.write("\"{0}\", http://www.mangapanda.com{1}\n".format(
                manga_name.replace("\"", ""), manga_url))

开发者ID:AnimeshShaw，项目名称:MangaScrapper，代码行数:11，代码来源:MangaList.py

示例6: _parse_tables

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
    """Collect every table found in an already-parsed DOM.

    This implementation is a stub: it only raises
    ``com.AbstractMethodError(self)``. The sections below describe the
    contract stated for the method.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.
    match : str or regular expression
        The text to search for in the DOM tree.
    attrs : dict
        A dictionary of table attributes that can be used to disambiguate
        multiple tables on a page.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.

    Raises
    ------
    ValueError
        * If `match` does not match any text in the document.
    AbstractMethodError
        Always, in this stub implementation.
    """
    raise com.AbstractMethodError(self)

开发者ID:birforce，项目名称:vnpy_crypto，代码行数:28，代码来源:html.py

示例7: _parse_tables

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
        """Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : tree-like
            The DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            mutliple tables on a page.

        Raises
        ------
        ValueError
            * If `match` does not match any text in the document.

        Returns
        -------
        tables : list of node-like
            A list of <table> elements to be parsed into raw data.
        """
        raise NotImplementedError

开发者ID:ktraunmueller，项目名称:Computable，代码行数:28，代码来源:html.py

示例8: get_lxml_elements

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def get_lxml_elements(url, element):
    """
    Return the result of the ``.//<element>`` XPath query on ``url``.

    ``_skip_if_no('lxml')`` is called first (presumably to skip the calling
    test when lxml is unavailable -- confirm against that helper); lxml is
    imported only afterwards.
    """
    _skip_if_no('lxml')
    from lxml.html import parse as lxml_parse
    return lxml_parse(url).xpath('.//{0}'.format(element))

开发者ID:ktraunmueller，项目名称:Computable，代码行数:7，代码来源:test_html.py

示例9: parse_rss

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def parse_rss(url=None, **kwargs):
    """
    Parse the feed at ``url`` with ``rssparser`` and return the result.

    The url is first opened via ``fetch(decode(url), **kwargs)``. If that
    raises ``ValueError`` or ``URLError``, the url itself is handed to
    ``rssparser.parse``. Otherwise the opened object is parsed: its content
    is read up front when ``speedparser`` is truthy, else the object itself
    is passed to the parser. The opened object is always closed.
    """
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Could not open it ourselves; let the feed parser try the raw url.
        return rssparser.parse(url)

    try:
        # The read sits inside the try so that a failing read() still
        # reaches the close() below instead of leaking the handle.
        content = f.read() if speedparser else f
        return rssparser.parse(content)
    finally:
        f.close()

开发者ID:nerevu，项目名称:riko，代码行数:16，代码来源:parsers.py

示例10: xml2etree

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def xml2etree(f, xml=True, html5=False):
    """
    Parse ``f`` and return the resulting element tree.

    * ``xml`` truthy: ``etree.parse(f)``.
    * otherwise, when ``html5parser`` is available: ``html5parser.parse(f)``
      if ``html5`` is truthy, else ``html.parse(f)``.
    * otherwise: ``html.parse(f)`` wrapped in ``ElementTree``.
    """
    if xml:
        return etree.parse(f)

    if html5parser:
        backend = html5parser if html5 else html
        return backend.parse(f)

    # html5lib's parser returns an Element, so we must convert it into an
    # ElementTree
    return ElementTree(html.parse(f))

开发者ID:nerevu，项目名称:riko，代码行数:15，代码来源:parsers.py

示例11: grab_trending_gif_urls

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def grab_trending_gif_urls():
    """
    Return rewritten image URLs for the first ten ``.gif-link img`` hits.

    The giphy front page is parsed, the first ten matching ``img`` elements
    are taken, and in each ``src`` the final ``/<name>.gif`` segment is
    replaced with ``/giphy.gif`` before ``http:`` is prepended.
    """
    page_root = parse("http://giphy.com").getroot()
    thumbnails = page_root.cssselect(".gif-link img")[:10]
    return [
        "http:" + re.sub(r"\/([^./])*\.gif", "/giphy.gif", img.attrib['src'])
        for img in thumbnails
    ]

开发者ID:agermanidis，项目名称:SnapchatBot，代码行数:9，代码来源:gifbot.py

示例12: parse_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj*, get an ElementTree instance.

    The document is parsed with an ``HTMLParser`` configured with the
    caller-supplied *encoding* and ``remove_blank_text=True``.
    """
    return parse(fileobj, HTMLParser(encoding=encoding, remove_blank_text=True))

开发者ID:datalib，项目名称:libextract，代码行数:9，代码来源:core.py

示例13: extract

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """Parse *response* and return the result of the ``//h1/text()`` query."""
    return parse(response).xpath('//h1/text()')

开发者ID:eBay，项目名称:wextracto，代码行数:5，代码来源:tutorial.py

示例14: extract

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """
    Parse *response* and lazily yield ``(label, value)`` pairs.

    Yields ``name``, ``country`` and ``region`` in that order; each value
    is ``text(...)`` applied to the matching XPath result.
    """
    document = parse(response)
    field_queries = (
        ("name", '//h1'),
        ("country", '//dd[@id="country"]'),
        ("region", '//dd[@id="region"]'),
    )
    for label, query in field_queries:
        yield label, text(document.xpath(query))

开发者ID:eBay，项目名称:wextracto，代码行数:7，代码来源:tutorial.py

示例15: extract

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """Parse *response* and return ``text(...)`` of its ``//h1/text()`` nodes."""
    headings = parse(response).xpath('//h1/text()')
    return text(headings)

开发者ID:eBay，项目名称:wextracto，代码行数:5，代码来源:tutorial.py

注：本文中的lxml.html.parse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。