本文整理汇总了 Python 中 lxml.html.parse 方法的典型用法代码示例。如果您正苦于以下问题：Python html.parse 方法的具体用法？Python html.parse 怎么用？Python html.parse 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 lxml.html 的用法示例。
在下文中一共展示了 html.parse 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: _parse_tables
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
    """
    Collect the tables found in an already-parsed DOM.

    This version performs no work of its own: it always raises
    ``AbstractMethodError``. The sections below describe the contract an
    actual implementation is meant to honour.

    Parameters
    ----------
    doc : node-like
        The DOM from which to parse the table element.
    match : str or regular expression
        The text to search for in the DOM tree.
    attrs : dict
        A dictionary of table attributes that can be used to disambiguate
        multiple tables on a page.

    Returns
    -------
    list of node-like
        HTML <table> elements to be parsed into raw data.

    Raises
    ------
    ValueError
        If `match` does not match any text in the document.
    """
    raise AbstractMethodError(self)
示例2: _build_doc
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _build_doc(self):
    """
    Build a tree-like object that can be used to walk the DOM.

    This version performs no work of its own: it always raises
    ``AbstractMethodError``. The section below describes what an actual
    implementation is meant to return.

    Returns
    -------
    node-like
        The DOM from which to parse the table element.
    """
    raise AbstractMethodError(self)
示例3: _parse
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    """
    Try each requested parser flavor in turn and convert the tables it finds.

    Parameters
    ----------
    flavor
        Passed through ``_validate_flavor``; the result is iterated and each
        item is handed to ``_parser_dispatch`` to obtain a parser class.
    io
        The input given to every parser. If it exposes ``seekable()``, it is
        rewound with ``seek(0)`` after a parser fails.
    match : str or compiled regular expression
        Compiled with ``re.compile`` and passed to the parser.
    attrs, encoding, displayed_only
        Forwarded unchanged to the parser constructor.
    **kwargs
        Forwarded to ``_data_to_frame`` for every table.

    Returns
    -------
    list
        One ``_data_to_frame`` result per table; tables that raise
        ``EmptyDataError`` are skipped.

    Raises
    ------
    ValueError
        If a parser fails and `io` reports that it is not seekable.
    Exception
        If every flavor fails, the last failure is re-raised through
        ``raise_with_traceback``.
    """
    flavor = _validate_flavor(flavor)
    # re.compile also accepts an already-compiled pattern.
    compiled_match = re.compile(match)

    # Python 3 unbinds the ``except ... as`` name once the handler exits, so
    # the most recent failure is stashed here for the final re-raise.
    last_error = None
    for candidate in flavor:
        parser_cls = _parser_dispatch(candidate)
        table_parser = parser_cls(io, compiled_match, attrs, encoding,
                                  displayed_only)

        try:
            tables = table_parser.parse_tables()
        except Exception as caught:
            # Only io-like objects advertise ``seekable``; anything else is
            # simply handed to the next parser untouched.
            if hasattr(io, 'seekable'):
                if io.seekable():
                    # Rewind so the next parser starts from the beginning.
                    io.seek(0)
                else:
                    # No way to rewind: tell the user instead of feeding a
                    # half-consumed stream to the next parser.
                    raise ValueError('The flavor {} failed to parse your input. '
                                     'Since you passed a non-rewindable file '
                                     'object, we can\'t rewind it to try '
                                     'another parser. Try read_html() with a '
                                     'different flavor.'.format(candidate))
            last_error = caught
        else:
            # First parser that succeeds wins.
            break
    else:
        # Loop finished without a ``break``: every flavor failed.
        raise_with_traceback(last_error)

    frames = []
    for table in tables:
        try:
            frame = _data_to_frame(data=table, **kwargs)
        except EmptyDataError:  # empty table
            continue
        frames.append(frame)
    return frames
示例4: fetch_through_redirects
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def fetch_through_redirects(url):
    """
    Fetch *url* and keep following ``<meta>`` refresh redirects in the page.

    Each page is requested with ``requests.get`` (streaming, 10 second
    timeout, custom User-Agent) and parsed with ``html.parse``. Every element
    returned by ``META_XPATH(tree)`` is checked for a ``content`` attribute of
    the form ``0; url='...'``; the first match becomes the next URL to fetch.

    Parameters
    ----------
    url : str
        The address to start from.

    Returns
    -------
    tuple
        ``(resp, tree)`` for the last page fetched. The response has already
        been closed when it is returned.

    Raises
    ------
    Not200
        If any response in the chain has a status code other than 200.
    """
    tree = None
    # NOTE(review): there is no cap on the number of hops, so two pages that
    # meta-redirect to each other would loop forever - confirm whether a
    # limit is wanted.
    while True:
        resp = requests.get(
            url,
            verify=certifi.where(),
            headers={"User-Agent": USER_AGENT},
            timeout=10,
            stream=True,
        )
        try:
            if resp.status_code != 200:
                raise Not200(resp.status_code)
            # Convince urllib3 to decode gzipped pages.
            resp.raw.decode_content = True
            tree = html.parse(resp.raw)
        finally:
            resp.close()

        # Check for sneaky <meta> redirects.
        for meta in META_XPATH(tree):
            # ``get`` returns None when the attribute is absent, and
            # re.match(pattern, None) raises TypeError; fall back to an empty
            # string so such an element just counts as "no redirect".
            content = meta.get("content") or ""
            m = re.match(r"0;\s*url=['\"](.+?)['\"]", content)
            if m is not None:
                url = m.group(1)
                break
        else:
            # No redirect found on this page: this is the final document.
            return resp, tree
示例5: main
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def main():
    """
    Write the manga names and URLs from the alphabetical index to a CSV file.

    The index page is parsed with ``parse``, link texts and hrefs are pulled
    out with two XPath queries, and one line per link text is written to
    ``mangalist.csv`` after a header row. Double quotes are removed from each
    name before it is written.
    """
    with open("mangalist.csv", "w") as f:
        tree = parse("http://www.mangapanda.com/alphabetical")
        manga_name_list = tree.xpath("//ul[@class='series_alpha']/li/a/text()")
        manga_url_list = tree.xpath("//ul[@class='series_alpha']/li/a/@href")

        f.write("\"Manga Name\", URL\n")
        for position, raw_name in enumerate(manga_name_list):
            # Strip embedded quotes so they cannot break the quoted field.
            clean_name = raw_name.replace("\"", "")
            row = "\"{0}\", http://www.mangapanda.com{1}\n".format(
                clean_name, manga_url_list[position])
            f.write(row)
示例6: _parse_tables
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
    """
    Collect the tables found in an already-parsed DOM.

    This version performs no work of its own: it always raises
    ``com.AbstractMethodError``. The sections below describe the contract an
    actual implementation is meant to honour.

    Parameters
    ----------
    doc : tree-like
        The DOM from which to parse the table element.
    match : str or regular expression
        The text to search for in the DOM tree.
    attrs : dict
        A dictionary of table attributes that can be used to disambiguate
        multiple tables on a page.

    Returns
    -------
    tables : list of node-like
        A list of <table> elements to be parsed into raw data.

    Raises
    ------
    ValueError
        If `match` does not match any text in the document.
    """
    raise com.AbstractMethodError(self)
示例7: _parse_tables
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def _parse_tables(self, doc, match, attrs):
"""Return all tables from the parsed DOM.
Parameters
----------
doc : tree-like
The DOM from which to parse the table element.
match : str or regular expression
The text to search for in the DOM tree.
attrs : dict
A dictionary of table attributes that can be used to disambiguate
mutliple tables on a page.
Raises
------
ValueError
* If `match` does not match any text in the document.
Returns
-------
tables : list of node-like
A list of <table> elements to be parsed into raw data.
"""
raise NotImplementedError
示例8: get_lxml_elements
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def get_lxml_elements(url, element):
    """
    Return every descendant named *element* in the document at *url*.

    ``_skip_if_no('lxml')`` is called first, then the document is loaded
    with ``lxml.html.parse`` and queried with the XPath ``.//<element>``.
    """
    _skip_if_no('lxml')
    # Imported here, after the availability check above has run.
    from lxml.html import parse

    tree = parse(url)
    query = './/{0}'.format(element)
    return tree.xpath(query)
示例9: parse_rss
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def parse_rss(url=None, **kwargs):
    """
    Parse the feed at *url* with ``rssparser``.

    The URL is first opened through ``fetch(decode(url), **kwargs)``. If that
    raises ``ValueError`` or ``URLError``, *url* itself is handed to
    ``rssparser.parse``. Otherwise the fetched object is parsed: its full
    contents when ``speedparser`` is truthy, the object itself when not.

    Parameters
    ----------
    url : str, optional
        Location of the feed.
    **kwargs
        Forwarded to ``fetch``.

    Returns
    -------
    The value returned by ``rssparser.parse``.
    """
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Could not open it ourselves: let the parser try the raw URL.
        return rssparser.parse(url)

    try:
        # The read sits inside the try so that a failing ``read()`` still
        # reaches ``close()`` below instead of leaking the handle.
        content = f.read() if speedparser else f
        return rssparser.parse(content)
    finally:
        f.close()
示例10: xml2etree
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def xml2etree(f, xml=True, html5=False):
    """
    Parse *f* and return the resulting tree.

    Parameters
    ----------
    f
        The input handed to the selected ``parse`` function.
    xml : bool, default True
        When truthy, ``etree.parse`` is used and the other flag is ignored.
    html5 : bool, default False
        When truthy and ``html5parser`` is available, ``html5parser.parse``
        is used instead of ``html.parse``.

    Returns
    -------
    The parsed tree. When ``html5parser`` is falsy, the result of
    ``html.parse`` is wrapped in ``ElementTree``.
    """
    if xml:
        return etree.parse(f)

    if html5parser:
        loader = html5parser.parse if html5 else html.parse
        return loader(f)

    # html5lib's parser returns an Element, so we must convert it into an
    # ElementTree
    return ElementTree(html.parse(f))
示例11: grab_trending_gif_urls
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def grab_trending_gif_urls():
    """
    Return GIF URLs built from the first ten ``.gif-link img`` elements.

    The giphy.com front page is parsed, and for each selected image the last
    ``/<name>.gif`` part of its ``src`` is replaced with ``/giphy.gif`` and
    the result is prefixed with ``http:``.
    """
    root = parse("http://giphy.com").getroot()
    images = root.cssselect(".gif-link img")[:10]
    return [
        "http:" + re.sub(r"\/([^./])*\.gif", "/giphy.gif", image.attrib['src'])
        for image in images
    ]
示例12: parse_html
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def parse_html(fileobj, encoding):
    """
    Parse the file object *fileobj* and return the result of ``parse``.

    An ``HTMLParser`` is created with the given *encoding* and with
    ``remove_blank_text=True``, and is passed to ``parse`` along with
    *fileobj*.
    """
    html_parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    document = parse(fileobj, html_parser)
    return document
示例13: extract
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """Parse *response* and return the text nodes of every ``<h1>``."""
    return parse(response).xpath('//h1/text()')
示例14: extract
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """
    Yield ``(field, value)`` pairs scraped from *response*.

    The response is parsed once; each value is ``text(...)`` applied to the
    result of the field's XPath query. Pairs are produced in the order
    ``name``, ``country``, ``region``.
    """
    tree = parse(response)
    queries = (
        ("name", '//h1'),
        ("country", '//dd[@id="country"]'),
        ("region", '//dd[@id="region"]'),
    )
    for field, query in queries:
        yield field, text(tree.xpath(query))
示例15: extract
# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import parse [as 别名]
def extract(response):
    """Parse *response* and return ``text`` applied to its ``<h1>`` text nodes."""
    headings = parse(response).xpath('//h1/text()')
    return text(headings)