This article collects typical usage examples of the lxml.html.HtmlElement class in Python. If you are wondering what html.HtmlElement is for, how to use it, or what real code that works with it looks like, the curated examples below may help. You can also explore further usage examples from the containing module, lxml.html.
The sections below present 15 code examples involving html.HtmlElement, sorted by popularity by default; reader feedback on useful examples feeds back into that ranking.
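Before the examples, here is a minimal, self-contained sketch of how an HtmlElement is typically obtained and recognized; the HTML snippet is invented purely for illustration.

from lxml import html

# Parse a string into an lxml.html.HtmlElement (the type used throughout the examples below).
doc = html.fromstring('<html><body><p>Hello, <b>world</b>!</p></body></html>')
print(isinstance(doc, html.HtmlElement))   # True
print(doc.xpath('//p//text()'))            # ['Hello, ', 'world', '!']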
Example 1: _parse_node
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def _parse_node(
    self, node: HtmlElement, state: Dict[str, Any]
) -> Iterator[Sentence]:
    """Entry point for parsing all node types.

    :param node: The lxml HTML node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    :return: a *generator* of Sentences
    """
    # Processing on entry of node
    state = self._parse_section(node, state)
    state = self._parse_figure(node, state)
    if self.tabular:
        state = self._parse_table(node, state)
    state = self._parse_caption(node, state)
    yield from self._parse_paragraph(node, state)
Example 2: _fragments_from_string
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def _fragments_from_string(html_string):
    fragments = html.fragments_fromstring(html_string)
    if not len(fragments):
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if len(fragments[0].strip()) > 0:
            if len(fragments) == 1:
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            else:
                paragraph = _create_element('p')
                paragraph.text = fragments[0]
                fragments[1].addprevious(paragraph)
                fragments.insert(1, paragraph)
        fragments.pop(0)
        if not len(fragments):
            return []
    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()
    return fragments
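For reference, a minimal sketch of how html.fragments_fromstring reports a leading text node, which is exactly the case the function above normalizes; the input string is invented for illustration.

from lxml import html

fragments = html.fragments_fromstring('leading text<p>first</p><p>second</p>')
print(fragments[0])                                # 'leading text' (a plain string, not an HtmlElement)
print([f.tag for f in fragments[1:]])              # ['p', 'p']
print(isinstance(fragments[0], html.HtmlElement))  # False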
Example 3: test_scores
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def test_scores(self):
    """Return a list of dictionaries with test scores."""
    test_scores = []
    if isinstance(self.xp_test_scores, html.HtmlElement):
        count = int(self.get_clean_xpath(
            'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))
        for i in range(1, count + 1):
            data = {}
            data['name'] = extract_one(
                self.get_xp(self.xp_test_scores, './/h4//text()'))
            data['score'] = extract_one(
                self.get_xp(self.xp_test_scores, './/h5//text()'))
            data['description'] = ' '.join(self.get_xp(
                self.xp_test_scores, './/p[contains(@class,"description")]//text()'))
            data['date'] = extract_one(self.get_xp(
                self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()'))
            test_scores.append(data)
    return test_scores
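The example relies on project-specific helpers (get_clean_xpath, get_xp, extract_one), but the XPath count() idiom it builds on can be shown with plain lxml. A minimal sketch, with a made-up HTML snippet:

from lxml import html

doc = html.fromstring(
    '<div id="background-test-scores">'
    '<div id="scores-1"><h4>TOEFL</h4><h5>110</h5></div>'
    '<div id="scores-2"><h4>GRE</h4><h5>330</h5></div>'
    '</div>'
)
count = int(doc.xpath(
    'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))
print(count)  # 2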
Example 4: get_resp
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
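USER_AGENTS, get_proxies and add_protocol come from the surrounding project. Stripped of that plumbing, the core idea is just requests plus lh.fromstring; a minimal sketch (URL chosen only for illustration):

import requests
from lxml import html as lh

resp = requests.get('https://example.com', timeout=10)
doc = lh.fromstring(resp.text)        # an lxml.html.HtmlElement
print(doc.xpath('//title/text()'))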
Example 5: parse_html
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def parse_html(infile, xpath):
    """Filter HTML using XPath."""
    if not isinstance(infile, lh.HtmlElement):
        infile = lh.fromstring(infile)
    infile = infile.xpath(xpath)
    if not infile:
        raise ValueError("XPath {0} returned no results.".format(xpath))
    return infile
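Assuming parse_html is defined as above (with lxml.html imported as lh), a quick usage sketch on an invented snippet:

import lxml.html as lh

snippet = '<div><a href="https://example.com">example</a></div>'
print(parse_html(snippet, '//a/@href'))   # ['https://example.com']
print(parse_html(snippet, '//a/text()'))  # ['example']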
Example 6: extractor
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
    publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
    publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # a user-supplied XPath has the highest priority
                    or self.extract_from_meta(element)    # next, try the <meta> tags
                    or self.extract_from_text(element))   # as a last resort, extract from the body text
    return publish_time
Example 7: extract_from_user_xpath
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
    if publish_time_xpath:
        publish_time = ''.join(element.xpath(publish_time_xpath))
        return publish_time
    return ''
Example 8: extract_from_text
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extract_from_text(self, element: HtmlElement) -> str:
    text = ''.join(element.xpath('.//text()'))
    for dt in self.time_pattern:
        dt_obj = re.search(dt, text)
        if dt_obj:
            return dt_obj.group(1)
    else:
        return ''
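self.time_pattern is a list of regular expressions whose group(1) captures the datetime; its contents are not shown here, so the pattern below is a hypothetical example of that kind.

import re

# Hypothetical pattern in the style of self.time_pattern: group(1) must hold the datetime.
pattern = r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})'
text = '发布时间:2020-05-01 08:00:00 来源:示例网'
match = re.search(pattern, text)
print(match.group(1) if match else '')  # 2020-05-01 08:00:00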
Example 9: extract_from_meta
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extract_from_meta(self, element: HtmlElement) -> str:
    """
    Well-structured news sites often put the publish time in the <meta> tags,
    so the META data should be checked first.
    :param element: the DOM tree parsed from the page's HTML source
    :return: str
    """
    for xpath in PUBLISH_TIME_META:
        publish_time = element.xpath(xpath)
        if publish_time:
            return ''.join(publish_time)
    return ''
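PUBLISH_TIME_META is a list of META-oriented XPaths defined elsewhere in the project; the XPath below is a hypothetical entry of that kind, run against a made-up page.

from lxml import html

meta_xpath = '//meta[@property="article:published_time"]/@content'  # hypothetical entry
doc = html.fromstring(
    '<html><head>'
    '<meta property="article:published_time" content="2020-05-01T08:00:00+08:00">'
    '</head><body></body></html>'
)
print(''.join(doc.xpath(meta_xpath)))  # 2020-05-01T08:00:00+08:00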
Example 10: extractor
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extractor(self, element: HtmlElement, author_xpath=''):
    author_xpath = author_xpath or config.get('author', {}).get('xpath')
    if author_xpath:
        author = ''.join(element.xpath(author_xpath))
        return author
    text = ''.join(element.xpath('.//text()'))
    for pattern in self.author_pattern:
        author_obj = re.search(pattern, text)
        if author_obj:
            return author_obj.group(1)
    return ''
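self.author_pattern is likewise a list of regexes whose group(1) captures the author name; the pattern below is a hypothetical example of that kind.

import re

# Hypothetical pattern in the style of self.author_pattern: group(1) must hold the author name.
pattern = r'(?:作者|记者)[::\s]*([\u4e00-\u9fa5A-Za-z]{2,20})'
text = '作者:张三  编辑:李四'
m = re.search(pattern, text)
print(m.group(1) if m else '')  # 张三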
Example 11: extract
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_by_xpath(element, title_xpath)
             or self.extract_by_htag_and_title(element)
             or self.extract_by_title(element)
             or self.extract_by_htag(element)
             )
    return title.strip()
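The fallbacks above ultimately rely on raw signals such as the <title> tag and heading tags; a minimal sketch of those signals with plain lxml (the HTML is invented for illustration):

from lxml import html

doc = html.fromstring(
    '<html><head><title>Python tutorial - Example Site</title></head>'
    '<body><h1>Python tutorial</h1></body></html>'
)
print(doc.xpath('//title/text()'))  # ['Python tutorial - Example Site']
print(doc.xpath('//h1/text()'))     # ['Python tutorial']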
Example 12: normalize_node
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def normalize_node(element: HtmlElement):
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it can be converted to a p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
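USELESS_TAG, USELESS_ATTR and the helper functions come from the same project, but the etree.strip_tags step can be shown on its own with plain lxml. A minimal sketch:

from lxml import html, etree

node = html.fromstring('<p>Hello <span>world</span>!</p>')
etree.strip_tags(node, 'span')   # merge the span's text into the parent p
print(html.tostring(node, encoding='unicode'))  # <p>Hello world!</p>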
Example 13: iter_node
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def iter_node(element: HtmlElement):
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element)
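Assuming iter_node is defined as above, a quick sketch of the depth-first order it yields, on a made-up fragment:

from lxml import html

root = html.fromstring('<div><p>a<span>b</span></p><p>c</p></div>')
print([node.tag for node in iter_node(root)])  # ['div', 'p', 'span', 'p']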
Example 14: remove_node
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def remove_node(node: HtmlElement):
    """
    This is an in-place operation; nothing needs to be returned.
    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        parent.remove(node)
Example 15: drop_tag
# Required import: from lxml import html [as alias]
# Or: from lxml.html import HtmlElement [as alias]
def drop_tag(node: HtmlElement):
    """
    Only delete the tag itself; its text is merged into the parent.
    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        node.drop_tag()
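The difference between the last two helpers is easiest to see side by side; a minimal sketch using lxml's built-in drop_tag() and remove() directly, on invented markup:

from lxml import html

# drop_tag(): the tag disappears, but its text (and tail) stays in the parent.
doc = html.fromstring('<div>before <span>inner</span> after</div>')
doc.find('span').drop_tag()
print(html.tostring(doc, encoding='unicode'))   # <div>before inner after</div>

# remove(): the node is removed together with its text and tail.
doc = html.fromstring('<div>before <span>inner</span> after</div>')
span = doc.find('span')
span.getparent().remove(span)
print(html.tostring(doc, encoding='unicode'))   # <div>before </div>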