This article collects typical usage examples of the Python method util.ElementHelper.get_elements_by_tagnames. If you are wondering what ElementHelper.get_elements_by_tagnames does, how to call it, or what real code using it looks like, the curated examples here may help. You can also look further into usage examples of the containing class, util.ElementHelper.
Three code examples of the ElementHelper.get_elements_by_tagnames method are shown below, sorted by popularity by default.
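Before the examples, here is a minimal sketch of the call pattern they all share. It is an illustration only: importing HtmlHelper from util is an assumption (the page documents only the ElementHelper import), and the sample HTML string is made up.

# Minimal usage sketch (assumption: HtmlHelper is importable from util, since the
# examples below construct it without showing its import).
from util import ElementHelper, HtmlHelper

html = '<html><body><a href="/about">about</a><h1>title</h1></body></html>'
doctree = HtmlHelper().create_doc(html, 'utf-8')
# The tag argument may be a single name ('a' in Example 1) or a list of names
# (['h1', 'h2', 'h3', 'h4'] in Example 2).
for a in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
    print(a.get('href'))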
Example 1: collect_urls
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def collect_urls(html, base_url, encoding=None):
    """ Only collect urls from a page.
    :param html: page string
    :param base_url: base url used to resolve relative links
    :param encoding: page encoding, if known
    :return: generator of urls
    """
    h = HtmlHelper()
    doctree = h.create_doc(html, encoding)
    a_tags = ElementHelper.get_elements_by_tagnames(doctree, 'a')
    for a in a_tags:
        link = a.get('href', None)
        link = m_strip(link)
        if link is None or len(link) < 2:
            continue
        if link[0] == '#':  # link to itself
            continue
        link = normalize_url(link, base_url)
        # skip urls already queued in the unvisited set
        if is_url_visited(link, unvisited_url_set):
            continue
        # skip urls that should not be collected (e.g. not in the same domain)
        if not should_collect_url(link, base_url):
            continue
        yield link
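A hypothetical call site for the generator above might look like the following. Fetching with requests is not part of the original module, and the sketch assumes unvisited_url_set is the same set-like container the generator already consults.

# Hypothetical driver for collect_urls (requests is an assumption; the original
# crawler's fetching code is not shown on this page).
import requests

base_url = 'http://example.com/'
page = requests.get(base_url)
for url in collect_urls(page.text, base_url, encoding='utf-8'):
    # queue the link for crawling; the generator skips anything already queued
    unvisited_url_set.add(url)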
Example 2: get_headline_content_in_cleaned_body
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def get_headline_content_in_cleaned_body(body):
    headline_tags = ['h1', 'h2', 'h3', 'h4']
    headline_contents = [ElementHelper.element_text_content(node)
                         for node in ElementHelper.get_elements_by_tagnames(body, headline_tags)
                         if not ElementHelper.is_element_content_none(node)]
    return '\n'.join(headline_contents)
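Wired into the same pipeline as Example 1, the helper above could be used roughly like this. It is a sketch only: it assumes get_elements_by_tagnames returns an indexable sequence and that the page contains a body element.

# Hypothetical usage: build the document tree, grab its <body>, and extract
# the headline text from it (assumes the helper returns an indexable sequence).
doctree = HtmlHelper().create_doc(html, 'utf-8')
body = ElementHelper.get_elements_by_tagnames(doctree, 'body')[0]
print(get_headline_content_in_cleaned_body(body))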
Example 3: get_link_word_by_pair
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def get_link_word_by_pair(docstring, base_url, supervisior=None, encoding='utf-8'):
    """ Collect urls and their anchor text from a page.
    :param docstring: page string
    :param base_url: base url used to resolve relative links
    :param supervisior: optional classifier with a predict(text) method
    :param encoding: page encoding
    :return: generator of UrlItem
    """
    h = HtmlHelper()
    doctree = h.create_doc(docstring, encoding)
    if isinstance(base_url, unicode):  # Python 2 text type
        base_url = base_url.encode('utf-8')
    a_tags = ElementHelper.get_elements_by_tagnames(doctree, 'a')
    for a in a_tags:
        link = a.get('href', None)
        link = m_strip(link)
        if link is None or len(link) < 2:
            continue
        if link[0] == '#':  # link to itself
            continue
        link = normalize_url(link, base_url)
        # skip urls already queued in the unvisited set
        if is_url_visited(link, unvisited_url_set):
            continue
        # if not should_collect_url(link, base_url):
        #     continue
        link_item = UrlItem()
        link_item['parent_url'] = base_url
        link_item['url'] = link
        link_item['anchor_text'] = ElementHelper.element_text_content(a).encode('utf-8')
        link_item['neigb_text'] = ''
        if supervisior is not None:
            link_item['label'], link_item['interestness'] = supervisior.predict(link_item['anchor_text'])
        else:
            link_item['label'], link_item['interestness'] = '1', 0.0  # '1' marks a negative sample
        yield link_item
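Finally, a minimal consumer of get_link_word_by_pair might look like the sketch below. It assumes UrlItem supports dict-style access (as used above), that supervisior.predict returns a (label, interestness) pair, and that '1' is the negative label per the fallback branch; page_html and my_classifier are hypothetical names.

# Hypothetical driver: classify each outgoing link's anchor text and keep only
# links the supervisior does not label as negative ('1').
for item in get_link_word_by_pair(page_html, 'http://example.com/', supervisior=my_classifier):
    if item['label'] != '1':
        print(item['url'], item['interestness'])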