This article collects typical usage examples of the Python method util.ElementHelper.get_elements_by_tagnames. If you are unsure what ElementHelper.get_elements_by_tagnames does or how to call it, the curated code examples below should help; you can also look further into the util.ElementHelper class that the method belongs to.
Three code examples of ElementHelper.get_elements_by_tagnames are shown below, ordered by popularity by default.
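The examples come from a Python 2 web-crawler project, and the project's own helpers (HtmlHelper, UrlItem, normalize_url and friends) are not reproduced on this page. To make the API easier to follow, here is a minimal, hypothetical sketch of what ElementHelper.get_elements_by_tagnames might look like if built on lxml; it accepts either a single tag name or a list of names, matching how the examples below call it. This is an assumption for illustration, not the project's actual implementation.

# Hypothetical sketch of ElementHelper.get_elements_by_tagnames, assuming the
# document tree is an lxml element; the real util.ElementHelper may differ.
from lxml import html as lxml_html


class ElementHelper(object):

    @staticmethod
    def get_elements_by_tagnames(doctree, tagnames):
        # Accept a single tag name ('a') or a list of names (['h1', 'h2']).
        if not isinstance(tagnames, (list, tuple)):
            tagnames = [tagnames]
        # lxml's iter(*tags) yields matching descendants in document order.
        return list(doctree.iter(*tagnames))


doc = lxml_html.fromstring('<div><h1>Title</h1><a href="/x">link</a></div>')
print([e.tag for e in ElementHelper.get_elements_by_tagnames(doc, ['h1', 'a'])])
# ['h1', 'a']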
Example 1: collect_urls
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def collect_urls(html, base_url, encoding=None):
    """ Collect URLs only.
    :param html: page string
    :param base_url: URL of the page the links were found on
    :param encoding: character encoding of the page, if known
    :return: generator of absolute URLs
    """
    h = HtmlHelper()
    doctree = h.create_doc(html, encoding)
    a_tags = ElementHelper.get_elements_by_tagnames(doctree, 'a')
    for a in a_tags:
        link = a.get('href', None)
        link = m_strip(link)
        if link is None or len(link) < 2:
            continue
        if link[0] == '#':  # link to an anchor on the same page
            continue
        link = normalize_url(link, base_url)
        # skip URLs that have already been seen
        if is_url_visited(link, unvisited_url_set):
            continue
        # skip URLs outside the allowed domain
        if not should_collect_url(link, base_url):
            continue
        yield link
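The loop above leans on normalize_url and should_collect_url to turn relative hrefs into absolute URLs and to keep the crawl inside the start domain; those helpers live elsewhere in the project and are not shown here. A plausible minimal version, offered purely as an assumption about their behaviour, could be built on the standard urlparse module:

# Hypothetical stand-ins for the project's normalize_url / should_collect_url;
# the names are taken from the calls above, the bodies are assumptions.
try:
    from urlparse import urljoin, urlparse       # Python 2, as used in these examples
except ImportError:
    from urllib.parse import urljoin, urlparse   # Python 3 fallback


def normalize_url(link, base_url):
    # Resolve the href against the page URL and drop any fragment.
    return urljoin(base_url, link).split('#', 1)[0]


def should_collect_url(link, base_url):
    # Only keep links that stay on the same host as the page being crawled.
    return urlparse(link).netloc == urlparse(base_url).netloc


print(normalize_url('news/1.html', 'http://example.com/index.html'))
# http://example.com/news/1.html
print(should_collect_url('http://other.com/a', 'http://example.com/'))
# False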
Example 2: get_headline_content_in_cleaned_body
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def get_headline_content_in_cleaned_body(body):
    headline_tags = ['h1', 'h2', 'h3', 'h4']
    headline_contents = [ElementHelper.element_text_content(node)
                         for node in ElementHelper.get_elements_by_tagnames(body, headline_tags)
                         if not ElementHelper.is_element_content_none(node)]
    return '\n'.join(headline_contents)
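get_headline_content_in_cleaned_body only needs a parsed body element, so it is easy to exercise directly. The snippet below is a hedged usage sketch: it builds the element with lxml and assumes the function above and the project's util.ElementHelper are importable and behave as described.

# Usage sketch; assumes util.ElementHelper and the function above are importable.
from lxml import html

body = html.fromstring('<body><h1>Top story</h1><p>body text</p><h2>Sub head</h2></body>')
print(get_headline_content_in_cleaned_body(body))
# expected: the h1/h2 text, i.e. 'Top story' and 'Sub head' on separate lines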
Example 3: get_link_word_by_pair
# Required import: from util import ElementHelper [as alias]
# Or: from util.ElementHelper import get_elements_by_tagnames [as alias]
def get_link_word_by_pair(docstring, base_url, supervisior=None, encoding='utf-8'):
    """ Collect URLs together with their anchor text.
    :param docstring: page string
    :param base_url: URL of the page the links were found on
    :param supervisior: optional classifier exposing predict(anchor_text)
    :param encoding: character encoding of the page
    :return: generator of UrlItem
    """
    h = HtmlHelper()
    doctree = h.create_doc(docstring, encoding)
    if isinstance(base_url, unicode):
        base_url = base_url.encode('utf-8')
    a_tags = ElementHelper.get_elements_by_tagnames(doctree, 'a')
    for a in a_tags:
        link = a.get('href', None)
        link = m_strip(link)
        if link is None or len(link) < 2:
            continue
        if link[0] == '#':  # link to an anchor on the same page
            continue
        link = normalize_url(link, base_url)
        # skip URLs that have already been seen
        if is_url_visited(link, unvisited_url_set):
            continue
        # if not should_collect_url(link, base_url):
        #     continue
        link_item = UrlItem()
        link_item['parent_url'] = base_url
        link_item['url'] = link
        link_item['anchor_text'] = ElementHelper.element_text_content(a).encode('utf-8')
        link_item['neigb_text'] = ''
        if supervisior is not None:
            link_item['label'], link_item['interestness'] = supervisior.predict(link_item['anchor_text'])
        else:
            link_item['label'], link_item['interestness'] = '1', 0.0  # '1' marks a negative sample
        yield link_item
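Because get_link_word_by_pair is a generator of UrlItem objects, a caller typically iterates over it with a fetched page and, optionally, a supervisior object exposing predict(anchor_text). The driver below is a hedged illustration: DummySupervisior, its label values and the HTML string are invented for the example, while UrlItem and the other helpers are assumed to come from the project.

# Illustrative driver; DummySupervisior is a made-up stand-in for the real
# classifier.  It only follows the "label '1' = negative sample" convention
# noted in the code above; the positive label value is an assumption.
class DummySupervisior(object):
    def predict(self, anchor_text):
        # Pretend anchors mentioning 'news' are interesting.
        if 'news' in anchor_text.lower():
            return '0', 0.9
        return '1', 0.1


page = '<html><body><a href="/news/1.html">Latest news</a></body></html>'
for item in get_link_word_by_pair(page, 'http://example.com/', DummySupervisior()):
    print('%s %s %s' % (item['url'], item['label'], item['interestness']))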