本文整理汇总了Python中util.ElementHelper.get_body方法的典型用法代码示例。如果您正苦于以下问题:Python ElementHelper.get_body方法的具体用法?Python ElementHelper.get_body怎么用?Python ElementHelper.get_body使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类util.ElementHelper的用法示例。
在下文中一共展示了ElementHelper.get_body方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_aricle_cetd
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_aricle_cetd(doctree):
    """Return the text content of the document body after the cetd pass.

    Runs ``cetd_parse`` on the tree, hands the body element to
    ``CleanTreeByMark`` and ``RemoveAttribute`` (in that order), and then
    collects the text of what is left.

    :param doctree: parsed document tree accepted by ``ElementHelper.get_body``.
    :return: the value of ``ElementHelper.element_text_content`` for the body.
    """
    cetd_parse(doctree)
    content_root = ElementHelper.get_body(doctree)
    CleanTreeByMark(content_root)
    RemoveAttribute(content_root)
    article_text = ElementHelper.element_text_content(content_root)
    return article_text
示例2: get_clustered_records
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_clustered_records(cls, doctree):
    """Mark similar nodes level by level and return the merged record groups.

    For every level between the one just past ``body`` and the depth of the
    root, each node is compared (via ``cls.similar_check``) against a small
    window of nodes to its right plus the children of the first node in that
    window. The first similar pair found is flagged by setting the
    ``kg_record_mark`` attribute to ``'1'`` on both nodes.

    :param doctree: parsed document tree.
    :return: the result of ``cls.merger_sibling_record_node(doctree)``.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped -- confirm against the real file.
    nodes_by_level = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)
    # Level range: from one past the value stored on body under `px`
    # up to and including the depth reported for the root.
    stop_level = int(ElementHelper.get_element_depth(root)) + 1
    start_level = int(body.get(px)) + 1
    for depth in range(start_level, stop_level):
        # Nodes that are (or sit below) a record node are not compared again.
        candidates = [
            item for item in nodes_by_level[depth]
            if not cls.is_node_or_ancestor_record(item)
        ]
        total = len(candidates)
        # NOTE(review): the upper bound `total - 1` means the last adjacent
        # pair is never used as (left, right) -- confirm this is intended.
        for j in range(1, total - 1):
            anchor = candidates[j - 1]
            # Horizontal comparison: at most the next five nodes on this level.
            window = candidates[j:min(total, j + 5)]
            # Vertical comparison: also the children of the first window node.
            window.extend(window[0])
            for other in window:
                if cls.similar_check(anchor, other):
                    anchor.set(kg_record_mark, '1')
                    other.set(kg_record_mark, '1')
                    break
    return cls.merger_sibling_record_node(doctree)
示例3: merger_sibling_record_node
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def merger_sibling_record_node(cls, doctree, cluster):
    """Merge data records by correcting record marks level by level.

    Walks the tree breadth-first starting at the children of ``body``. On
    each level ``cls.correct_record_mark`` is called with the level's nodes
    and ``cluster``; see that method for the correction rules. Children of a
    node for which ``cls.is_node_or_ancestor_record`` is true are not
    descended into.

    :param doctree: DOM tree whose nodes were already marked by the initial
        similarity comparison.
    :param cluster: preliminary collection of similar data records; passed to
        ``cls.correct_record_mark`` on every level.
    :return: the same ``cluster`` object that was passed in.
    """
    # TODO(2016-04-20): marker left by the original author without details.
    body = ElementHelper.get_body(doctree)
    thislevel = list(body)
    while thislevel:
        cls.correct_record_mark(thislevel, cluster)
        nextlevel = []
        for node in thislevel:
            # The record test depends only on the parent, so evaluate it once
            # per parent instead of once per child. Leaf nodes are skipped
            # before the test, exactly as before.
            if len(node) > 0 and not cls.is_node_or_ancestor_record(node):
                nextlevel.extend(node)
        thislevel = nextlevel
    return cluster
示例4: html2words
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def html2words(docstring, base_url, encoding=None, supervisior=None):
    """Extract the main content of a page from its HTML source.

    :param docstring: HTML source of the page.
    :param base_url: URL of the page; stored in the result and passed to
        ``get_link_word_by_pair``.
    :param encoding: encoding forwarded to ``HtmlHelper.create_doc``.
    :param supervisior: forwarded unchanged to ``get_link_word_by_pair``.
    :return: a ``Document`` with the keys ``base_url``, ``title``, ``meta``,
        ``headlines``, ``para`` and ``url_items``; ``None`` when the input is
        too small or the tree cannot be built or pre-processed.
    """
    # sys.getsizeof reports the size of the in-memory object; inputs whose
    # object size is below 1024 are ignored.
    if sys.getsizeof(docstring) < 1024:
        return None

    docstring = docstring.lower()
    doctree = HtmlHelper.create_doc(docstring, encoding)
    if doctree is None:
        return None

    # Keep a copy taken before pre-processing; the meta description is read
    # from this copy further down.
    untouched_tree = copy.deepcopy(doctree)

    doctree = HtmlHelper.pre_process_domtree(doctree)
    if doctree is None:
        return None

    # Page title and paragraph content.
    para, title = HtmlHelper.get_article(doctree, debug=False)
    # Meta description.
    meta_description = HtmlHelper.get_meta_description(untouched_tree)
    # Headlines found in the processed body.
    headlines = HtmlHelper.get_headline_content_in_cleaned_body(
        ElementHelper.get_body(doctree)
    )
    # All links on the page.
    url_items = list(get_link_word_by_pair(docstring, base_url, supervisior))

    document = Document()
    fields = (
        ('base_url', base_url),
        ('title', title),
        ('meta', meta_description),
        ('headlines', headlines),
        ('para', para),
        ('url_items', url_items),
    )
    for key, value in fields:
        document[key] = value
    return document
示例5: get_clustered_records
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_clustered_records(cls, doctree):
    """Cluster similar nodes per level and return the confirmed record groups.

    On every level the unmarked nodes are split into groups by
    ``cls.segement``. Inside a group each node is compared (via
    ``cls.similar_check``) with all nodes to its right. Similar nodes get the
    ``kg_record_mark`` attribute set to ``'1'`` and are collected in a set
    keyed by the first left-hand node that found a match.

    :param doctree: parsed document tree.
    :return: dict built from the result of ``cls.merger_sibling_record_node``,
        restricted to keys whose ``kg_record_mark`` attribute is ``'1'``.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped; in particular which statements belong
    # under `if first is None` is a best guess -- confirm against the real file.
    nodes_by_level = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)
    # Level range: one past the value stored on body under `px`, up to and
    # including the depth reported for the root.
    stop_level = int(ElementHelper.get_element_depth(root)) + 1
    start_level = int(body.get(px)) + 1
    # Maps the first matching node of a group to the set of similar nodes.
    cluster = {}
    for depth in range(start_level, stop_level):
        # Nodes that are (or sit below) a record node are not compared again.
        candidates = [
            item for item in nodes_by_level[depth]
            if not cls.is_node_or_ancestor_record(item)
        ]
        # Comparisons only happen inside the groups produced by cls.segement.
        for group in cls.segement(candidates).values():
            first = None
            similar = set()
            for i in range(1, len(group)):
                if group[i].get(kg_record_mark) == '1':
                    continue
                anchor = group[i - 1]
                # Compare the anchor with every remaining node of the group.
                for other in group[i:]:
                    if not cls.similar_check(anchor, other):
                        continue
                    if first is None:
                        first = anchor
                    similar.add(anchor)
                    anchor.set(kg_record_mark, '1')
                    other.set(kg_record_mark, '1')
                    similar.add(other)
            if first is not None:
                cluster[first] = similar
    merged = cls.merger_sibling_record_node(doctree, cluster)
    return {k: v for k, v in merged.items() if k.get(kg_record_mark) == '1'}
示例6: get_article_title_element
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_article_title_element(doctree):
    """Locate the article title inside the document body.

    Takes the text of the node returned by ``HtmlHelper.get_title`` and hands
    it, together with the body element, to ``get_title_util``.

    :param doctree: parsed document tree.
    :return: the result of ``get_title_util`` (which may be ``None``), or
        ``None`` when the document has no title node.
    """
    body = ElementHelper.get_body(doctree)
    title_node = HtmlHelper.get_title(doctree)
    if title_node is None:
        return None
    return get_title_util(body, title_node.text)
示例7: get_page_link_ratio
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_page_link_ratio(doctree):
    """Return the ratio of link characters to all characters in the body.

    The four ``Count*`` helpers are run on the body first; the two counts are
    then read back from the body attributes named by ``kg_char_num`` and
    ``kg_linkchar_num``.

    :param doctree: parsed document tree.
    :return: link-character count divided by character count, with a zero
        character count treated as 1 to avoid division by zero.
    """
    body = ElementHelper.get_body(doctree)
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)
    total_chars = float(body.attrib.get(kg_char_num))
    link_chars = float(body.attrib.get(kg_linkchar_num))
    divisor = 1 if total_chars == 0 else total_chars
    return link_chars / divisor
示例8: bfs_tree
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def bfs_tree(cls, doctree):
    """Group the nodes below (and including) ``body`` by level.

    The walk is breadth-first. The level number of ``body`` is read from its
    ``px`` attribute and each deeper level increases it by one.

    :param doctree: parsed document tree.
    :return: dict mapping level number to the list of nodes on that level.
    """
    all_level_nodes = {}
    body = ElementHelper.get_body(doctree)
    level = int(body.get(px))
    thislevel = [body]
    while thislevel:
        all_level_nodes[level] = thislevel
        nextlevel = []
        for node in thislevel:
            # Iterating a node yields its children; a childless node simply
            # contributes nothing, so no per-child length check is needed.
            nextlevel.extend(node)
        thislevel = nextlevel
        level += 1
    return all_level_nodes
示例9: get_clustered_records
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_clustered_records(cls, doctree):
    """Mark similar same-tag nodes level by level and return merged records.

    For every level between the one just past ``body`` and the depth of the
    root, each node is compared (via ``cls.similar_check``) with the nodes to
    its right that share its tag. The first similar pair found is flagged by
    setting the ``kg_record_mark`` attribute to ``'1'`` on both nodes.

    :param doctree: parsed document tree.
    :return: the result of ``cls.merger_sibling_record_node(doctree)``.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped -- confirm against the real file.
    all_level_nodes = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)
    # Level range: one past the value stored on body under `px`, up to and
    # including the depth reported for the root.
    upper_bound = int(ElementHelper.get_element_depth(root)) + 1
    low_bound = int(body.get(px)) + 1
    for level in range(low_bound, upper_bound):
        # Nodes that are (or sit below) a record node are not compared again.
        level_nodes = [
            node for node in all_level_nodes[level]
            if not cls.is_node_or_ancestor_record(node)
        ]
        # NOTE(review): the upper bound `len - 1` means the last adjacent
        # pair is never used as (left, right) -- confirm this is intended.
        for j in range(1, len(level_nodes) - 1):
            left_node = level_nodes[j - 1]
            # Only nodes to the right with the same tag are compared.
            right_nodes = [
                node for node in level_nodes[j:] if node.tag == left_node.tag
            ]
            for right_node in right_nodes:
                if cls.similar_check(left_node, right_node):
                    left_node.set(kg_record_mark, '1')
                    right_node.set(kg_record_mark, '1')
                    break
    return cls.merger_sibling_record_node(doctree)
示例10: clean_body
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def clean_body(clusters, doctree, title_node=None, debug = False):
    """Reduce the document body to the span covered by the biggest cluster.

    The children of all nodes in the biggest cluster are ordered by their
    preorder index; the first and last index delimit the text span. When a
    title node precedes that span, the span start is moved up to the title.
    ``set_text_mark`` and ``remove_nontext_element`` are then applied to the
    body.

    :param clusters: mapping of cluster key to its nodes.
    :param doctree: parsed document tree.
    :param title_node: optional title element used to correct the span start.
    :param debug: when true, print the chosen cluster and its nodes.
    :return: ``(body, title_text)`` normally, where ``body`` is the cleaned
        body element; ``(title, title)`` with the title text in both slots
        when no cluster survives filtering.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped -- confirm against the real file.
    # Filter user comments and all-link records.
    clusters = filter_cluster(clusters)
    if len(clusters) == 0:
        title = ElementHelper.element_text_content(title_node)
        return title, title
    # Choose the cluster which has the most text.
    maxCluster = get_biggest_cluster(clusters)
    nodes = clusters[maxCluster]
    # Collect all children of the biggest cluster's records.
    allnodes = []
    for node in nodes:
        allnodes.extend(ElementHelper.get_children(node))
    # Order by preorder index. A key function gives the same order as the
    # former subtraction-based cmp function and also works on Python 3.
    allnodes.sort(key=ElementHelper.get_element_preorder_num)
    if debug:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('maxCluster: %s' % maxCluster)
        for n in allnodes:
            print('%s %s' % (ElementHelper.get_xpath_by_element(n, doctree), n.get(py)))
    s = ElementHelper.get_element_preorder_num(allnodes[0])
    t = ElementHelper.get_element_preorder_num(allnodes[-1])
    # Correct the start position with the title node. In experiments this
    # improved recall; the original author notes it is not needed in
    # production use.
    title_text = ''
    if title_node is not None:
        index = ElementHelper.get_element_preorder_num(title_node)
        if index < s:
            s = index  # TODO: added 2016/03/09
        # NOTE(review): assumed to run whenever a title node is given, not
        # only when the start was moved -- confirm.
        title_text = ElementHelper.element_text_content(title_node)
    body = ElementHelper.get_body(doctree)
    # Remove nodes which do not belong to the main text.
    set_text_mark(body, s, t)
    remove_nontext_element(body)
    return body, title_text
示例11: cetd_parse
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def cetd_parse(doctree):
    """Run the density-based content marking pipeline on the document body.

    Steps, in order: count characters, tags, link characters and link tags;
    derive the link-character ratio; compute text density and density sum;
    find the maximum density sum; reset marks with ``SetMark(body, 0)``;
    derive a threshold and mark content with it. All results are left on the
    tree by the helpers; nothing is returned.

    :param doctree: parsed document tree.
    """
    body = ElementHelper.get_body(doctree)
    CountChar(body)
    CountTag(body)
    CountLinkChar(body)
    CountLinkTag(body)

    total_chars = float(body.attrib.get(kg_char_num))
    if total_chars == 0:
        # Avoid division by zero on an empty body.
        total_chars = 1
    link_chars = float(body.attrib.get(kg_linkchar_num))
    link_ratio = link_chars / total_chars

    ComputeTextDensity(body, link_ratio)
    ComputeDensitySum(body, link_ratio)
    peak_density_sum = FindMaxDensitySum(body)
    SetMark(body, 0)
    MarkContent(body, GetThreshold(body, peak_density_sum))
示例12: get_article_wish
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def get_article_wish(clusters, doctree, title_node=None, debug = False):
    """Extract the article text covered by the biggest cluster.

    The children of all nodes in the biggest cluster are ordered by their
    preorder index; the first and last index delimit the text span. When a
    title node precedes that span, the span start is moved up to the title.
    ``set_text_mark`` and ``remove_nontext_element`` are then applied to the
    body before its text is collected.

    :param clusters: mapping of cluster key to its nodes.
    :param doctree: parsed document tree.
    :param title_node: optional title element used to correct the span start.
    :param debug: when true, print the chosen cluster and its nodes.
    :return: ``(text_list, title_text)`` normally, where ``text_list`` comes
        from ``ElementHelper.element_text_content_list``; only the title text
        (a single value, not a tuple) when no cluster survives filtering.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped -- confirm against the real file.
    clusters = filter_cluster(clusters)
    if len(clusters) == 0:
        return ElementHelper.element_text_content(title_node)
    maxCluster = get_biggest_cluster(clusters)
    nodes = clusters[maxCluster]
    # NOTE(review): the return value is discarded; the call is kept in case
    # filter_cluster changes `nodes` in place -- confirm whether it is needed.
    m = {}
    m[maxCluster] = nodes
    filter_cluster(m)
    if debug:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('maxCluster: %s' % maxCluster)
        for n in nodes:
            print(ElementHelper.get_xpath_by_element(n, doctree))
    allnodes = []
    for node in nodes:
        allnodes.extend(ElementHelper.get_children(node))
    # Order by preorder index. A key function gives the same order as the
    # former subtraction-based cmp function and also works on Python 3.
    allnodes.sort(key=ElementHelper.get_element_preorder_num)
    s = ElementHelper.get_element_preorder_num(allnodes[0])
    t = ElementHelper.get_element_preorder_num(allnodes[-1])
    title_text = ''
    if title_node is not None:
        index = ElementHelper.get_element_preorder_num(title_node)
        if index < s:
            s = index
        # NOTE(review): assumed to run whenever a title node is given, not
        # only when the start was moved -- confirm.
        title_text = ElementHelper.element_text_content(title_node)
    body = ElementHelper.get_body(doctree)
    set_text_mark(body, s, t)
    remove_nontext_element(body)
    return ElementHelper.element_text_content_list(body), title_text
示例13: merger_sibling_record_node
# 需要导入模块: from util import ElementHelper [as 别名]
# 或者: from util.ElementHelper import get_body [as 别名]
def merger_sibling_record_node(cls, doctree):
    """Group record nodes under their first record sibling.

    Walks the tree breadth-first starting at the children of ``body``. Every
    visited node is first passed to ``cls.correct_record_mark``; if it then
    counts as a record (``cls.is_node_or_ancestor_record``), it is appended
    to the list keyed by ``cls.find_first_sibling_record_node(node, doctree)``.
    Children of record nodes are not descended into.

    :param doctree: parsed document tree whose nodes carry record marks.
    :return: dict mapping a first record sibling to the list of its record
        nodes.
    """
    # NOTE(review): block nesting was reconstructed from a listing whose
    # indentation had been stripped -- confirm against the real file.
    node_record_mapping = {}
    body = ElementHelper.get_body(doctree)
    thislevel = list(body)
    while thislevel:
        for node in thislevel:
            # Correct the mark before deciding whether the node is a record.
            cls.correct_record_mark(node)
            if cls.is_node_or_ancestor_record(node):
                first_record_sibling = cls.find_first_sibling_record_node(node, doctree)
                node_record_mapping.setdefault(first_record_sibling, []).append(node)
        nextlevel = []
        for node in thislevel:
            # The record test depends only on the parent, so evaluate it once
            # per parent instead of once per child. Leaf nodes are skipped
            # before the test, exactly as before.
            if len(node) > 0 and not cls.is_node_or_ancestor_record(node):
                nextlevel.extend(node)
        thislevel = nextlevel
    return node_record_mapping