当前位置: 首页>>代码示例>>Python>>正文


Python util.ElementHelper类代码示例

本文整理汇总了Python中util.ElementHelper的典型用法代码示例。如果您正苦于以下问题:Python ElementHelper类的具体用法?Python ElementHelper怎么用?Python ElementHelper使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了ElementHelper类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_clustered_records

    def get_clustered_records(cls, doctree):
        """Flag look-alike nodes on each tree level as records and merge them.

        Every level from one past the body's ``px`` value up to the depth
        reported for the root is scanned.  A node is compared against a short
        window of the nodes that follow it on the same level, plus the
        children of the first node of that window.  The first candidate that
        ``cls.similar_check`` accepts gets ``kg_record_mark`` set to ``'1'``,
        and so does the node it was compared with.

        :param doctree: parsed document tree to analyse.
        :return: the result of ``cls.merger_sibling_record_node(doctree)``.
        """
        # Level -> nodes on that level, as produced by bfs_tree.
        nodes_by_level = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        # Upper bound comes from the root's depth, lower bound from the value
        # stored under ``px`` on <body> (presumably its level -- confirm).
        stop_level = int(ElementHelper.get_element_depth(root)) + 1
        start_level = int(body.get(px)) + 1

        for level in range(start_level, stop_level):
            # Nodes for which is_node_or_ancestor_record is true are left out,
            # so the children of a record are not considered again.
            unmarked = [
                item for item in nodes_by_level[level]
                if not cls.is_node_or_ancestor_record(item)
            ]

            # NOTE(review): the scan position never reaches the last index, so
            # a level holding only two nodes is not compared at all -- confirm
            # that this is intended.
            for position, left_node in enumerate(unmarked[:-2], 1):
                # Horizontal comparison: at most five nodes starting at position.
                window = unmarked[position:position + 5]
                # Vertical comparison: add the children of the window's first node.
                window.extend(window[0])

                match = next(
                    (other for other in window
                     if cls.similar_check(left_node, other)),
                    None,
                )
                if match is not None:
                    left_node.set(kg_record_mark, '1')
                    match.set(kg_record_mark, '1')

        return cls.merger_sibling_record_node(doctree)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:35,代码来源:api3.py

示例2: similar_check

    def similar_check(cls, nodeA, nodeB):
        """Decide whether two nodes look structurally alike.

        The nodes must share the same tag.  For each node, the elements
        returned by ``ElementHelper.get_children`` (``<a>`` excluded) are
        summarised as two mappings: tag -> levels and level -> tags, where the
        level is the integer stored under the ``px`` attribute.  The nodes are
        considered similar when

        * the sizes of both mappings differ by at most one,
        * every tag seen under ``nodeA`` other than a few inline tags also
          occurs under ``nodeB`` (the reverse direction is not checked), and
        * the per-level counts of distinct tags, summed up, differ by at
          most one.

        :param nodeA: first element.
        :param nodeB: second element.
        :return: ``True`` if the nodes are judged similar, else ``False``.
        """
        if nodeA.tag != nodeB.tag:
            return False

        def summarize(nodes):
            """Return (tag -> levels, level -> tags) for *nodes*, skipping <a>."""
            levels_by_tag, tags_by_level = {}, {}
            for item in nodes:
                # <a> is not treated as a distinguishing tag.
                if item.tag == 'a':
                    continue
                level = int(item.get(px))
                levels_by_tag.setdefault(item.tag, []).append(level)
                tags_by_level.setdefault(level, []).append(item.tag)
            return levels_by_tag, tags_by_level

        children_a = ElementHelper.get_children(nodeA)
        children_b = ElementHelper.get_children(nodeB)
        tags_a, levels_a = summarize(children_a)
        tags_b, levels_b = summarize(children_b)

        if abs(len(tags_a) - len(tags_b)) > 1 or abs(len(levels_a) - len(levels_b)) > 1:
            return False

        # Inline formatting tags may be missing on the other side.
        optional_tags = ('em', 'b', 'br', 'i', 'font')
        if any(tag not in optional_tags and tag not in tags_b for tag in tags_a):
            return False

        distinct_a = sum(len(StringHelper.unique(levels_a[key])) for key in levels_a)
        distinct_b = sum(len(StringHelper.unique(levels_b[key])) for key in levels_b)
        return abs(distinct_a - distinct_b) <= 1
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:34,代码来源:api2.py

示例3: get_title_util

def get_title_util(body, title_text):
    """Find the node under *body* whose text best matches *title_text*.

    Candidate nodes are collected for every tag in ``TITLE_TAG`` and kept
    only when ``is_possible_title_tag`` accepts them.  Each candidate with
    non-empty text is scored with ``longest_common_length`` against
    *title_text*.  The best-scoring node is returned; when several nodes
    share the best score, the one with the smallest pre-order number wins.

    :param body: element whose subtree is searched.
    :param title_text: title string to match.  ``None`` or a string shorter
        than two characters means there is nothing to match.
    :return: the chosen node, or ``None`` when no candidate scores at least 1.
    """
    if not title_text or len(title_text) < 2:
        return None

    # 1. Score each candidate by its longest common length with title_text.
    candidate_nodes = []
    for tag in TITLE_TAG:
        nodes = ElementHelper.get_element_by_tag(body, tag)
        if nodes is None or len(nodes) < 1:
            continue
        candidate_nodes.extend(
            node for node in nodes if is_possible_title_tag(node))

    scores = {}
    for node in candidate_nodes:
        node_text = ElementHelper.element_text_content(node)
        if len(node_text) == 0:
            continue
        common = longest_common_length(node_text, title_text)
        if common >= 1:
            scores[node] = common

    if not scores:
        return None

    # 2. Keep the nodes with the highest score; among several, take the one
    #    with the smallest pre-order number.
    best_score = max(scores.values())
    best_nodes = [node for node in scores if scores[node] == best_score]
    if len(best_nodes) == 1:
        return best_nodes[0]
    # A key function replaces the former ``cmp=`` comparator, which Python 3
    # no longer accepts; min() returns the first of equally ranked nodes,
    # matching what the stable sort produced.
    return min(best_nodes, key=ElementHelper.get_element_preorder_num)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:35,代码来源:api2.py

示例4: is_cluster_all_links

def is_cluster_all_links(cluster):
    """Tell whether a cluster of nodes consists (almost) only of link text.

    A cluster whose text-bearing nodes are all links is treated as link
    noise, for example::

        <a> link </a>
        OR
        <li> <a> link </a> </li>

    Text-bearing nodes are taken from ``ElementHelper.get_children`` of each
    cluster member, ignoring nodes without text and the inline tags
    ``em``, ``strong``, ``span``, ``i`` and ``b``.  A node counts as a link
    node when it is an ``<a>`` or its parent is an ``<a>``.

    :param cluster: iterable of elements forming the cluster.
    :return: ``True`` when there are no non-link text nodes, when their text
        count is 0, or when the link text count is more than twice the
        non-link text count; otherwise ``False``.
    """
    inline_tags = ('em', 'strong', 'span', 'i', 'b')

    text_nodes = []
    for member in cluster:
        for item in ElementHelper.get_children(member):
            if ElementHelper.is_element_text_none(item):
                continue
            if item.tag in inline_tags:
                continue
            text_nodes.append(item)

    # Split the text-bearing nodes into link nodes and everything else.
    link_nodes, other_nodes = [], []
    for item in text_nodes:
        if item.tag == 'a' or item.getparent().tag == 'a':
            link_nodes.append(item)
        else:
            other_nodes.append(item)

    link_text_count = cluster_text_number(link_nodes)
    other_text_count = cluster_text_number(other_nodes)

    if not other_nodes or other_text_count == 0:
        return True

    return 1.0 * link_text_count / other_text_count > 2.0
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:28,代码来源:api2.py

示例5: get_aricle_cetd

def get_aricle_cetd(doctree):
    """Return the text of <body> after the cetd pass and clean-up.

    ``cetd_parse`` is run on *doctree* first; the body element is then
    passed through ``CleanTreeByMark`` and ``RemoveAttribute`` before its
    text content is extracted.

    :param doctree: parsed document tree; it is processed in place.
    :return: ``ElementHelper.element_text_content`` of the cleaned body.
    """
    cetd_parse(doctree)

    body_node = ElementHelper.get_body(doctree)
    # Prune by mark first, then strip attributes.
    CleanTreeByMark(body_node)
    RemoveAttribute(body_node)

    return ElementHelper.element_text_content(body_node)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:7,代码来源:cetd.py

示例6: print_cluster_record

 def print_cluster_record(cls, clusters, doctree):
     """Print the document followed by the members of each record cluster.

     The whole tree is printed through ``ElementHelper.print_element``.  For
     every key of *clusters* whose length is greater than 1, a separator line
     is printed and then, for each node stored under that key, the node's
     xpath and the value of its ``py`` attribute.

     :param clusters: mapping whose values are iterables of nodes.
     :param doctree: document tree the nodes belong to.
     """
     ElementHelper.print_element(doctree)
     for cluster in clusters:
         # NOTE(review): this tests the length of the mapping *key*, not of
         # the node collection stored under it -- confirm that is intended.
         if len(cluster) > 1:
             # Single-argument print(...) behaves the same as a Python 2
             # print statement and as the Python 3 print function.
             print('====' * 10)
             for node in clusters[cluster]:
                 print('%s %s' % (
                     ElementHelper.get_xpath_by_element(node, doctree),
                     node.get(py)))
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py

示例7: get_headline_content_in_cleaned_body

    def get_headline_content_in_cleaned_body(body):
        """Join the text of the h1-h4 elements found under *body*.

        Elements for which ``ElementHelper.is_element_content_none`` is true
        are skipped.

        :param body: element to search.
        :return: the heading texts joined with newlines, in the order the
            elements are returned by ``get_elements_by_tagnames``.
        """
        heading_tags = ['h1', 'h2', 'h3', 'h4']

        texts = []
        for heading in ElementHelper.get_elements_by_tagnames(body, heading_tags):
            if ElementHelper.is_element_content_none(heading):
                continue
            texts.append(ElementHelper.element_text_content(heading))

        return '\n'.join(texts)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py

示例8: CleanTreeByMark

def CleanTreeByMark(element):
    """Prune the subtree rooted at *element* according to its ``kg_mark``.

    The attribute named by ``kg_mark`` is read as an integer:

    * ``0`` -- the element is removed via ``ElementHelper.remove_element``;
    * ``1`` -- the element is kept and its children are not visited;
    * anything else -- every child is processed recursively.

    :param element: element carrying a ``kg_mark`` attribute.
    :raises TypeError: if the attribute is missing (``int(None)``).
    :raises ValueError: if the attribute is not an integer string.
    """
    # int() instead of the Python 2-only long(); Python 2 promotes large
    # values to long automatically, so the comparison below is unaffected.
    mark = int(element.get(kg_mark))
    if mark == 0:
        ElementHelper.remove_element(element)
    elif mark == 1:
        return
    else:
        # Iterate over a snapshot of the children: a recursive call may remove
        # the child (remove_element presumably detaches it from this
        # element), and mutating a container while iterating it can skip
        # siblings.
        for child in list(element):
            CleanTreeByMark(child)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:9,代码来源:cetd.py

示例9: get_clustered_records

    def get_clustered_records(cls, doctree):
        """Cluster similar nodes level by level and return the marked groups.

        For every scanned level the nodes that are not already part of a
        record are split into groups by ``cls.segement``.  Inside a group each
        node is compared (``cls.similar_check``) with all nodes from the
        following position onwards; similar pairs get ``kg_record_mark`` set
        to ``'1'`` and are collected in a set stored under the first node
        that found a match.  The clusters are then handed to
        ``cls.merger_sibling_record_node``.

        :param doctree: parsed document tree to analyse.
        :return: dict built from the merge result, keeping only entries whose
            key has ``kg_record_mark == '1'``.
        """
        # Level -> nodes on that level, as produced by bfs_tree.
        nodes_by_level = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        # Upper bound from the root's depth, lower bound from body's ``px``.
        stop_level = int(ElementHelper.get_element_depth(root)) + 1
        start_level = int(body.get(px)) + 1

        # First matching node of a group -> set of nodes found similar.
        cluster = {}

        for level in range(start_level, stop_level):
            # Children of nodes already flagged as records are not considered.
            unmarked = [
                item for item in nodes_by_level[level]
                if not cls.is_node_or_ancestor_record(item)
            ]
            # Comparison happens inside each group returned by segement
            # (the original note says: compare under the same parent).
            groups = cls.segement(unmarked)

            for _group_key, siblings in groups.items():
                group_head = None
                members = set()
                for idx in range(1, len(siblings)):
                    # A node flagged earlier in this scan is not used as the
                    # start of a new comparison window.
                    if siblings[idx].get(kg_record_mark) == '1':
                        continue
                    anchor = siblings[idx - 1]
                    # Compare the anchor with every node from idx onwards.
                    for candidate in siblings[idx:]:
                        if not cls.similar_check(anchor, candidate):
                            continue
                        if group_head is None:
                            group_head = anchor
                            members.add(anchor)
                        anchor.set(kg_record_mark, '1')
                        candidate.set(kg_record_mark, '1')
                        members.add(candidate)
                if group_head is not None:
                    cluster[group_head] = members

        merged = cls.merger_sibling_record_node(doctree, cluster)

        # Keep only groups whose key is still flagged as a record.
        record_groups = {}
        for head, nodes in merged.items():
            if head.get(kg_record_mark) == '1':
                record_groups[head] = nodes
        return record_groups
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:54,代码来源:api.py

示例10: get_meta_content

 def get_meta_content(doctree, metaAttrName, value):
     """Extract the content of a given <meta> element from the document.

     Example (metaAttrName, value) pairs:
     (name, description)
     (name, keyword)
     (property, og:type)

     :param doctree: parsed document tree.
     :param metaAttrName: attribute of <meta> to match on.
     :param value: value that attribute must have.
     :return: ``normalize_word`` applied to the ``content`` attribute of the
         first match, or ``''`` when there is no match or no content.
     """
     matches = ElementHelper.get_element_by_tag_attr(
         doctree, 'meta', metaAttrName, value)
     if matches is None or len(matches) == 0:
         return ''

     content = ElementHelper.get_attribute(matches[0], 'content')
     if content is None:
         return ''
     return normalize_word(content)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:14,代码来源:api2.py

示例11: get_clustered_records

    def get_clustered_records(cls, doctree):
        """Flag same-tag look-alike nodes on each level and merge them.

        On every scanned level a node is compared with the nodes from the
        following position onwards that carry the same tag.  The first one
        accepted by ``cls.similar_check`` gets ``kg_record_mark`` set to
        ``'1'``, together with the node it was compared with.

        :param doctree: parsed document tree to analyse.
        :return: the result of ``cls.merger_sibling_record_node(doctree)``.
        """
        # Level -> nodes on that level, as produced by bfs_tree.
        nodes_by_level = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        # Upper bound from the root's depth, lower bound from body's ``px``.
        stop_level = int(ElementHelper.get_element_depth(root)) + 1
        start_level = int(body.get(px)) + 1

        for level in range(start_level, stop_level):
            current = nodes_by_level[level]
            # The next level is looked up here but not used further down
            # (the vertical search that consumed it is disabled).
            try:
                next_level_nodes = nodes_by_level[level + 1]
            except KeyError:
                next_level_nodes = None
            # Children of nodes already flagged as records are not considered.
            current = [
                item for item in current
                if not cls.is_node_or_ancestor_record(item)
            ]

            # NOTE(review): the scan position never reaches the last index, so
            # a level holding only two nodes is not compared -- confirm.
            for position, left_node in enumerate(current[:-2], 1):
                # Horizontal comparison restricted to nodes with the same tag.
                same_tag = [
                    item for item in current[position:]
                    if item.tag == left_node.tag
                ]
                match = next(
                    (other for other in same_tag
                     if cls.similar_check(left_node, other)),
                    None,
                )
                if match is not None:
                    left_node.set(kg_record_mark, '1')
                    match.set(kg_record_mark, '1')

        return cls.merger_sibling_record_node(doctree)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:49,代码来源:api2.py

示例12: merger_sibling_record_node

    def merger_sibling_record_node(cls, doctree, cluster):
        """Merge data records by correcting record marks level by level.

        Starting with the children of <body>, each level is handed to
        ``cls.correct_record_mark`` together with *cluster* (see that method
        for the correction rules).  The next level is then built from the
        children of the current nodes.

        :param doctree: DOM tree already marked by the preliminary similarity
            comparison.
        :param cluster: preliminary collection of similar data records.
        :return: the *cluster* object that was passed in.
        """
        body = ElementHelper.get_body(doctree)

        current_level = list(body)
        while current_level:
            cls.correct_record_mark(current_level, cluster)

            following_level = []
            for parent in current_level:
                if len(parent) == 0:
                    continue
                # The record test is applied to the parent (evaluated once per
                # child): children of a record node are not descended into.
                following_level.extend(
                    child for child in parent
                    if not cls.is_node_or_ancestor_record(parent)
                )
            current_level = following_level

        return cluster
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:32,代码来源:api.py

示例13: collect_urls

def collect_urls(html, base_url, encoding=None):
    """Yield the URLs of the <a> elements in *html* that should be collected.

    A link is skipped when its stripped ``href`` is missing, shorter than two
    characters or starts with ``#``; when ``is_url_visited`` reports it for
    ``unvisited_url_set``; or when ``should_collect_url`` rejects it.

    :param html: page string
    :param base_url: URL used by ``normalize_url`` and ``should_collect_url``
    :param encoding: passed on to ``HtmlHelper.create_doc``
    :return: generator of normalized URLs
    """
    helper = HtmlHelper()
    doctree = helper.create_doc(html, encoding)

    for anchor in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
        href = m_strip(anchor.get('href', None))
        # Nothing usable, or a fragment link to the page itself.
        if href is None or len(href) < 2 or href[0] == '#':
            continue

        url = normalize_url(href, base_url)

        # Skip URLs reported by is_url_visited and those rejected by
        # should_collect_url.
        if is_url_visited(url, unvisited_url_set) or not should_collect_url(url, base_url):
            continue

        yield url
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:26,代码来源:url.py

示例14: find_first_sibling_record_node

    def find_first_sibling_record_node(cls, element, doctree):
        """Find the node that starts the record region containing *element*.

        1. The index in the last step of the element's xpath is inspected.
           If it is below 2 there is no sibling to the left and *element*
           itself is returned.
        2. Otherwise the parent's children are walked leftwards, starting at
           ``index - 2``, for as long as they carry ``kg_record_mark == '1'``;
           the child right after the first unmarked one is returned.

        If the parent has fewer than two children, *element* is returned
        without looking at the xpath.

        NOTE(review): the xpath index is used directly as a position in the
        parent's child list -- confirm the two always agree.

        :param element: node inside a record region.
        :param doctree: document tree used to compute the xpath.
        :return: the first node of the region.
        """
        parent = element.getparent()
        if len(parent) < 2:
            return element

        xpath = ElementHelper.get_xpath_by_element(element, doctree)
        # Only the last path step carries the element's own index.
        position = StringHelper.get_digits(xpath.rsplit('/', 1)[-1])
        if position < 2:
            return element

        cursor = position - 2
        while cursor >= 0 and parent[cursor].get(kg_record_mark) == '1':
            cursor -= 1
        return parent[cursor + 1]
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:26,代码来源:api2.py

示例15: is_node_or_children_record

    def is_node_or_children_record(cls, element):
        """Report whether exactly two distinct record-mark values occur.

        The ``kg_record_mark`` attribute is read from every node returned by
        ``ElementHelper.get_children(element)`` and the values are reduced
        with ``StringHelper.unique``.

        :param element: node to inspect.
        :return: ``True`` when the reduced collection has exactly two
            entries (presumably a mix of marked and unmarked nodes --
            confirm), else ``False``.
        """
        marks = [
            item.get(kg_record_mark)
            for item in ElementHelper.get_children(element)
        ]
        return len(StringHelper.unique(marks)) == 2
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py


注:本文中的util.ElementHelper类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。