當前位置: 首頁>>代碼示例>>Python>>正文


Python util.ElementHelper類代碼示例

本文整理匯總了Python中util.ElementHelper的典型用法代碼示例。如果您正苦於以下問題:Python ElementHelper類的具體用法?Python ElementHelper怎麽用?Python ElementHelper使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了ElementHelper類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: get_clustered_records

    def get_clustered_records(cls, doctree):
        #get level_nodes_mapping
        all_level_nodes = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        #get max level and min level
        upper_bound = int(ElementHelper.get_element_depth(root))+1
        low_bound = int(body.get(px))+1

        for level in range(low_bound, upper_bound):

            level_nodes = all_level_nodes[level]
            #if parent is record node, then do not consider its children
            level_nodes = [node for node in level_nodes if not cls.is_node_or_ancestor_record(node)]

            for j in range(1,len(level_nodes)-1):
                left_node = level_nodes[j-1]
                #橫向比較
                right_bound = min(len(level_nodes), j+5)
                right_nodes = level_nodes[j:right_bound]

                #縱向比較
                down_nodes = right_nodes[0]
                right_nodes.extend(down_nodes)

                for right_node in right_nodes:
                    if cls.similar_check(left_node, right_node):
                        left_node.set(kg_record_mark,'1')
                        right_node.set(kg_record_mark, '1')
                        break

        record_groups = cls.merger_sibling_record_node(doctree)
        return record_groups
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:35,代碼來源:api3.py

示例2: similar_check

    def similar_check(cls, nodeA, nodeB):
        if nodeA.tag != nodeB.tag:
            return False
        #compare distinct nodes
        dnodesA = ElementHelper.get_children(nodeA)
        dnodesB = ElementHelper.get_children(nodeB)

        #dA is node_levels_mapping, rA is level_nodes_mapping
        dA, dB, rA, rB = {}, {}, {}, {}
        for node in dnodesA:
            #ignore <a> tag as distinct tag
            if node.tag == 'a': continue

            dA.setdefault(node.tag, []).append(int(node.get(px)))
            rA.setdefault(int(node.get(px)), []).append(node.tag)
        for node in dnodesB:
            if node.tag == 'a': continue

            dB.setdefault(node.tag, []).append(int(node.get(px)))
            rB.setdefault(int(node.get(px)), []).append(node.tag)

        if abs(len(dA)-len(dB))>1 or abs(len(rA)-len(rB))>1:
            return False

        #check distinct tag is same?
        for tag in dA:
            if tag not in ('em', 'b', 'br','i', 'font') and tag not in dB:
                return False

        sumA = sum([len(StringHelper.unique(rA[A])) for A in rA])
        sumB = sum([len(StringHelper.unique(rB[B])) for B in rB])
        if abs(sumA-sumB)>1:
            return False
        return True
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:34,代碼來源:api2.py

示例3: get_title_util

def get_title_util(body, title_text):
    if len(title_text) < 2:
        return None
    #1.計算節點文本與title_text的longest commen lenght
    candidate_nodes = []
    for tag in TITLE_TAG:
        nodes = ElementHelper.get_element_by_tag(body, tag)
        if nodes is None or len(nodes)<1: continue

        nodes = [node for node in nodes if is_possible_title_tag(node)]
        candidate_nodes.extend(nodes)

    mapping = {}
    for node in candidate_nodes:
        node_text = ElementHelper.element_text_content(node)
        # if len(node_text)==0 or len(node_text)>len(title_text): continue  #

        if len(node_text)==0: continue

        llength = longest_common_length(node_text, title_text)
        if llength >= 1:
            mapping[node] = llength

    if len(mapping)==0: return None
    #2.選擇長度最大的作為title節點,如果存在多個最大的,選擇最靠前的作為
    #title節點
    sorted_nodes = [node for node, _ in sorted(mapping.items(), key=lambda x:x[1], reverse=True)]
    max_len = mapping[sorted_nodes[0]]

    candidates = [node for node in sorted_nodes if mapping[node]==max_len]
    if len(candidate_nodes)==1:
        return sorted_nodes[0]
    else:
        candidates.sort(cmp=lambda x,y: ElementHelper.get_element_preorder_num(x)- ElementHelper.get_element_preorder_num(y), reverse=False)
        return candidates[0]
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:35,代碼來源:api2.py

示例4: is_cluster_all_links

def is_cluster_all_links(cluster):
    """ #p判斷是否是鏈接節點的集合。1.如果該集合中所有的文本節點都是鏈接節點,則屬於鏈接噪聲<a> text </a>或<li><a>text</a></li>的形式
    if all tags which contain links are <a> tag, then return True
    For example:
        <a> link </a>
        OR
        <li> <a> link </a> </li>
    """
    all_nodes_contain_text = []
    for node in cluster:
        children = ElementHelper.get_children(node)
        nodes_contain_text = [node for node in children if not ElementHelper.is_element_text_none(node)
            and node.tag not in ('em','strong','span','i','b')]
        all_nodes_contain_text.extend(nodes_contain_text)

    link_nodes = [node for node in all_nodes_contain_text if node.tag=='a' or node.getparent().tag=='a']
    other_nodes = [node for node in all_nodes_contain_text if node.tag!='a' and node.getparent().tag != 'a']

    link_nodes_text_number = cluster_text_number(link_nodes)
    other_nodes_text_number = cluster_text_number(other_nodes)

    if len(other_nodes)==0 or other_nodes_text_number==0:
        return True

    if 1.0 *link_nodes_text_number/other_nodes_text_number>2.0:
        return True

    return False
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:28,代碼來源:api2.py

示例5: get_aricle_cetd

def get_aricle_cetd(doctree):
    cetd_parse(doctree)
    body = ElementHelper.get_body(doctree)
    # ElementHelper.print_element(body)
    CleanTreeByMark(body)
    RemoveAttribute(body)
    return ElementHelper.element_text_content(body)
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:7,代碼來源:cetd.py

示例6: print_cluster_record

 def print_cluster_record(cls, clusters, doctree):
     ElementHelper.print_element(doctree)
     for cluster in clusters:
         if len(cluster)>1:
             print '===='*10
             nodes = clusters[cluster]
             for node in nodes:
                 print ElementHelper.get_xpath_by_element(node, doctree), node.get(py)
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:8,代碼來源:api2.py

示例7: get_headline_content_in_cleaned_body

    def get_headline_content_in_cleaned_body(body):
        headlin_tag = ['h1', 'h2', 'h3', 'h4']

        headline_contents = [ElementHelper.element_text_content(node)
                             for node in ElementHelper.get_elements_by_tagnames(body, headlin_tag)
                             if not ElementHelper.is_element_content_none(node)]

        return '\n'.join(headline_contents)
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:8,代碼來源:api2.py

示例8: CleanTreeByMark

def CleanTreeByMark(element):
    mark = long(element.get(kg_mark))
    if 0==mark:
        ElementHelper.remove_element(element)
    elif 1==mark:
        return
    else:
        for child in element:
            CleanTreeByMark(child)
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:9,代碼來源:cetd.py

示例9: get_clustered_records

    def get_clustered_records(cls, doctree):
        #get level_nodes_mapping
        all_level_nodes = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        #get max level and min level
        upper_bound = int(ElementHelper.get_element_depth(root))+1
        low_bound = int(body.get(px))+1

        #記錄相似的節點
        cluster={}

        for level in range(low_bound, upper_bound):

            level_nodes = all_level_nodes[level]

            #if parent is record node, then do not consider its children
            level_nodes = [node for node in level_nodes if not cls.is_node_or_ancestor_record(node)]
            #在同一個父親節點下進行比較
            # tag_names = set([node.getparent() for node in level_nodes])
            # tmp = {}
            # for tag in tag_names:
            #     for node in level_nodes:
            #         tmp.setdefault(tag, []).append(node)
            tmp = cls.segement(level_nodes)

            for k, nodes in tmp.items():
                # if len(nodes)==1:break
                first = None
                node_set = set()
                for i in range(1,len(nodes)):
                    if nodes[i].get(kg_record_mark)=='1':
                        continue
                    left_node = nodes[i-1]
                    # 和集合類的所有元素比較,查看是否有相同的
                    right_nodes=nodes[i:]
                    for node in right_nodes:
                        if cls.similar_check(left_node, node):
                            if first is None:
                                first = left_node
                                node_set.add(nodes[i-1])
                            left_node.set(kg_record_mark, '1')
                            node.set(kg_record_mark, '1')
                            node_set.add(node)
                if first is not None:
                    cluster[first]=node_set

        record_groups = cls.merger_sibling_record_node(doctree, cluster)
        # record_groups = cluster
        record_groups = {k:v for k,v in record_groups.items() if k.get(kg_record_mark)=='1'}

        return record_groups
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:54,代碼來源:api.py

示例10: get_meta_content

 def get_meta_content(doctree, metaAttrName, value):
     """Extract a given meta content form document.
     Example metaNames:
     (name, description)
     (name, keyword)
     (property, og:type)
     """
     meta  = ElementHelper.get_element_by_tag_attr(doctree, 'meta',metaAttrName, value)
     content = None
     if meta is not None and len(meta)>0:
         content = ElementHelper.get_attribute(meta[0], 'content')
     if content is not None:
         return normalize_word(content)
     return ''
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:14,代碼來源:api2.py

示例11: get_clustered_records

    def get_clustered_records(cls, doctree):
        #get level_nodes_mapping
        all_level_nodes = cls.bfs_tree(doctree)

        root = ElementHelper.get_root(doctree)
        body = ElementHelper.get_body(doctree)

        #get max level and min level
        upper_bound = int(ElementHelper.get_element_depth(root))+1
        low_bound = int(body.get(px))+1

        for level in range(low_bound, upper_bound):

            level_nodes = all_level_nodes[level]
            try:
                next_level_nodes = all_level_nodes[level+1]
            except KeyError:
                next_level_nodes=None
            #if parent is record node, then do not consider its children
            level_nodes = [node for node in level_nodes if not cls.is_node_or_ancestor_record(node)]

            for j in range(1,len(level_nodes)-1):
                left_node = level_nodes[j-1]
                #將橫向名稱相同的節點放到一起進行比較
                # right_bound = min(len(level_nodes), j+5)
                # right_nodes = level_nodes[j:right_bound]
                # #縱向比較
                # down_nodes = right_nodes[0]
                # right_nodes.extend(down_nodes)

                right_nodes = [node for node in level_nodes[j:] if node.tag==left_node.tag]
                #縱向查找
                # if next_level_nodes is not None:
                #     for node in next_level_nodes:
                #         if node.tag==left_node.tag:
                #             right_node.append(node)





                for right_node in right_nodes:
                    if cls.similar_check(left_node, right_node):
                        left_node.set(kg_record_mark,'1')
                        right_node.set(kg_record_mark, '1')
                        break

        record_groups = cls.merger_sibling_record_node(doctree)
        return record_groups
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:49,代碼來源:api2.py

示例12: merger_sibling_record_node

    def merger_sibling_record_node(cls, doctree, cluster):
        ''' 融合數據記錄
        1.首先對數據記錄進行修正,然後將連續的數據記錄放入到一個集合中
        將同層次相同標簽的節點的節點放入一個集合中,然後在就行糾正,具體詳見correct_record_mark
        :param doctree: 經過了初步的相似度比較之後標記了的DOM樹
        :param cluster: 初步的相似的數據記錄的集合
        :return:
        '''
        node_record_mapping = {}

        body = ElementHelper.get_body(doctree)
        thislevel = []
        thislevel.extend(body)
        # while thislevel:
        #     nextlevel = list()
        #     for node in thislevel:
        #         # correct nodes which
        #         cls.correct_record_mark(node)
        #
        #         if cls.is_node_or_ancestor_record(node):
        #             first_record_sibling = cls.find_first_sibling_record_node(node, doctree)
        #             node_record_mapping.setdefault(first_record_sibling, []).append(node)
        #ToDo 2016-04-20
        while thislevel:
            nextlevel = list()
            cls.correct_record_mark(thislevel, cluster)
            for node in thislevel:
                if len(node) > 0:
                    nextlevel.extend([child for child in node if not cls.is_node_or_ancestor_record(node)])
            thislevel = nextlevel

        return cluster
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:32,代碼來源:api.py

示例13: collect_urls

def collect_urls(html, base_url, encoding=None):
    """ only collect url
    :param html: page string
    :param base_url:
    :param encoding:
    :return: list of url
    """
    h = HtmlHelper()
    doctree = h.create_doc(html, encoding)
    a_tags = ElementHelper.get_elements_by_tagnames(doctree, 'a')
    for a in a_tags:
        link = a.get('href',None)
        link = m_strip(link)
        if link is None or len(link)<2:continue
        if link[0]=='#': continue #link to itself
        link = normalize_url(link, base_url)

        #if url in non visited set
        if is_url_visited(link, unvisited_url_set):
            continue

        if not should_collect_url(link, base_url):
            continue
        #if url not in same domain

        yield link
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:26,代碼來源:url.py

示例14: find_first_sibling_record_node

    def find_first_sibling_record_node(cls, element, doctree):
        '''找到element所在區域的起始節點
        1.首選查看element的xpath下表,如果其下表<2,表示element左邊沒有兄弟節點了,直接返回element
        2.如果element的xpath下標大於=2,表示element左邊有兄弟節點,那麽先找到element的父親(以便於訪問element的兄弟節點,然後index=設置為element的下表-2
        查看parentt[index]是否是數據區域,如果是繼續項左尋找,否則返回parent[index+1]
        '''
        parent = element.getparent()
        if len(parent)<2:
            return element

        element_xpath = ElementHelper.get_xpath_by_element(element, doctree)
        # print 'xpath: %s' %element_xpath
        element_last_index = StringHelper.get_digits(element_xpath.split('/')[-1])

        if element_last_index < 2:
            return element

        index = element_last_index - 2
        # print 'parent length:%d' %len(parent)
        while index >= 0:
            # print index
            if parent[index].get(kg_record_mark) == '1':
                index -= 1
            else:
                break
        return parent[index+1]
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:26,代碼來源:api2.py

示例15: is_node_or_children_record

    def is_node_or_children_record(cls, element):
        children = ElementHelper.get_children(element)

        marks = [child.get(kg_record_mark) for child in children]
        unique_marks = StringHelper.unique(marks)
        if len(unique_marks)==2:
            return True
        return False
開發者ID:actlea,項目名稱:TopicalCrawler,代碼行數:8,代碼來源:api2.py


注:本文中的util.ElementHelper類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。