当前位置: 首页>>代码示例>>Python>>正文


Python etree.strip_tags方法代码示例

本文整理汇总了Python中lxml.etree.strip_tags方法的典型用法代码示例。如果您正苦于以下问题:Python etree.strip_tags方法的具体用法?Python etree.strip_tags怎么用?Python etree.strip_tags使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml.etree的用法示例。


在下文中一共展示了etree.strip_tags方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _retain_only_pars

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _retain_only_pars(tree):
    """Strip out all tags except title and p tags

    Function also changes title tags into p tags. This is a helpful
    preprocessing step that makes it easier to extract paragraphs in
    the order of a pre-ordered traversal.

    Modifies input tree inplace. The element passed in is never stripped
    itself; only its descendants are affected.

    Parameters
    ----------
    tree : :py:class:`lxml.etree._Element`
        etree element for valid NLM XML
    """
    # One traversal both renames the titles and records every other tag
    # name, so the tree only has to be stripped once afterwards instead of
    # once per element.
    tags_to_strip = set()
    for element in tree.xpath('.//*'):
        if element.tag == 'title':
            element.tag = 'p'
        elif element.tag != 'p':
            tags_to_strip.add(element.tag)
    # strip_tags merges the text and children of each removed element into
    # its parent, so the paragraph content itself is preserved.
    if tags_to_strip:
        etree.strip_tags(tree, *tags_to_strip)
开发者ID:sorgerlab,项目名称:indra,代码行数:23,代码来源:pmc_client.py

示例2: handle_quotes

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def handle_quotes(element):
    '''Build a flat copy of a quote element from its processed nodes.

    Every node yielded by ``element.iter()`` (the element itself included)
    is passed to ``process_node``; for each non-None result a new
    sub-element carrying the result's text and tail is added to the copy.
    All visited nodes are renamed to 'done' in the input tree.

    Returns the copy, with nested 'quote' tags unwrapped, or None when
    nothing was retained.
    '''
    result = etree.Element(element.tag)
    for node in element.iter():
        processed = process_node(node)
        if processed is not None:
            copied = etree.SubElement(result, node.tag)
            copied.text = processed.text
            copied.tail = processed.tail
        # flag the node as consumed in the source tree
        node.tag = 'done'
    if len(result) == 0:
        return None
    # avoid double/nested tags
    etree.strip_tags(result, 'quote')
    return result
开发者ID:adbar,项目名称:trafilatura,代码行数:21,代码来源:core.py

示例3: element_text

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def element_text(node):
    """Strip comment nodes from ``node`` in place and return its ``text``."""
    # Removing the comments first lets text that was split by a comment be
    # merged back together before it is read.
    ET.strip_tags(node, ET.Comment)
    text = node.text
    return text
开发者ID:onelogin,项目名称:onelogin-python-aws-assume-role,代码行数:5,代码来源:aws_assume_role.py

示例4: sanitize_tree

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def sanitize_tree(tree):
    '''Sanitize the output from the generic algorithm.

    Unwanted elements are removed and wrapper tags are unwrapped in place
    on ``tree``; the result is then passed through ``prune_html`` and
    ``convert_tags``. In the converted tree, text and tail of every node go
    through ``sanitize``, attributes are cleared (except on 'del' and 'hi'
    elements) and the table conversion is completed: 'tr' becomes 'row',
    'td' and 'th' become 'cell', with former 'th' cells marked by
    role="head".

    Returns the converted tree.
    '''
    etree.strip_elements(tree, 'audio', 'fieldset', 'iframe', 'image', 'label', 'object', 'option', 'select', 'source')
    etree.strip_tags(tree, 'article', 'center', 'div', 'main', 'span') # 'header', 'section', ...
    tree = prune_html(tree)
    cleaned_tree = convert_tags(tree)
    for elem in cleaned_tree.iter():
        elem.text = sanitize(elem.text)
        elem.tail = sanitize(elem.tail)
        # remove attributes, but keep them on the formatting elements
        # (the previous `!= 'del' or != 'hi'` test was always true)
        if elem.tag not in ('del', 'hi'):
            elem.attrib.clear()
        # finish table conversion
        if elem.tag == 'tr':
            elem.tag = 'row'
        elif elem.tag in ('td', 'th'):
            # the header role has to be recorded before the tag is renamed,
            # and after the attributes have been cleared
            if elem.tag == 'th':
                elem.set('role', 'head')
            elem.tag = 'cell'
    return cleaned_tree
开发者ID:adbar,项目名称:trafilatura,代码行数:31,代码来源:core.py

示例5: handle_lists

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def handle_lists(element):
    '''Build a cleaned copy of a list element from its 'item' descendants.

    Items without children are run through ``process_node``; items with
    children have each node of their subtree run through
    ``handle_textnode`` and the results gathered under one new 'item'.
    Every visited node is renamed to 'done' in the input tree.

    Returns the new list element, or None if it has no children or holds
    nothing but whitespace.
    '''
    result = etree.Element(element.tag)
    for item in element.iter('item'):
        new_item = etree.Element('item')
        if len(item) > 0:
            # proceed with iteration, fix for nested elements
            for node in item.iter():
                processed_sub = handle_textnode(node, comments_fix=False)
                # add child element to the new item
                if processed_sub is not None:
                    sub = etree.SubElement(new_item, processed_sub.tag)
                    sub.text = processed_sub.text
                    sub.tail = processed_sub.tail
                node.tag = 'done'
            # item.iter() also yields the item itself: unwrap that copy
            etree.strip_tags(new_item, 'item')
        else:
            processed = process_node(item)
            if processed is not None:
                new_item.text = processed.text
                new_item.tail = processed.tail
                result.append(new_item)
        if new_item.text or len(new_item) > 0:
            result.append(new_item)
        item.tag = 'done'
    if len(result) == 0:
        return None
    # only return the list if it carries non-whitespace text
    all_text = ''.join(result.itertext())
    if all_text and re.search(r'\S', all_text):
        return result
    return None
开发者ID:adbar,项目名称:trafilatura,代码行数:40,代码来源:core.py

示例6: recover_wild_paragraphs

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def recover_wild_paragraphs(tree, result_body, potential_tags=TAG_CATALOG):
    '''Look for all p-elements, including outside of the determined frame
       and throughout the document to recover potentially missing text parts.

       The tree returned by ``discard_unwanted`` has its 'a', 'link' and
       'span' tags unwrapped; every paragraph-like element in it is passed
       to ``handle_paragraphs`` and the non-None results are appended to
       ``result_body``, which is also returned.'''
    LOGGER.debug('Taking all p-elements')
    # prune
    search_tree = discard_unwanted(tree)
    etree.strip_tags(search_tree, 'a', 'link', 'span')
    # process everything first, then move the results into result_body
    processed_elems = [
        handle_paragraphs(element, potential_tags)
        for element in search_tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote')  # 'head', 'list'
    ]
    # explicit identity test: filter(None.__ne__, ...) depends on the
    # truthiness of NotImplemented, which newer Python versions reject
    result_body.extend([elem for elem in processed_elems if elem is not None])
    return result_body
开发者ID:adbar,项目名称:trafilatura,代码行数:12,代码来源:core.py

示例7: extract_comments

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def extract_comments(tree):
    '''Try and extract comments out of potential sections in the HTML.

    The expressions in COMMENTS_XPATH are tried in order. For each one the
    first matching subtree is pruned, has its 'a', 'link' and 'span' tags
    unwrapped, and its descendants are run through
    ``process_comments_node``. The search stops at the first expression
    that yields content, and that subtree is removed from ``tree``.

    Returns a tuple ``(comments_body, temp_comments, len(temp_comments),
    tree)`` where ``comments_body`` is a new 'body' element holding the
    extracted nodes and ``temp_comments`` is their trimmed, space-joined
    text.
    '''
    comments_body = etree.Element('body')
    # define iteration strategy
    potential_tags = set(TAG_CATALOG)  # 'span'
    # potential_tags.add('div') trouble with <div class="comment-author meta">
    for expr in COMMENTS_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted_comments(subtree)
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # extract content
        processed_elems = [process_comments_node(elem, potential_tags) for elem in subtree.xpath('.//*')]
        # explicit identity test: filter(None.__ne__, ...) depends on the
        # truthiness of NotImplemented, which newer Python versions reject
        comments_body.extend([elem for elem in processed_elems if elem is not None])
        # control
        if len(comments_body) > 0:  # if it has children
            LOGGER.debug(expr)
            # remove corresponding subtree
            subtree.getparent().remove(subtree)
            break
    # lengths
    temp_comments = trim(' '.join(comments_body.itertext()))
    return comments_body, temp_comments, len(temp_comments), tree
开发者ID:adbar,项目名称:trafilatura,代码行数:33,代码来源:core.py

示例8: _clean

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _clean(self):
        """
        Strip markup that gets in the way of parsing.

        ``strong`` tags are unwrapped throughout ``self.tree`` and every
        element carrying the ``pydocx-tab`` class has its tag dropped.
        """
        etree.strip_tags(self.tree, 'strong')
        tab_elements = self.tree.find_class('pydocx-tab')
        for tab in tab_elements:
            tab.drop_tag()
开发者ID:legco-watch,项目名称:legco-watch,代码行数:9,代码来源:agenda.py

示例9: element_text

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def element_text(node):
        """Strip comment nodes from ``node`` in place and return its ``text``."""
        # Removing the comments first lets text that was split by a comment
        # be merged back together before it is read.
        etree.strip_tags(node, etree.Comment)
        text = node.text
        return text
开发者ID:onelogin,项目名称:python3-saml,代码行数:5,代码来源:xml_utils.py

示例10: _load_from

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _load_from(self, data):
        """Parse XMP bytes into ``self._xmp``, optionally recovering from bad XML.

        ``data`` is the raw XMP packet as bytes; a blank packet is replaced
        by ``XMP_EMPTY`` before parsing.

        When ``self.overwrite_invalid_xml`` is true, increasingly lenient
        parsers are tried in turn and, as a last resort, the content is
        replaced by an empty XMP document. Otherwise only the plain parser
        is used and its errors propagate to the caller.

        After parsing, top-level processing instructions are stripped and
        ``self._get_rdf_root()`` is called. Returns None.
        """
        if data.strip() == b'':
            data = XMP_EMPTY  # on some platforms lxml chokes on empty documents

        def basic_parser(xml):
            # Plain parse with the default parser.
            return parse(BytesIO(xml))

        def strip_illegal_bytes_parser(xml):
            # Drop whatever re_xml_illegal_bytes matches, then parse normally.
            return parse(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))

        def recovery_parser(xml):
            # Parse with a parser created in recover mode.
            parser = XMLParser(recover=True)
            return parse(BytesIO(xml), parser)

        def replace_with_empty_xmp(_xml=None):
            # Last resort: log a warning and parse XMP_EMPTY; the argument is ignored.
            log.warning("Error occurred parsing XMP, replacing with empty XMP.")
            return basic_parser(XMP_EMPTY)

        if self.overwrite_invalid_xml:
            # Ordered from strictest to most forgiving.
            parsers = [
                basic_parser,
                strip_illegal_bytes_parser,
                recovery_parser,
                replace_with_empty_xmp,
            ]
        else:
            parsers = [basic_parser]

        for parser in parsers:
            try:
                self._xmp = parser(data)
            # NOTE(review): NeverRaise is presumably an exception type that is
            # never raised, so that errors propagate when overwriting is
            # disabled -- confirm against its definition.
            except (XMLSyntaxError if self.overwrite_invalid_xml else NeverRaise) as e:
                # These two messages short-circuit straight to the empty
                # replacement; any other syntax error falls through to the
                # next parser in the list.
                if str(e).startswith("Start tag expected, '<' not found") or str(
                    e
                ).startswith("Document is empty"):
                    self._xmp = replace_with_empty_xmp()
                    break
            else:
                # Parsed without error: stop trying further parsers.
                break

        try:
            # Strip the processing instructions found at document level.
            pis = self._xmp.xpath('/processing-instruction()')
            for pi in pis:
                etree.strip_tags(self._xmp, pi.tag)
            self._get_rdf_root()
        except (Exception if self.overwrite_invalid_xml else NeverRaise) as e:
            # Any failure here is logged and answered with an empty XMP
            # document when overwriting is enabled.
            log.warning("Error occurred parsing XMP", exc_info=e)
            self._xmp = replace_with_empty_xmp()
        return 
开发者ID:pikepdf,项目名称:pikepdf,代码行数:51,代码来源:metadata.py

示例11: xmltotxt

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def xmltotxt(xmloutput):
    '''Convert to plain text format.

    'hi' and 'link' elements are first folded into the surrounding text:
    their text and tail are appended to the tail of the previous sibling
    (or to the parent's text when there is none) and the element is
    removed. The remaining nodes are then serialized with tag-dependent
    separators and the joined string is passed through ``sanitize``.
    '''
    # remove hi/link and insert their text into the previous node
    for inline in xmloutput.xpath('//hi|//link'):
        container = inline.getparent()
        if container is None:
            continue
        merged = ' '.join(part for part in (inline.text, inline.tail) if part is not None)
        sibling = inline.getprevious()
        if sibling is None:
            # It's the first node in the parent, append to the parent's text
            if container.text is None:
                container.text = merged
            else:
                container.text = container.text + ' ' + merged
        elif sibling.tail is None:
            sibling.tail = merged
        else:
            # There is a previous node with a tail, append to it
            sibling.tail = sibling.tail + ' ' + merged
        container.remove(inline)
    # iterate and convert to list of strings
    block_tags = ('code', 'fw', 'head', 'lb', 'p', 'quote', 'row', 'table')
    chunks = []
    for node in xmloutput.iter():
        parts = [part for part in (node.text, node.tail) if part is not None]
        if not parts:
            # newlines for textless elements
            if node.tag in ('row', 'table'):
                chunks.append('\n')
            continue
        content = ' '.join(parts)
        tag = node.tag
        if tag in block_tags:
            chunks += ['\n', content, '\n']
        elif tag == 'item':
            chunks += ['\n- ', content, '\n']
        elif tag == 'cell':
            chunks += ['|', content, '|']
        elif tag == 'comments':
            # only a separator is emitted for this node
            chunks.append('\n\n')
        else:
            chunks += [content, ' ']
    return sanitize(''.join(chunks))
开发者ID:adbar,项目名称:trafilatura,代码行数:58,代码来源:xml.py


注:本文中的lxml.etree.strip_tags方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。