本文整理汇总了Python中lxml.etree.strip_tags方法的典型用法代码示例。如果您正苦于以下问题:Python etree.strip_tags方法的具体用法?Python etree.strip_tags怎么用?Python etree.strip_tags使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.etree
的用法示例。
在下文中一共展示了etree.strip_tags方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _retain_only_pars
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _retain_only_pars(tree):
    """Reduce *tree* to title and paragraph content, in place.

    Every <title> element is first renamed to <p>; any remaining
    non-<p> tag is then stripped from its parent (its text is merged
    upward), so a pre-order traversal afterwards yields paragraphs in
    document order.

    Parameters
    ----------
    tree : :py:class:`lxml.etree._Element`
        etree element for valid NLM XML
    """
    # Pass 1: promote titles to paragraphs so they survive the strip.
    for node in tree.xpath('.//*'):
        if node.tag == 'title':
            node.tag = 'p'
    # Pass 2: strip every remaining non-paragraph tag from its parent.
    for node in tree.xpath('.//*'):
        parent = node.getparent()
        if parent is not None and node.tag != 'p':
            etree.strip_tags(parent, node.tag)
示例2: handle_quotes
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def handle_quotes(element):
    """Rebuild a quote element from the processed text of its subtree.

    Returns a new element carrying one child per successfully processed
    node (text and tail copied over), or ``None`` when nothing usable
    was found. Visited source nodes are renamed to 'done' in place so
    later passes skip them.
    """
    rebuilt = etree.Element(element.tag)
    for node in element.iter():
        result = process_node(node)  # handle_textnode(node, comments_fix=True)
        if result is not None:
            copy_elem = etree.SubElement(rebuilt, node.tag)
            copy_elem.text = result.text
            copy_elem.tail = result.tail
        # mark as handled regardless of outcome
        node.tag = 'done'
    if len(rebuilt) == 0:
        return None
    # avoid double/nested quote tags
    etree.strip_tags(rebuilt, 'quote')
    return rebuilt
示例3: element_text
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def element_text(node):
    """Return the node's text after removing embedded XML comments in place."""
    ET.strip_tags(node, ET.Comment)
    return node.text
示例4: sanitize_tree
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def sanitize_tree(tree):
    """Sanitize the output from the generic algorithm.

    Strips media/form elements, flattens layout containers, prunes and
    converts tags, then cleans the text of every element, removes
    attributes, and finishes table conversion (tr -> row, td/th -> cell
    with th cells marked role="head").

    Parameters
    ----------
    tree : lxml element tree produced by the generic extraction step.

    Returns
    -------
    The cleaned tree.
    """
    # drop elements whose content is never useful text
    etree.strip_elements(tree, 'audio', 'fieldset', 'iframe', 'image', 'label', 'object', 'option', 'select', 'source')
    # flatten layout containers, keeping their text
    etree.strip_tags(tree, 'article', 'center', 'div', 'main', 'span')  # 'header', 'section', ...
    tree = prune_html(tree)
    cleaned_tree = convert_tags(tree)
    for elem in cleaned_tree.iter():
        elem.text = sanitize(elem.text)
        elem.tail = sanitize(elem.tail)
        # Remove attributes except on formatting tags that carry meaning.
        # BUGFIX: the original condition used `or` (`!= 'del' or != 'hi'`),
        # which is always true, so attributes were cleared everywhere.
        if elem.tag not in ('del', 'hi'):
            elem.attrib.clear()
        # finish table conversion
        if elem.tag == 'tr':
            elem.tag = 'row'
        elif elem.tag in ('td', 'th'):
            # BUGFIX: mark header cells while the tag is still 'th'.
            # The original checked for 'th' *after* renaming to 'cell'
            # (dead branch) and called .set() on an undefined `newsub`.
            if elem.tag == 'th':
                elem.set('role', 'head')
            elem.tag = 'cell'
    return cleaned_tree
示例5: handle_lists
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def handle_lists(element):
    """Rebuild a list element, keeping only the text of its <item> children.

    Returns a new element with the same tag as *element*, containing one
    <item> per usable original item, or ``None`` when the rebuilt list has
    no non-whitespace text. Processed source nodes are renamed to 'done'
    so later passes skip them — the input tree is modified in place.
    """
    processed_element = etree.Element(element.tag)
    for child in element.iter('item'):
        newchildelem = etree.Element('item')
        if len(child) == 0:
            # leaf item: copy its processed text/tail directly
            processed_child = process_node(child)
            if processed_child is not None:
                newchildelem.text = processed_child.text
                newchildelem.tail = processed_child.tail
                processed_element.append(newchildelem)
        else:
            # item with nested elements: walk the whole subtree
            # (note: child.iter() yields the item itself first)
            for subelem in child.iter():
                processed_subchild = handle_textnode(subelem, comments_fix=False)  # process_node(subelem)
                # add the processed node to the rebuilt item
                if processed_subchild is not None:
                    subchildelem = etree.SubElement(newchildelem, processed_subchild.tag)
                    subchildelem.text = processed_subchild.text
                    subchildelem.tail = processed_subchild.tail
                # mark the source node as handled
                subelem.tag = 'done'
            # flatten nested <item> tags produced by the subtree walk
            etree.strip_tags(newchildelem, 'item')
            # keep the item only if it gained text or children
            if newchildelem.text or len(newchildelem) > 0:
                processed_element.append(newchildelem)
        child.tag = 'done'
    if len(processed_element) > 0:  # if it has children
        # keep the list only if it carries visible (non-whitespace) text
        teststring = ''.join(processed_element.itertext())
        if teststring and re.search(r'\S', teststring):
            return processed_element
    return None
示例6: recover_wild_paragraphs
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def recover_wild_paragraphs(tree, result_body, potential_tags=TAG_CATALOG):
    '''Look for all p-elements, including outside of the determined frame
    and throughout the document to recover potentially missing text parts'''
    LOGGER.debug('Taking all p-elements')
    # prune the tree before scanning it
    search_tree = discard_unwanted(tree)
    etree.strip_tags(search_tree, 'a', 'link', 'span')
    # collect every paragraph-like element in document order; 'head', 'list'
    for candidate in search_tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        handled = handle_paragraphs(candidate, potential_tags)
        if handled is not None:
            result_body.append(handled)
    return result_body
示例7: extract_comments
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def extract_comments(tree):
    """Try to extract comments out of potential sections in the HTML.

    Walks the comment XPath expressions in order, takes the first match
    whose processed content is non-empty, removes that subtree from the
    document, and returns (comments element, trimmed text, text length,
    modified tree).
    """
    comments_body = etree.Element('body')
    # define iteration strategy
    potential_tags = set(TAG_CATALOG)  # 'span'
    # potential_tags.add('div') trouble with <div class="comment-author meta">
    for expr in COMMENTS_XPATH:
        matches = tree.xpath(expr)
        if not matches:
            continue
        # prune the first matching subtree
        subtree = discard_unwanted_comments(matches[0])
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # extract content from every descendant
        for elem in subtree.xpath('.//*'):
            handled = process_comments_node(elem, potential_tags)
            if handled is not None:
                comments_body.append(handled)
        # stop at the first expression that produced content
        if len(comments_body) > 0:
            LOGGER.debug(expr)
            # remove the consumed subtree from the document
            subtree.getparent().remove(subtree)
            break
    # lengths
    temp_comments = trim(' '.join(comments_body.itertext()))
    return comments_body, temp_comments, len(temp_comments), tree
示例8: _clean
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _clean(self):
    """Remove extraneous markup from ``self.tree`` to make parsing easier."""
    # inline <strong> wrappers add nothing for parsing
    etree.strip_tags(self.tree, 'strong')
    # drop pydocx tab markers, keeping their text content
    for tab_node in self.tree.find_class('pydocx-tab'):
        tab_node.drop_tag()
示例9: element_text
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def element_text(node):
    """Return the node's text content, ignoring embedded XML comments.

    The comments are stripped from *node* in place before reading.
    """
    etree.strip_tags(node, etree.Comment)
    return node.text
示例10: _load_from
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def _load_from(self, data):
    """Parse raw XMP bytes into ``self._xmp``.

    When ``self.overwrite_invalid_xml`` is set, progressively more
    forgiving parsers are tried, and unrecoverable input is replaced
    with an empty XMP packet instead of raising; otherwise only the
    strict parser runs and errors propagate (``NeverRaise`` is
    presumably an exception type that is never raised — verify).
    """
    if data.strip() == b'':
        data = XMP_EMPTY  # on some platforms lxml chokes on empty documents

    def basic_parser(xml):
        # strict parse of the bytes as-is
        return parse(BytesIO(xml))

    def strip_illegal_bytes_parser(xml):
        # retry after removing bytes that are illegal in XML
        return parse(BytesIO(re_xml_illegal_bytes.sub(b'', xml)))

    def recovery_parser(xml):
        # last resort: lxml's recovering parser accepts malformed XML
        parser = XMLParser(recover=True)
        return parse(BytesIO(xml), parser)

    def replace_with_empty_xmp(_xml=None):
        log.warning("Error occurred parsing XMP, replacing with empty XMP.")
        return basic_parser(XMP_EMPTY)

    # Parsers are tried strictest-first; fallbacks only when the caller
    # allows invalid XML to be overwritten.
    if self.overwrite_invalid_xml:
        parsers = [
            basic_parser,
            strip_illegal_bytes_parser,
            recovery_parser,
            replace_with_empty_xmp,
        ]
    else:
        parsers = [basic_parser]
    for parser in parsers:
        try:
            self._xmp = parser(data)
        except (XMLSyntaxError if self.overwrite_invalid_xml else NeverRaise) as e:
            # "Effectively empty document" errors short-circuit straight
            # to the empty-XMP replacement; other syntax errors fall
            # through to the next, more forgiving parser.
            if str(e).startswith("Start tag expected, '<' not found") or str(
                e
            ).startswith("Document is empty"):
                self._xmp = replace_with_empty_xmp()
                break
        else:
            break
    try:
        # Strip processing instructions so only the payload remains,
        # then locate the RDF root.
        pis = self._xmp.xpath('/processing-instruction()')
        for pi in pis:
            etree.strip_tags(self._xmp, pi.tag)
        self._get_rdf_root()
    except (Exception if self.overwrite_invalid_xml else NeverRaise) as e:
        log.warning("Error occurred parsing XMP", exc_info=e)
        self._xmp = replace_with_empty_xmp()
    return
示例11: xmltotxt
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import strip_tags [as 别名]
def xmltotxt(xmloutput):
    """Convert an XML output tree to plain text format.

    First folds inline <hi> and <link> elements into the surrounding
    text, then serializes the remaining elements with tag-dependent
    separators (newlines for block tags, pipes for cells, dashes for
    list items) and returns the sanitized result.
    """
    pieces = []
    # fold <hi>/<link> into the previous sibling's tail or parent's text
    for inline in xmloutput.xpath('//hi|//link'):
        parent = inline.getparent()
        if parent is None:
            continue
        merged = ' '.join(part for part in (inline.text, inline.tail) if part is not None)
        sibling = inline.getprevious()
        if sibling is not None:
            # there is a previous node: append text to its tail
            sibling.tail = merged if sibling.tail is None else sibling.tail + ' ' + merged
        else:
            # first node in <parent/>: append to the parent's text
            parent.text = merged if parent.text is None else parent.text + ' ' + merged
        parent.remove(inline)
    # iterate and convert to a list of strings
    for node in xmloutput.iter():
        if node.text is None and node.tail is None:
            # textless elements only contribute table line breaks
            if node.tag in ('row', 'table'):
                pieces.append('\n')
            continue
        textelement = ' '.join(part for part in (node.text, node.tail) if part is not None)
        if node.tag in ('code', 'fw', 'head', 'lb', 'p', 'quote', 'row', 'table'):
            pieces.extend(['\n', textelement, '\n'])
        elif node.tag == 'item':
            pieces.extend(['\n- ', textelement, '\n'])
        elif node.tag == 'cell':
            pieces.extend(['|', textelement, '|'])
        elif node.tag == 'comments':
            # comments contribute only a separator, never their text
            pieces.append('\n\n')
        else:
            pieces.extend([textelement, ' '])
    return sanitize(''.join(pieces))