本文整理汇总了Python中lxml.etree.ElementTree.iterfind方法的典型用法代码示例。如果您正苦于以下问题:Python ElementTree.iterfind方法的具体用法?Python ElementTree.iterfind怎么用?Python ElementTree.iterfind使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.etree.ElementTree的用法示例。
在下文中一共展示了ElementTree.iterfind方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Parser
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import iterfind [as 别名]
class Parser(object):
    """Read page metadata out of an XML template file.

    The methods below look for an ``OnePage`` element carrying a ``page``
    attribute and, optionally, a ``VariousPages`` element carrying
    ``startpage`` / ``endpage`` attributes.  ``metadata`` children of either
    element are flattened into dictionary entries keyed
    ``"<metadata id>_<sub-element tag>"``.
    """

    def __init__(self, template_name):
        """Parse the template named *template_name* located in TEMPLATE_PATH."""
        self.template_name = template_name
        self.file_path = join(TEMPLATE_PATH, self.template_name)
        self.doc = ElementTree(file=self.file_path)

    @staticmethod
    def _parse_value(text):
        """Convert the text of one metadata sub-element into a Python value.

        * All-digit text becomes an ``int``.
        * Anything else has literal ``\\n`` sequences (backslash + ``n``)
          turned into real newlines and is then split on commas into a list.
        * Missing text (an empty element) is treated as the empty string.

        The text is kept as text rather than UTF-8 encoded: mixing the
        encoded ``bytes`` with ``str`` arguments raises ``TypeError`` on
        Python 3.
        """
        text = text or ''
        if text.isdigit():
            try:
                return int(text)
            except ValueError:
                # Unicode digits such as superscripts satisfy isdigit() but
                # are not valid int() input; fall through to the list form.
                pass
        return text.replace('\\n', '\n').split(',')

    def _collect_metadata(self, path, target):
        """Store every ``metadata`` sub-element matched by *path* in *target*."""
        for elem in self.doc.iterfind(path):
            for sub_elem in elem:
                key = elem.attrib['id'] + '_' + sub_elem.tag
                target[key] = self._parse_value(sub_elem.text)

    def _onepage_metadata(self):
        """Return (and keep in ``self.onepage_dict``) the ``OnePage`` metadata.

        The result always contains ``'page'`` as an ``int``.  Raises
        ``AttributeError`` when the document has no ``OnePage`` element and
        ``KeyError`` when a required attribute is absent.
        """
        self.onepage_dict = {
            'page': int(self.doc.find('OnePage').attrib['page']),
        }
        self._collect_metadata('OnePage/metadata', self.onepage_dict)
        return self.onepage_dict

    def _variouspages_metadata(self):
        """Return (and keep in ``self.variouspages_dict``) the ``VariousPages`` metadata.

        When the element exists the result contains ``'pages'`` as
        ``[startpage, endpage]``; when it does not, the result is empty.
        """
        self.variouspages_dict = {}
        various_pages = self.doc.find('VariousPages')
        if various_pages is not None:
            first_page = int(various_pages.attrib['startpage'])
            end_page = int(various_pages.attrib['endpage'])
            self.variouspages_dict['pages'] = [first_page, end_page]
            self._collect_metadata('VariousPages/metadata',
                                   self.variouspages_dict)
        return self.variouspages_dict

    def xml_template_metadata(self):
        """Return the ``OnePage`` and ``VariousPages`` metadata merged together.

        The merge is done in place, so the returned dictionary is the same
        object as ``self.onepage_dict``; ``VariousPages`` entries win on a
        key clash.
        """
        onepage_metadata = self._onepage_metadata()
        variouspages_metadata = self._variouspages_metadata()
        onepage_metadata.update(variouspages_metadata)
        return onepage_metadata
示例2: extract_lemmatized_parse_trees
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import iterfind [as 别名]
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Write lemmatized parse trees (PTB labeled bracket structures) taken
    from Stanford CoreNLP XML output.

    For every ``*.xml`` file selected by *scnlp_files*, a ``.parse`` file is
    created in *parse_dir* holding one line per ``sentence`` element.
    """
    make_dir(parse_dir)

    for xml_path in file_list(scnlp_files, "*.xml"):
        tree = ElementTree(file=xml_path)
        out_path = derive_path(xml_path, new_dir=parse_dir, new_ext='.parse')
        log.info("writing " + out_path)

        with open(out_path, "wt", encoding="utf-8") as out:
            for sentence in tree.iterfind(".//sentence"):
                # A single iterator is handed to every node of the sentence;
                # presumably the helper advances it once per token -- confirm
                # against _lemmatized_node.
                lemma_iter = sentence.iterfind("tokens/token/lemma")
                bracketed = sentence.find("parse").text.strip()

                pieces = []
                for node in bracketed.split(" "):
                    pieces.append(_lemmatized_node(node, lemma_iter))

                out.write(" ".join(pieces) + "\n")
示例3: __init__
# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 或者: from lxml.etree.ElementTree import iterfind [as 别名]
def __init__(self, file):
self.zip = ZipFile(file)
self.parts = {}
content_types = ElementTree(file=self.zip.open('[Content_Types].xml'))
for file in content_types.iterfind('ct:Override', namespaces=NAMESPACES):
type = file.attrib['ContentType']
if type in CONTENT_TYPES:
fn = file.attrib['PartName'].split('/', 1)[1]
zi = self.zip.getinfo(fn)
self.parts[zi] = ElementTree(file=self.zip.open(zi))
for part in self.parts.values():
# Remove attribute that soft-links to other namespaces; other namespaces
# are not used, so would cause word to throw an error.
ignorable_key = '{%(mc)s}Ignorable' % NAMESPACES
if ignorable_key in part.getroot().attrib:
part.getroot().attrib.pop(ignorable_key)
for parent in part.iterfind('.//w:fldSimple/..', namespaces=NAMESPACES):
for idx, child in enumerate(parent.iterfind("w:fldSimple", namespaces=NAMESPACES)):
instr = child.xpath('@w:instr', namespaces=NAMESPACES)[0]
fieldName = _extract_mailmerge_instr(instr)
if not fieldName:
# Do not fail if no MERGEFIELD found
continue
# Extract original w:r structure to preserve formatting
childspan = child.xpath('w:r', namespaces=NAMESPACES)[0]
childtext = _first(childspan.xpath('w:t', namespaces=NAMESPACES))
if childtext is None:
childtext = Element("{%(w)s}t" % NAMESPACES)
childspan.append(childtext)
childtext.set("{%(xml)s}space" % NAMESPACES, "preserve")
childtext.set("{%(int)s}merge-field-name" % NAMESPACES, fieldName)
parent.insert(parent.index(child),childspan)
parent.remove(child)
# Eliminate duplicate iteration with set
for parent in set(part.iterfind('.//w:instrText/../..', namespaces=NAMESPACES)):
# state machine: capture status, capture begin, captured string + element
capturing = False
idx_begin = None
elem_instr = None
capturedInstr = ""
for elem in list(parent):
if not capturing:
if elem.find('./w:fldChar[@w:fldCharType="begin"]', namespaces=NAMESPACES) is None:
continue
idx_begin = parent.index(elem)
capturing = True
else:
if elem.find("./w:instrText", namespaces=NAMESPACES) is not None:
if elem_instr is None:
elem_instr = elem
else:
if (elem_instr.find("./w:rPr", namespaces=NAMESPACES) is not None and
elem.find("./w:rPr", namespaces=NAMESPACES) is not None):
if (etree.tostring(elem_instr.find("./w:rPr", namespaces=NAMESPACES)) !=
etree.tostring(elem.find("./w:rPr", namespaces=NAMESPACES))):
warnings.warn("Found inconsistent styling across two w:instrText tags. Only the first style will be applied.",
RuntimeWarning)
capturedInstr += elem.find("./w:instrText", namespaces=NAMESPACES).text
elif elem.find('./w:fldChar[@w:fldCharType="end"]', namespaces=NAMESPACES) is not None:
# Process field!
if elem_instr is not None:
fieldName = _extract_mailmerge_instr(capturedInstr)
if fieldName:
elem_instr.remove(elem_instr.find("./w:instrText", namespaces=NAMESPACES))
textFld = elem_instr.find("w:t", namespaces=NAMESPACES)
if textFld is None:
textFld = Element("{%(w)s}t" % NAMESPACES)
textFld.set("{%(xml)s}space" % NAMESPACES, "preserve")
elem_instr.append(textFld)
textFld.set("{%(int)s}merge-field-name" % NAMESPACES, fieldName)
idx_instr = parent.index(elem_instr)
idx_end = parent.index(elem)
# Must be a chain of two lists or one list, because otherwise the iteration will fail due to the deletion
for elem in itertools.chain([parent[i] for i in range(idx_begin, idx_instr)], [parent[i] for i in range(idx_instr + 1, idx_end + 1)]):
parent.remove(elem)
else:
warnings.warn("No w:instrText found between fldChar begin and end. Ignored", RuntimeWarning)
# Cleanup
capturing = False
idx_begin = None
elem_instr = None
capturedInstr = ""
else:
continue
#.........这里部分代码省略.........