当前位置: 首页>>代码示例>>Python>>正文


Python ElementTree.iterfind方法代码示例

本文整理汇总了Python中lxml.etree.ElementTree.iterfind方法的典型用法代码示例。如果您正苦于以下问题:Python ElementTree.iterfind方法的具体用法?Python ElementTree.iterfind怎么用?Python ElementTree.iterfind使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lxml.etree.ElementTree的用法示例。


在下文中一共展示了ElementTree.iterfind方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Parser

# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 注意: iterfind 是 ElementTree 对象的方法, 无法单独导入, 需通过实例调用, 例如 tree.iterfind(path)
class Parser(object):
    """Read page metadata definitions from an XML template file.

    The template is searched for an ``OnePage`` element (carrying a ``page``
    attribute) and, optionally, a ``VariousPages`` element (carrying
    ``startpage`` and ``endpage`` attributes). Every ``metadata`` child of
    those elements contributes one dictionary entry per sub-element, keyed
    ``"<metadata id>_<sub-element tag>"``.
    """

    # Only plain ASCII digits are converted to ``int``; ``str.isdigit`` on
    # Python 3 also accepts characters such as superscripts that ``int``
    # rejects.
    _ASCII_DIGITS = '0123456789'

    def __init__(self, template_name):
        """Parse the template named *template_name* under ``TEMPLATE_PATH``."""
        self.template_name = template_name
        self.file_path = join(TEMPLATE_PATH, self.template_name)
        self.doc = ElementTree(file=self.file_path)

    @classmethod
    def _parse_value(cls, text):
        """Convert the text of one metadata sub-element into its stored value.

        Returns an ``int`` when *text* consists solely of ASCII digits,
        otherwise a list obtained by splitting *text* on commas. A literal
        backslash-n sequence in the template is turned into a real newline
        before splitting. An element without text yields ``['']``.
        """
        if text is None:
            # Empty element such as <tag/>: treat it as an empty string
            # instead of failing on a method call on None.
            text = ''
        elif not isinstance(text, str):
            # Python 2 only: non-ASCII text arrives as ``unicode`` and is
            # encoded to a UTF-8 byte string as before. On Python 3 the text
            # is already ``str`` and must stay that way, otherwise the
            # substring test below would mix ``str`` and ``bytes``.
            text = text.encode('utf-8')

        if text and all(char in cls._ASCII_DIGITS for char in text):
            return int(text)
        if '\\n' in text:
            text = text.replace('\\n', '\n')
        return text.split(',')

    def _collect_metadata(self, section, target):
        """Add every ``<section>/metadata`` sub-element to *target* in place."""
        for elem in self.doc.iterfind(section + '/metadata'):
            for sub_elem in elem:
                key = elem.attrib['id'] + '_' + sub_elem.tag
                target[key] = self._parse_value(sub_elem.text)

    def _onepage_metadata(self):
        """Return (and store in ``self.onepage_dict``) the ``OnePage`` data.

        The result always holds ``'page'`` (the integer ``page`` attribute)
        plus one entry per metadata sub-element.
        """
        self.onepage_dict = {}
        self.onepage_dict['page'] = int(self.doc.find('OnePage').attrib['page'])
        self._collect_metadata('OnePage', self.onepage_dict)
        return self.onepage_dict

    def _variouspages_metadata(self):
        """Return (and store in ``self.variouspages_dict``) ``VariousPages`` data.

        The result is empty when the template has no ``VariousPages``
        element; otherwise it holds ``'pages'`` as ``[startpage, endpage]``
        plus one entry per metadata sub-element.
        """
        self.variouspages_dict = {}
        various_pages = self.doc.find('VariousPages')
        if various_pages is not None:
            first_page = int(various_pages.attrib['startpage'])
            end_page = int(various_pages.attrib['endpage'])
            self.variouspages_dict['pages'] = [first_page, end_page]
            self._collect_metadata('VariousPages', self.variouspages_dict)
        return self.variouspages_dict

    def xml_template_metadata(self):
        """Return the ``OnePage`` and ``VariousPages`` metadata merged together.

        The merge is done in place on the ``OnePage`` dictionary, so the
        returned object is ``self.onepage_dict``; on a key clash the
        ``VariousPages`` entry wins.
        """
        all_template_metadata = self._onepage_metadata()
        all_template_metadata.update(self._variouspages_metadata())
        return all_template_metadata
开发者ID:nsi-iff,项目名称:nsi.metadataextractor,代码行数:54,代码来源:xml_parser.py

示例2: extract_lemmatized_parse_trees

# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 注意: iterfind 是 ElementTree 对象的方法, 无法单独导入, 需通过实例调用, 例如 tree.iterfind(path)
def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Write lemmatized parse trees (PTB labeled bracket structures) taken from
    Stanford CoreNLP XML output.

    For every ``*.xml`` file selected from *scnlp_files*, a ``.parse`` file
    is created in *parse_dir* holding one line per ``sentence`` element: the
    sentence's parse string, split on single spaces, with each piece passed
    through ``_lemmatized_node`` together with the sentence's lemma iterator.
    """
    make_dir(parse_dir)

    for xml_path in file_list(scnlp_files, "*.xml"):
        tree = ElementTree(file=xml_path)

        out_path = derive_path(xml_path, new_dir=parse_dir, new_ext='.parse')
        log.info("writing " + out_path)

        with open(out_path, "wt", encoding="utf-8") as out:
            for sentence in tree.iterfind(".//sentence"):
                # A single lemma iterator is shared by all nodes of the
                # sentence, so nodes must be processed strictly in order.
                lemma_iter = sentence.iterfind("tokens/token/lemma")
                bracketed = sentence.find("parse").text.strip()
                rewritten = [_lemmatized_node(piece, lemma_iter)
                             for piece in bracketed.split(" ")]
                out.write(" ".join(rewritten) + "\n")
开发者ID:OC-NTNU,项目名称:baleen-python,代码行数:24,代码来源:scnlp.py

示例3: __init__

# 需要导入模块: from lxml.etree import ElementTree [as 别名]
# 注意: iterfind 是 ElementTree 对象的方法, 无法单独导入, 需通过实例调用, 例如 tree.iterfind(path)
    def __init__(self, file):
        """Open a zipped document and normalise its merge fields.

        *file* is handed to ``ZipFile``. Every part listed in
        ``[Content_Types].xml`` whose content type is in ``CONTENT_TYPES`` is
        parsed and stored in ``self.parts`` keyed by its ``ZipInfo``. Each
        part is then rewritten in place so that merge fields, whether given
        as ``w:fldSimple`` or as a ``w:fldChar`` begin / ``w:instrText`` /
        ``w:fldChar`` end run sequence, end up as a ``w:t`` element carrying
        a ``merge-field-name`` attribute.

        NOTE(review): ``_extract_mailmerge_instr`` is defined elsewhere; it is
        presumably returning the field name or a falsy value - confirm.
        """
        self.zip = ZipFile(file)
        self.parts = {}

        # Locate the document parts of interest through the package's
        # content-type overrides.
        content_types = ElementTree(file=self.zip.open('[Content_Types].xml'))
        for file in content_types.iterfind('ct:Override', namespaces=NAMESPACES):
            type = file.attrib['ContentType']
            if type in CONTENT_TYPES:
                # Keep only what follows the first '/' of PartName before
                # looking the member up in the archive.
                fn = file.attrib['PartName'].split('/', 1)[1]
                zi = self.zip.getinfo(fn)
                self.parts[zi] = ElementTree(file=self.zip.open(zi))

        for part in self.parts.values():
            # Remove attribute that soft-links to other namespaces; other namespaces
            # are not used, so would cause word to throw an error.
            ignorable_key = '{%(mc)s}Ignorable' % NAMESPACES
            if ignorable_key in part.getroot().attrib:
                part.getroot().attrib.pop(ignorable_key)

            # Pass 1: simple fields. Each w:fldSimple is replaced by its first
            # w:r child, whose w:t is tagged with the merge field name.
            for parent in part.iterfind('.//w:fldSimple/..', namespaces=NAMESPACES):

                for idx, child in enumerate(parent.iterfind("w:fldSimple", namespaces=NAMESPACES)):
                    instr = child.xpath('@w:instr', namespaces=NAMESPACES)[0]

                    fieldName = _extract_mailmerge_instr(instr)

                    if not fieldName:
                        # Do not fail if no MERGEFIELD found
                        continue

                    # Extract original w:r structure to preserve formatting
                    childspan = child.xpath('w:r', namespaces=NAMESPACES)[0]
                    childtext = _first(childspan.xpath('w:t', namespaces=NAMESPACES))
                    if childtext is None:
                        # The run has no text element yet: create one to
                        # carry the field marker.
                        childtext = Element("{%(w)s}t" % NAMESPACES)
                        childspan.append(childtext)
                    childtext.set("{%(xml)s}space" % NAMESPACES, "preserve")
                    childtext.set("{%(int)s}merge-field-name" % NAMESPACES, fieldName)
                    # Put the run where the w:fldSimple was, then drop the
                    # w:fldSimple wrapper.
                    parent.insert(parent.index(child),childspan)
                    parent.remove(child)

            # Eliminate duplicate iteration with set

            # Pass 2: complex fields spread over sibling elements (begin
            # marker, one or more w:instrText carriers, end marker).
            for parent in set(part.iterfind('.//w:instrText/../..', namespaces=NAMESPACES)):
                # state machine: capture status, capture begin, captured string + element
                capturing = False
                idx_begin = None
                elem_instr = None
                capturedInstr = ""

                # Iterate over a snapshot because children are removed from
                # parent while a field is being processed.
                for elem in list(parent):
                    if not capturing:
                        # Skip everything until a field "begin" marker shows up.
                        if elem.find('./w:fldChar[@w:fldCharType="begin"]', namespaces=NAMESPACES) is None:
                            continue
                        idx_begin = parent.index(elem)
                        capturing = True
                    else:
                        if elem.find("./w:instrText", namespaces=NAMESPACES) is not None:
                            if elem_instr is None:
                                # The first instruction carrier is the element
                                # that survives and receives the w:t marker.
                                elem_instr = elem
                            else:
                                if (elem_instr.find("./w:rPr", namespaces=NAMESPACES) is not None and
                                    elem.find("./w:rPr", namespaces=NAMESPACES) is not None):
                                    if (etree.tostring(elem_instr.find("./w:rPr", namespaces=NAMESPACES)) != 
                                        etree.tostring(elem.find("./w:rPr", namespaces=NAMESPACES))):
                                        warnings.warn("Found inconsistent styling across two w:instrText tags. Only the first style will be applied.", 
                                            RuntimeWarning)

                            # Instruction text may be split over several
                            # elements; concatenate the pieces.
                            capturedInstr += elem.find("./w:instrText", namespaces=NAMESPACES).text
                        elif elem.find('./w:fldChar[@w:fldCharType="end"]', namespaces=NAMESPACES) is not None:
                            # Process field!
                            if elem_instr is not None:
                                fieldName = _extract_mailmerge_instr(capturedInstr)
                                if fieldName:
                                    elem_instr.remove(elem_instr.find("./w:instrText", namespaces=NAMESPACES))

                                    textFld = elem_instr.find("w:t", namespaces=NAMESPACES)
                                    if textFld is None:
                                        textFld = Element("{%(w)s}t" % NAMESPACES)
                                        textFld.set("{%(xml)s}space" % NAMESPACES, "preserve")
                                        elem_instr.append(textFld)

                                    textFld.set("{%(int)s}merge-field-name" % NAMESPACES, fieldName)

                                    idx_instr = parent.index(elem_instr)
                                    idx_end = parent.index(elem)

                                    # Must be a chain of two lists or one list, because otherwise the iteration will fail due to the deletion
                                    # Removes every sibling from the begin
                                    # marker through the end marker except
                                    # elem_instr itself.
                                    for elem in itertools.chain([parent[i] for i in range(idx_begin, idx_instr)], [parent[i] for i in range(idx_instr + 1, idx_end + 1)]):
                                        parent.remove(elem)
                            else:
                                warnings.warn("No w:instrText found between fldChar begin and end. Ignored", RuntimeWarning)

                            # Cleanup
                            capturing = False
                            idx_begin = None
                            elem_instr = None
                            capturedInstr = ""
                        else:
                            continue
#.........这里部分代码省略.........
开发者ID:brendan-sterne,项目名称:docx-mailmerge,代码行数:103,代码来源:mailmerge.py


注:本文中的lxml.etree.ElementTree.iterfind方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。