当前位置: 首页>>代码示例>>Python>>正文


Python etree.iterparse方法代码示例

本文整理汇总了Python中lxml.etree.iterparse方法的典型用法代码示例。如果您正苦于以下问题:Python etree.iterparse方法的具体用法?Python etree.iterparse怎么用?Python etree.iterparse使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lxml.etree的用法示例。


在下文中一共展示了etree.iterparse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def parse(fp):
    """Stream the ``row`` elements of a StackExchange data-dump XML file.

    The document is read incrementally with ``etree.iterparse`` and every
    element is released as soon as its ``end`` event has been handled, so
    the whole tree is never held in memory at once.

    Args:
        fp: Source handed straight to ``etree.iterparse`` (a filename or a
            binary file-like object).

    Yields:
        dict: A snapshot of the attributes of each ``row`` element.

    Raises:
        AssertionError: If a ``row`` element carries text content.
    """
    context = etree.iterparse(fp, events=('end',))

    for action, elem in context:
        if elem.tag == 'row':
            # Rows are expected to be attribute-only.
            assert elem.text is None, "The row wasn't empty"
            # Yield a copy: ``elem.attrib`` is a live view of the element and
            # the ``elem.clear()`` below wipes it, which would leave callers
            # that keep the rows (e.g. ``list(parse(fp))``) with empty data.
            yield dict(elem.attrib)

        # Cleanup, done for every element that has ended.
        # First empty the element itself. This is not strictly necessary
        # when siblings are deleted as well, but it frees memory earlier.
        elem.clear()
        # Second, delete the already processed previous siblings (records).
        # The root element has no parent, yet it can still have a preceding
        # comment or processing instruction; skip the deletion in that case
        # instead of failing on ``del None[0]``.
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
        # Make sure no references to Element objects survive outside the loop.
开发者ID:Networks-Learning,项目名称:stackexchange-dump-to-postgres,代码行数:24,代码来源:row_processor.py

示例2: _g_process_et_items

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _g_process_et_items(path, tag) -> Iterable[Tuple]:
    """
    Generator: walk the XML source at ``path`` and yield one
    ``(event, element)`` pair per ``end`` event of an element whose tag
    matches ``tag``.

    The element of a pair is cleared as soon as the consumer asks for the
    next pair, so it must be fully processed before advancing the generator.
    """
    parser = etree.iterparse(path, events=('end',), tag=tag)

    for parsed in parser:
        event, node = parsed
        yield event, node

        # The consumer is done with this node: drop its content so it does
        # not stay in memory.
        node.clear()
开发者ID:Wikidata,项目名称:soweego,代码行数:18,代码来源:discogs_dump_extractor.py

示例3: get_parser

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def get_parser(filename):
    """Yield ``(title, text)`` for every main-namespace page of a wiki dump.

    The bz2-compressed MediaWiki XML export at ``filename`` is decompressed
    and parsed incrementally; each ``page`` element is cleared once it has
    been handled.

    Args:
        filename: Path of the ``.bz2`` compressed XML export
            (export schema version 0.10).

    Yields:
        tuple: ``(title, text)`` of each page whose ``<ns>`` is ``'0'``.
        Pages lacking a ``<ns>``, ``<title>``, ``<revision>`` or ``<text>``
        element are skipped.
    """
    ns_token        = '{http://www.mediawiki.org/xml/export-0.10/}ns'
    title_token     = '{http://www.mediawiki.org/xml/export-0.10/}title'
    revision_token  = '{http://www.mediawiki.org/xml/export-0.10/}revision'
    text_token      = '{http://www.mediawiki.org/xml/export-0.10/}text'

    # 'rb', not 'r+b': BZ2File only accepts plain read/write/append modes and
    # raises ValueError for anything else on Python 3.
    with bz2.BZ2File(filename, 'rb') as bz2_file:
        for event, element in etree.iterparse(bz2_file, events=('end',)):
            if not element.tag.endswith('page'):
                continue

            namespace_tag = element.find(ns_token)
            if namespace_tag is not None and namespace_tag.text == '0':
                title_tag = element.find(title_token)
                revision_tag = element.find(revision_token)
                text_tag = (revision_tag.find(text_token)
                            if revision_tag is not None else None)
                # find() returns None for a missing child; skip incomplete
                # pages rather than crash in the middle of a long dump.
                if title_tag is not None and text_tag is not None:
                    yield title_tag.text, text_tag.text

            # The page has been handled: release its content.
            element.clear()
开发者ID:marklit,项目名称:airline-passenger-counts,代码行数:19,代码来源:app.py

示例4: parse

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def parse(self, xml):
    """Parse an XML API response by dispatching to per-tag handler methods.

    The document is walked with ``start`` and ``end`` events. For each
    event the element tag is passed through ``self._remove_ns`` and stored
    on ``self.tag``; ``self._get_func`` then supplies a handler name. The
    handler is only invoked when that name is defined on ``XMLAPIParser``:
    with ``(elem, result)`` for ``start`` events and with no arguments for
    ``end`` events.

    :param xml: the response body as bytes (it is wrapped in
        ``six.BytesIO``).
    :returns: the ``result`` dict (keys ``type``, ``taskId``,
        ``maxSeverity``, ``objects``, ``problems``) after the handlers have
        run on it.
    """
    result = {
        'type': None,
        'taskId': None,
        'maxSeverity': None,
        'objects': [],
        'problems': [],
    }

    events = ("start", "end")

    context = etree.iterparse(six.BytesIO(xml),
                              events=events)
    for action, elem in context:
        self.tag = self._remove_ns(elem.tag)

        func = self._get_func(action, self.tag)
        if func not in vars(XMLAPIParser):
            continue

        # Plain attribute lookup instead of eval('self.' + func): same
        # bound method, without evaluating a string built from parsed data.
        handler = getattr(self, func)
        if action == 'start':
            handler(elem, result)
        elif action == 'end':
            handler()

    return result
开发者ID:openstack,项目名称:manila,代码行数:26,代码来源:xml_api_parser.py

示例5: iso_info

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iso_info(self, iso):
    """Extract product, version and build number from an ISO image.

    The image is opened with ``isoparser.parse``; the XML payload returned
    by ``self._find_iso_content`` is then scanned for the ``productName``,
    ``version`` and ``buildNumber`` elements.

    :param iso: the value handed to ``isoparser.parse``.
    :returns: dict with the keys ``product``, ``version`` and ``build``. A
        key stays ``None`` when its element is missing or has no text.
    """
    result = dict(
        product=None,
        version=None,
        build=None
    )

    iso = isoparser.parse(iso)
    content = self._find_iso_content(iso)
    content = io.BytesIO(content)

    # XML tag -> key in ``result``.
    fields = {
        'productName': 'product',
        'version': 'version',
        'buildNumber': 'build',
    }

    context = etree.iterparse(content)
    for action, elem in context:
        key = fields.get(elem.tag)
        # Only use the element's own text. Carrying text over from an
        # earlier element would store unrelated data for an empty tag, or
        # raise UnboundLocalError when no element had text yet.
        if key is not None and elem.text:
            result[key] = elem.text

    return result
开发者ID:mcgonagle,项目名称:ansible_f5,代码行数:26,代码来源:bigip_software.py

示例6: init_etree

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def init_etree(self):
    """
    Creates the ``lxml.etree.iterparse`` object.
    This method should not be called directly,
    ``BioPaxReader.process()`` calls it.

    On success ``self.bp`` holds the parser iterator and ``self.root`` the
    element of its first event. If the source is not well-formed XML both
    are set to ``None``. ``self.used_elements`` is reset to an empty list
    in either case.
    """
    try:

        self.bp = etree.iterparse(self._biopax, events=('start', 'end'))
        # The first event is consumed here; its element is kept as root.
        _, self.root = next(self.bp)

    except etree.XMLSyntaxError:

        self.bp = None
        # Reset the root as well, so a failed parse never leaves the
        # attribute missing or pointing at a previous document.
        self.root = None

    self.used_elements = []
开发者ID:saezlab,项目名称:pypath,代码行数:18,代码来源:pyreact.py

示例7: ParseXmlResponse

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def ParseXmlResponse(self, xml, localOnly=False, subscriptionIds=None):
    """Build managed objects from an XML response.

    The children of the first ``imdata`` element are converted one by one:
    a ``moCount`` child becomes ``self.moCount()``, any other child becomes
    ``self.FromDn(<its dn attribute>)``; the object is then filled through
    ``_fromXmlElement`` and the XML element is cleared.

    :param xml: the response as text; it is encoded to UTF-8 bytes before
        parsing.
    :param localOnly: forwarded to ``_fromXmlElement``.
    :param subscriptionIds: optional list that is extended in place with
        the comma-separated values of the ``subscriptionId`` attribute of
        ``imdata``. When omitted, a fresh list is used for this call.
    :returns: list of the objects created, in document order.
    :raises MoError: if a child element has no ``dn`` attribute.
    """
    # A ``[]`` default would be shared by every call and grow without bound
    # because it is extended below; create a new list per call instead.
    if subscriptionIds is None:
        subscriptionIds = []

    # https://gist.github.com/karlcow/3258330
    xml = bytes(bytearray(xml, encoding='utf-8'))
    context = etree.iterparse(BytesIO(xml),
                              events=('end',), tag='imdata')
    mos = []
    # Only the first ``imdata`` element is consumed.
    event, root = next(context)
    sIds = root.get('subscriptionId', '')
    if sIds:
        subscriptionIds.extend([str(x) for x in sIds.split(',')])
    for element in root.iterchildren():
        if 'dn' not in element.attrib:
            raise MoError('Property `dn` not found in element {}'.format(
                _elementToString(element)))
        if element.tag == 'moCount':
            mo = self.moCount()
        else:
            mo = self.FromDn(element.attrib['dn'])
        mo._fromXmlElement(element, localOnly=localOnly)
        # The element's data now lives in ``mo``; release the XML node.
        element.clear()
        mos.append(mo)
    return mos
开发者ID:datacenter,项目名称:pyaci,代码行数:24,代码来源:core.py

示例8: lxml_trace

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def lxml_trace(data, html=True, **kwargs):
    """Show, event by event, how lxml parses ``data``.

    Each parse event is printed as ``event, tag, text``. No Beautiful Soup
    code is involved, so the output reflects lxml's own view of the
    document. ``html`` and any extra keyword arguments are forwarded to
    ``etree.iterparse``.
    """
    from lxml import etree
    parse_events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for kind, node in parse_events:
        line = "%s, %4s, %s" % (kind, node.tag, node.text)
        print(line)
开发者ID:MarcelloLins,项目名称:ServerlessCrawler-VancouverRealState,代码行数:11,代码来源:diagnose.py

示例9: build_corpus

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def build_corpus():
    """Write a one-sentence-per-line corpus from a MediaWiki XML dump.

    Reads the module-level settings ``lcode``, ``fname`` and
    ``max_corpus_size``. Every ``<text>`` element of ``data/<fname>`` is
    cleaned with ``clean_text``, split with ``sentence_segment`` and
    ``word_segment``, and each sentence of more than 10 words is appended,
    space-joined, to ``data/<lcode>.txt``.

    Documents that fail to process are skipped. After every 1000th
    successfully processed document the counter is printed and the output
    size is checked; the run stops once it exceeds ``max_corpus_size``.
    """
    import sys  # local import: only used for the progress output

    out_path = "data/{}.txt".format(lcode)
    ns = "{http://www.mediawiki.org/xml/export-0.10/}" # namespace
    with codecs.open(out_path, 'w', 'utf-8') as fout:
        i = 1
        for _, elem in ET.iterparse("data/{}".format(fname), tag=ns+"text"):
            running_text = elem.text
            try:
                running_text = clean_text(running_text)
                sents = sentence_segment(running_text)
                for sent in sents:
                    if sent is not None:
                        words = word_segment(sent)
                        if len(words) > 10:
                            if lcode in ['ja']:
                                # NOTE(review): .decode() presumes byte-string
                                # words (Python 2) -- confirm before porting.
                                fout.write(" ".join(words).decode('utf8') + "\n")
                            else:
                                fout.write(" ".join(words) + "\n")
            except Exception:
                # Skipping is okay as we have a pretty big corpus. Catching
                # Exception (not a bare except) lets Ctrl-C stop the run.
                continue
            finally:
                # We need to save memory -- also for the documents skipped
                # above, which previously were never cleared.
                elem.clear()
            if i % 1000 == 0:
                # Progress output; written this way so the code parses on
                # both Python 2 and Python 3.
                sys.stdout.write("{} ".format(i))
                fsize = os.path.getsize(out_path)
                if fsize > max_corpus_size:
                    break
            i += 1
开发者ID:Kyubyong,项目名称:wordvectors,代码行数:31,代码来源:build_corpus.py

示例10: _parse_and_remove

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _parse_and_remove(self, f, path):
    """
    Incrementally parse a large XML source (snippet adapted from the
    Python Cookbook), yielding each element whose tag path below the root
    equals ``path`` and detaching it from its parent once the consumer
    resumes the generator.

    ``path`` is a '/'-separated list of tags, not counting the root
    element. Elements at ``database/table_structure`` are additionally
    passed to ``self._parse_table_structure`` and then detached.
    """
    wanted = path.split('/')
    events = iterparse(f, ('start', 'end'), recover=False, encoding='utf-8', huge_tree=True)
    # Skip the root element
    next(events)
    open_tags = []
    open_elems = []
    for kind, node in events:
        if kind == 'start':
            # Entering an element: remember its tag and the element itself.
            open_tags.append(node.tag)
            open_elems.append(node)
            continue
        if kind != 'end':
            continue

        if open_tags == wanted:
            yield node
            # open_elems[-2] is the parent of the element just closed.
            open_elems[-2].remove(node)
        # dirty hack for getting the tables structure
        if open_tags == ['database', 'table_structure']:
            self._parse_table_structure(node)
            open_elems[-2].remove(node)

        # The stacks are empty when the skipped root element closes.
        if open_tags:
            open_tags.pop()
            open_elems.pop()
开发者ID:zhongbiaodev,项目名称:py-mysql-elasticsearch-sync,代码行数:28,代码来源:__init__.py

示例11: _parse_and_remove

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _parse_and_remove(self, f, path):
    """
    Incrementally parse a large XML source (snippet adapted from the
    Python Cookbook), yielding each element whose tag path below the root
    equals ``path`` and detaching it from its parent once the consumer
    resumes the generator.

    While a ``table_data`` element is open, ``self.current_table`` holds
    its ``name`` attribute; it is reset to ``None`` when a
    ``database/table_data`` element closes. Elements at
    ``database/table_structure`` are passed to
    ``self._parse_table_structure`` and then detached.
    """
    target_path = path.split('/')
    stream = iterparse(f, ('start', 'end'), recover=False, encoding='utf-8', huge_tree=True)
    # Skip the root element
    next(stream)
    tags = []
    nodes = []
    for kind, node in stream:
        if kind == 'start':
            if node.tag == 'table_data':
                self.current_table = node.attrib['name']
            tags.append(node.tag)
            nodes.append(node)
            continue
        if kind != 'end':
            continue

        if tags == ['database', 'table_data']:
            self.current_table = None
        if tags == target_path:
            yield node
            # nodes[-2] is the parent of the element just closed.
            nodes[-2].remove(node)
        # dirty hack for getting the tables structure
        if tags == ['database', 'table_structure']:
            self._parse_table_structure(node)
            nodes[-2].remove(node)

        # The stacks are empty when the skipped root element closes.
        if tags:
            tags.pop()
            nodes.pop()
开发者ID:zhongbiaodev,项目名称:py-mysql-elasticsearch-sync,代码行数:33,代码来源:__init__.py

示例12: get_tag_attributes

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def get_tag_attributes(source, tag_name):
    '''Scan the XML stream in ``source`` for the first opening tag whose
    local name is ``tag_name`` and return that tag's attributes. Parsing
    stops as soon as the tag is found.

    Parameters
    ----------
    source: file-like
        A file-like object over an XML document
    tag_name: str
        The name of the XML tag to look for

    Returns
    -------
    dict
        The attributes of the matched tag, or ``None`` when the document
        contains no such tag.
    '''
    for event, node in etree.iterparse(source, ('start', 'end')):
        if event != 'start':
            # A finished element that was not the target: drop its content.
            node.clear()
        elif xml._local_name(node) == tag_name:
            return node.attrib
    return None
开发者ID:mobiusklein,项目名称:ms_deisotope,代码行数:28,代码来源:xml_reader.py

示例13: iterparse_until

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iterparse_until(source, target_name, quit_name):
    '''Walk the XML stream in ``source`` and yield every element whose
    local name is ``target_name``. Parsing stops at the first opening tag
    whose local name is ``quit_name``.

    Elements are yielded on their ``start`` event.
    NOTE(review): at that point their content is presumably not fully
    parsed yet -- confirm that callers only rely on the attributes.

    Parameters
    ----------
    source: file-like
        A file-like object over an XML document
    target_name: str
        The name of the XML tags to yield
    quit_name: str
        The name to stop parsing at.

    Yields
    ------
    lxml.etree.Element
    '''
    for event, node in etree.iterparse(source, ('start', 'end')):
        # Only opening tags are examined; 'end' events are ignored.
        if event != 'start':
            continue
        if xml._local_name(node) == quit_name:
            return
        if xml._local_name(node) == target_name:
            yield node
        else:
            node.clear()
开发者ID:mobiusklein,项目名称:ms_deisotope,代码行数:30,代码来源:xml_reader.py

示例14: iterparse

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iterparse(self):
    """
    Call lxml.etree.iterparse on the data.xml file found in
    ``self.data_dir``.

    The iterator returned by ``etree.iterparse`` is discarded without
    being consumed.
    """
    data_xml = os.path.join(self.data_dir, "data.xml")
    # The handle opened here is not used by the call below, which receives
    # the path itself.
    with open(data_xml, "r") as f:
        etree.iterparse(data_xml, events=("start", "end"))
开发者ID:recipy,项目名称:recipy,代码行数:9,代码来源:run_lxml.py

示例15: lxml_trace

# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def lxml_trace(data, html=True, **kwargs):
    """Print the sequence of lxml parse events produced for ``data``.

    One line of the form ``event, tag, text`` is written per event. Since
    no Beautiful Soup code runs, this shows how lxml alone parses the
    document. ``html`` and any extra keyword arguments are forwarded to
    ``etree.iterparse``.
    """
    from lxml import etree
    source = StringIO(data)
    for step, elem in etree.iterparse(source, html=html, **kwargs):
        report = "%s, %4s, %s" % (step, elem.tag, elem.text)
        print(report)
开发者ID:the-ethan-hunt,项目名称:B.E.N.J.I.,代码行数:11,代码来源:diagnose.py


注:本文中的lxml.etree.iterparse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。