本文整理汇总了Python中lxml.etree.iterparse方法的典型用法代码示例。如果您正苦于以下问题：Python etree.iterparse方法的具体用法？Python etree.iterparse怎么用？Python etree.iterparse使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml.etree的用法示例。
在下文中一共展示了etree.iterparse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def parse(fp):
    """Stream the ``row`` records of a StackExchange data-dump XML file.

    The document is read incrementally with ``etree.iterparse`` on ``end``
    events and every processed element is released right away, so the whole
    tree is never held in memory.

    Args:
        fp: Source handed unchanged to ``etree.iterparse``.

    Yields:
        dict: A snapshot of the attributes of each ``row`` element, in
        document order.

    Raises:
        AssertionError: If a ``row`` element carries text content.
    """
    context = etree.iterparse(fp, events=('end',))
    for _action, elem in context:
        if elem.tag == 'row':
            assert elem.text is None, "The row wasn't empty"
            # Yield a copy: ``elem.attrib`` is backed by the element itself,
            # and the ``elem.clear()`` below would empty any row the consumer
            # is still holding on to (e.g. ``list(parse(fp))``).
            yield dict(elem.attrib)
        # Cleanup, step 1: empty the current element. Not strictly required
        # when siblings are deleted as well, but it frees memory earlier.
        elem.clear()
        # Cleanup, step 2: drop the already-processed preceding siblings.
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    # Keep no references to Element objects outside the loop.
示例2: _g_process_et_items
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _g_process_et_items(path, tag) -> Iterable[Tuple]:
    """
    Generator: lazily walk the ``end`` events of the elements selected by
    ``tag`` in the XML source ``path``.

    Each ``(event, element)`` pair is handed to the consumer first; the
    element is cleared only once the consumer asks for the next pair, so it
    must finish with an element before advancing the generator.
    """
    parsed_events = etree.iterparse(path, tag=tag, events=('end',))
    for event_name, node in parsed_events:
        yield event_name, node
        # The consumer is done with this node: drop its content so that
        # processed elements do not pile up in memory.
        node.clear()
示例3: get_parser
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def get_parser(filename):
    """Yield ``(title, text)`` for namespace-0 pages of a bz2 MediaWiki dump.

    The archive is decompressed and parsed as a stream; each ``page`` element
    is cleared once it has been handled.

    Args:
        filename: Path of the bz2-compressed XML export
            (``export-0.10`` schema).

    Yields:
        tuple: ``(title_text, revision_text)`` for every page whose ``<ns>``
        child contains ``'0'``.
    """
    ns_token = '{http://www.mediawiki.org/xml/export-0.10/}ns'
    title_token = '{http://www.mediawiki.org/xml/export-0.10/}title'
    revision_token = '{http://www.mediawiki.org/xml/export-0.10/}revision'
    text_token = '{http://www.mediawiki.org/xml/export-0.10/}text'
    # 'rb', not 'r+b': BZ2File has no read/update mode and rejects 'r+b'
    # with ValueError before a single byte is read.
    with bz2.BZ2File(filename, 'rb') as bz2_file:
        for _event, element in etree.iterparse(bz2_file, events=('end',)):
            if not element.tag.endswith('page'):
                continue
            namespace_tag = element.find(ns_token)
            # A page without an <ns> child is skipped rather than crashing.
            if namespace_tag is not None and namespace_tag.text == '0':
                title_tag = element.find(title_token)
                text_tag = element.find(revision_token).find(text_token)
                yield title_tag.text, text_tag.text
            # Only whole pages are cleared; clearing every element would wipe
            # the children before their page's end event is seen.
            element.clear()
示例4: parse
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def parse(self, xml):
    """Parse an XML API response into a result dictionary.

    The document is walked with ``start``/``end`` events. For every event
    the namespace-stripped tag is stored on ``self.tag`` and
    ``self._get_func`` is asked for a handler name; the handler runs only if
    that name is defined on ``XMLAPIParser``.

    Args:
        xml: The response body as bytes (it is wrapped in ``six.BytesIO``).

    Returns:
        dict: With keys ``type``, ``taskId``, ``maxSeverity``, ``objects``
        and ``problems``, filled in by the ``start`` handlers.
    """
    result = {
        'type': None,
        'taskId': None,
        'maxSeverity': None,
        'objects': [],
        'problems': [],
    }
    events = ("start", "end")
    context = etree.iterparse(six.BytesIO(xml),
                              events=events)
    for action, elem in context:
        self.tag = self._remove_ns(elem.tag)
        func = self._get_func(action, self.tag)
        if func not in vars(XMLAPIParser):
            continue
        # getattr performs the same bound-method lookup as the former
        # eval('self.' + func), without evaluating a string as code.
        if action == 'start':
            getattr(self, func)(elem, result)
        elif action == 'end':
            getattr(self, func)()
    return result
示例5: iso_info
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iso_info(self, iso):
    """Collect product name, version and build number from an ISO image.

    The ISO is opened with ``isoparser.parse``, the XML payload is obtained
    from ``self._find_iso_content`` and scanned element by element.

    Args:
        iso: Value passed to ``isoparser.parse``.

    Returns:
        dict: Keys ``product``, ``version`` and ``build``; a key stays
        ``None`` when no matching element with non-empty text was seen.
    """
    result = dict(
        product=None,
        version=None,
        build=None
    )
    # XML tag -> key of ``result`` that receives the element text.
    key_by_tag = {
        'productName': 'product',
        'version': 'version',
        'buildNumber': 'build',
    }
    parsed_iso = isoparser.parse(iso)
    payload = io.BytesIO(self._find_iso_content(parsed_iso))
    for _action, elem in etree.iterparse(payload):
        text = elem.text
        if not text:
            continue
        key = key_by_tag.get(elem.tag)
        if key is not None:
            result[key] = text
    return result
示例6: init_etree
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def init_etree(self):
    """
    Set up the ``lxml.etree.iterparse`` iterator over ``self._biopax``.

    The first event is consumed immediately and its element is stored as
    ``self.root``. If that raises ``etree.XMLSyntaxError`` the iterator
    ``self.bp`` is set to ``None`` instead. ``self.used_elements`` is reset
    to an empty list in both cases.

    This method should not be called directly,
    ``BioPaxReader.process()`` calls it.
    """
    try:
        self.bp = etree.iterparse(self._biopax, events=('start', 'end'))
        first_event = next(self.bp)
    except etree.XMLSyntaxError:
        self.bp = None
    else:
        # first_event is an (event, element) pair; keep only the element.
        self.root = first_event[1]
    self.used_elements = []
示例7: ParseXmlResponse
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def ParseXmlResponse(self, xml, localOnly=False, subscriptionIds=None):
    """Build managed objects from an ``imdata`` XML response.

    Args:
        xml: The response document as text; it is UTF-8 encoded before
            parsing.
        localOnly: Forwarded to each object's ``_fromXmlElement``.
        subscriptionIds: Optional list that is extended in place with the
            ids found in the root's ``subscriptionId`` attribute. When
            omitted, a fresh list is used for this call only.

    Returns:
        list: One object per child of the root, in document order.

    Raises:
        MoError: If a child element has no ``dn`` attribute.
    """
    # A ``[]`` default would be shared between calls and accumulate the ids
    # of every response parsed so far; create a new list per call instead.
    if subscriptionIds is None:
        subscriptionIds = []
    # https://gist.github.com/karlcow/3258330
    xml = bytes(bytearray(xml, encoding='utf-8'))
    context = etree.iterparse(BytesIO(xml),
                              events=('end',), tag='imdata')
    mos = []
    _event, root = next(context)
    sIds = root.get('subscriptionId', '')
    if sIds:
        subscriptionIds.extend([str(x) for x in sIds.split(',')])
    for element in root.iterchildren():
        if 'dn' not in element.attrib:
            raise MoError('Property `dn` not found in element {}'.format(
                _elementToString(element)))
        if element.tag == 'moCount':
            mo = self.moCount()
        else:
            mo = self.FromDn(element.attrib['dn'])
        mo._fromXmlElement(element, localOnly=localOnly)
        # The element's data now lives in ``mo``; release the XML node.
        element.clear()
        mos.append(mo)
    return mos
示例8: lxml_trace
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def lxml_trace(data, html=True, **kwargs):
    """Dump every event lxml emits while parsing ``data``.

    Useful for seeing how lxml handles a document on its own, with no
    Beautiful Soup code involved. One line is printed per event: the event
    name, the element tag and the element text.

    :param data: The markup to parse, as a string.
    :param html: Passed to ``etree.iterparse`` as its ``html`` argument.
    :param kwargs: Any further keyword arguments for ``etree.iterparse``.
    """
    from lxml import etree
    stream = StringIO(data)
    for event, element in etree.iterparse(stream, html=html, **kwargs):
        line = "%s, %4s, %s" % (event, element.tag, element.text)
        print(line)
示例9: build_corpus
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def build_corpus():
    """Write a one-sentence-per-line corpus from a MediaWiki XML dump.

    Reads ``data/<fname>``, takes the content of every ``text`` element,
    cleans and segments it, and writes each sentence of more than ten words
    to ``data/<lcode>.txt`` as space-separated words. Every 1000 processed
    elements the counter is printed and the output size is checked; the run
    stops once the file exceeds ``max_corpus_size`` bytes.

    Uses the module-level names ``lcode``, ``max_corpus_size`` and
    ``fname``. Elements whose processing fails are skipped.
    """
    global lcode, max_corpus_size, fname
    import sys  # local import: only needed for the progress output

    with codecs.open("data/{}.txt".format(lcode), 'w', 'utf-8') as fout:
        i = 1
        ns = "{http://www.mediawiki.org/xml/export-0.10/}"  # namespace
        for _, elem in ET.iterparse("data/{}".format(fname), tag=ns+"text"):
            try:
                running_text = clean_text(elem.text)
                for sent in sentence_segment(running_text):
                    if sent is None:
                        continue
                    words = word_segment(sent)
                    if len(words) > 10:
                        if lcode in ['ja']:
                            # NOTE(review): ``.decode`` only exists on
                            # Python 2 byte strings -- confirm the target
                            # interpreter before relying on this branch.
                            fout.write(" ".join(words).decode('utf8') + "\n")
                        else:
                            fout.write(" ".join(words) + "\n")
            except Exception:
                # It's okay to drop an element, the corpus is pretty big.
                # Unlike a bare ``except:`` this lets Ctrl-C through.
                continue
            finally:
                # Always release the element, including on the failure path
                # (previously ``continue`` skipped this and leaked memory).
                elem.clear()
            if i % 1000 == 0:
                # Progress counter on one line, valid on Python 2 and 3.
                sys.stdout.write("{} ".format(i))
                fsize = os.path.getsize("data/{}.txt".format(lcode))
                if fsize > max_corpus_size:
                    break
            i += 1
示例10: _parse_and_remove
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _parse_and_remove(self, f, path):
    """
    Incrementally parse a large XML file, yielding the elements whose tag
    path equals ``path`` (snippet adapted from the Python Cookbook).

    ``path`` is a '/'-separated list of tags. The root's start event is
    discarded, so the tracked tag path begins below the root. A yielded
    element is removed from its parent as soon as the consumer resumes the
    generator. Elements at ``database/table_structure`` are additionally
    passed to ``self._parse_table_structure`` and then removed.
    """
    wanted = path.split('/')
    events = iterparse(f, ('start', 'end'), recover=False, encoding='utf-8', huge_tree=True)
    # Skip the root element
    next(events)
    open_tags = []
    open_elems = []
    for kind, node in events:
        if kind == 'start':
            open_tags.append(node.tag)
            open_elems.append(node)
            continue
        if kind != 'end':
            continue
        if open_tags == wanted:
            yield node
            open_elems[-2].remove(node)
        if open_tags == ['database', 'table_structure']:  # dirty hack for getting the tables structure
            self._parse_table_structure(node)
            open_elems[-2].remove(node)
        # Both stacks grow and shrink together; they are empty only for the
        # root's end event, whose start event was skipped above.
        if open_tags:
            open_tags.pop()
            open_elems.pop()
示例11: _parse_and_remove
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def _parse_and_remove(self, f, path):
    """
    Stream a large XML file and yield every element found at ``path``
    (snippet adapted from the Python Cookbook).

    ``path`` is a '/'-separated tag path counted from below the root, whose
    start event is skipped. While a ``table_data`` element is open its
    ``name`` attribute is kept in ``self.current_table``; the attribute is
    reset to ``None`` when a ``database/table_data`` element ends. Yielded
    elements and ``database/table_structure`` elements are detached from
    their parent after use; the latter are first handed to
    ``self._parse_table_structure``.
    """
    target_path = path.split('/')
    stream = iterparse(f, ('start', 'end'), recover=False, encoding='utf-8', huge_tree=True)
    # Skip the root element
    next(stream)
    data_path = ['database', 'table_data']
    structure_path = ['database', 'table_structure']
    tags, nodes = [], []
    for event, elem in stream:
        if event == 'end':
            if tags == data_path:
                self.current_table = None
            if tags == target_path:
                yield elem
                nodes[-2].remove(elem)
            if tags == structure_path:
                # dirty hack for getting the tables structure
                self._parse_table_structure(elem)
                nodes[-2].remove(elem)
            # The stacks always have equal length; they are empty only when
            # the root closes, because its start event was never pushed.
            if tags:
                tags.pop()
                nodes.pop()
        elif event == 'start':
            if elem.tag == 'table_data':
                self.current_table = elem.attrib['name']
            tags.append(elem.tag)
            nodes.append(elem)
示例12: get_tag_attributes
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def get_tag_attributes(source, tag_name):
    '''Return the attributes of the first element in ``source`` whose local
    name is ``tag_name``.

    The stream is parsed incrementally and parsing stops at the start event
    of the first match. Elements that end before a match is found are
    cleared.

    Parameters
    ----------
    source: file-like
        A file-like object over an XML document
    tag_name: str
        The name of the XML tag to parse until

    Returns
    -------
    dict
        The attribute mapping of the matched element, or ``None`` when the
        document contains no such element.
    '''
    for event, tag in etree.iterparse(source, ('start', 'end')):
        if event != 'start':
            tag.clear()
        elif xml._local_name(tag) == tag_name:
            return tag.attrib
    return None
示例13: iterparse_until
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iterparse_until(source, target_name, quit_name):
    '''Incrementally parse the XML stream in ``source`` and yield each
    element whose local name is ``target_name`` once it has ended.

    Parsing stops as soon as an element whose local name is ``quit_name``
    starts. Ended elements that do not match ``target_name`` are cleared.

    Parameters
    ----------
    source: file-like
        A file-like object over an XML document
    target_name: str
        The name of the XML tags to yield
    quit_name: str
        The name to stop parsing at.

    Yields
    ------
    lxml.etree.Element
    '''
    for event, tag in etree.iterparse(source, ('start', 'end')):
        local_name = xml._local_name(tag)
        if event == 'start':
            if local_name == quit_name:
                return
        elif local_name == target_name:
            yield tag
        else:
            tag.clear()
示例14: iterparse
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def iterparse(self):
    """
    Use lxml.etree.iterparse to parse data.xml.

    The file ``data.xml`` inside ``self.data_dir`` is opened in binary mode
    and parsed to the end with ``start`` and ``end`` events. The events are
    discarded and nothing is returned; errors raised while parsing
    propagate to the caller.
    """
    file_name = os.path.join(self.data_dir, "data.xml")
    with open(file_name, "rb") as f:
        # iterparse is lazy: nothing is parsed until the iterator is
        # consumed, so it has to be exhausted explicitly. Parsing from the
        # handle opened above also puts that handle to use.
        for _event, _elem in etree.iterparse(f, events=("start", "end")):
            pass
示例15: lxml_trace
# 需要导入模块: from lxml import etree [as 别名]
# 或者: from lxml.etree import iterparse [as 别名]
def lxml_trace(data, html=True, **kwargs):
    """Show the raw sequence of lxml parse events for ``data``.

    For each event reported by ``etree.iterparse`` a line with the event
    name, the element's tag and the element's text is printed, which makes
    it possible to inspect lxml's view of a document without any Beautiful
    Soup code running.

    :param data: The markup to parse, as a string.
    :param html: Forwarded to ``etree.iterparse`` as ``html``.
    :param kwargs: Extra keyword arguments forwarded to ``etree.iterparse``.
    """
    from lxml import etree
    template = "%s, %4s, %s"
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print(template % (event, element.tag, element.text))