本文整理汇总了Python中xml.etree.ElementTree.iterparse函数的典型用法代码示例。如果您正苦于以下问题：Python iterparse函数的具体用法？Python iterparse怎么用？Python iterparse使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了iterparse函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: xlsx
def xlsx(fname):
    """Read the first worksheet of an .xlsx archive into a list of row dicts.

    Every ``<row>`` element of ``xl/worksheets/sheet1.xml`` becomes one dict
    that maps the column letters of each cell reference (``"A"``, ``"AZ"``)
    to the text of the cell's ``<v>`` element.  Cells carrying ``t="s"`` are
    looked up in the shared-strings table instead.

    Args:
        fname: Anything ``zipfile.ZipFile`` accepts (path or file object).

    Returns:
        A list of dicts, one per row, in document order.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    rows = []
    with zipfile.ZipFile(fname) as zippy:
        try:
            with zippy.open("xl/sharedStrings.xml") as shared:
                words = [el.text for _, el in iterparse(shared) if el.tag.endswith("}t")]
        except KeyError:
            # ZipFile.open() raises KeyError when the member does not exist;
            # the archive then simply has no shared-strings table.
            words = []

        row = {}
        val = ""
        with zippy.open("xl/worksheets/sheet1.xml") as sheet:
            for _, el in iterparse(sheet):
                tag = el.tag
                if tag.endswith("}v"):  # <v>84</v>
                    val = el.text
                elif tag.endswith("}c"):  # <c r="A3" t="s"><v>84</v></c>
                    if el.attrib.get("t") == "s":
                        val = words[int(val)]
                    # "AZ22" -> "AZ": drop the trailing row number.
                    charac = el.attrib["r"]
                    while charac[-1].isdigit():
                        charac = charac[:-1]
                    row[charac] = val
                    val = ""
                elif tag.endswith("}row"):
                    rows.append(row)
                    row = {}
    return rows
示例2: xlsx
def xlsx(path):
    """Return the first worksheet as a list of rows of column values.

    Rows whose cells are all empty are skipped.  Every remaining row is
    padded with ``""`` and returned as a list of values ordered by column
    letter.

    Args:
        path: Anything ``zipfile.ZipFile`` accepts (path or file object).

    Returns:
        A list of lists of cell values; ``[]`` when the sheet has no
        non-empty row.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    table = []
    cells = {}
    value = ""
    with zipfile.ZipFile(path) as archive:
        try:
            with archive.open("xl/sharedStrings.xml") as shared:
                strings = [e.text for _, e in iterparse(shared) if e.tag.endswith("}t")]
        except KeyError:
            # ZipFile.open() raises KeyError for a missing member: the
            # archive has no shared-strings table.
            strings = []

        with archive.open("xl/worksheets/sheet1.xml") as sheet:
            for _, e in iterparse(sheet):
                if e.tag.endswith("}v"):  # <v>84</v>
                    value = e.text
                elif e.tag.endswith("}c"):  # <c r="A3" t="s"><v>84</v></c>
                    # Only t="s" marks a shared-string index; other cell
                    # types carry their value directly in <v>.
                    if e.attrib.get("t") == "s":
                        value = strings[int(value)]
                    column = e.attrib["r"].rstrip("0123456789")  # AZ22 -> AZ
                    cells[column], value = value, ""
                elif e.tag.endswith("}row"):
                    if any(cells.values()):  # skip empty rows
                        table.append(cells)
                    cells = {}

    if not table:
        # max() below would raise ValueError on an empty sequence.
        return table

    last_column = max(max(row) for row in table)
    for index, row in enumerate(table):  # fill empty cells
        # NOTE(review): CELLS is defined outside this block; presumably the
        # string of column letters in order - confirm.  Both the string max()
        # above and this split look correct only for single-letter columns.
        for column in CELLS.split(last_column)[0] + last_column:
            row.setdefault(column, "")
        table[index] = [row[column] for column in sorted(row)]
    return table
示例3: readXlsx
def readXlsx(fileName, **args):
    """Read one worksheet of an .xlsx archive.

    The cells of the first row are collected into ``header`` (column letter
    -> stripped text).  Every later row becomes a dict of its non-empty
    cells, keyed by column letter, or by the matching header text when
    ``header=True`` is passed.  Because the first row only feeds ``header``,
    the first entry of ``rows`` is always an empty dict.

    Args:
        fileName: Anything ``zipfile.ZipFile`` accepts.
        **args: ``sheet`` (worksheet number, default 1) and ``header``
            (use first-row texts as keys, default False).

    Returns:
        ``[header, rows]``.
    """
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    # Same test as the historical ``isHeader == True`` comparison.
    useHeader = args.get("header", False) == True  # noqa: E712

    rows = []
    row = {}
    header = {}
    with zipfile.ZipFile(fileName) as z:
        # Get shared strings; an archive without string cells has no such
        # member and ZipFile.open() raises KeyError for it.
        try:
            with z.open("xl/sharedStrings.xml") as shared:
                strings = [el.text for _, el in iterparse(shared) if el.tag.endswith("}t")]
        except KeyError:
            strings = []

        value = ""
        # Open specified worksheet
        with z.open("xl/worksheets/sheet%d.xml" % (sheet)) as worksheet:
            for _, el in iterparse(worksheet):
                if el.tag.endswith("}v"):  # <v>84</v>
                    # value, or index into the shared strings
                    value = el.text
                elif el.tag.endswith("}c"):  # <c r="A3" t="s"><v>84</v></c>
                    if el.attrib.get("t") == "s":
                        value = strings[int(value)]
                    # "AZ22" -> "AZ": keep only the column letter(s)
                    letter = el.attrib["r"].rstrip("0123456789")
                    if not rows:
                        # First row: remember the names that COULD be used
                        # as keys.  An empty <v/> yields None, hence the
                        # fallback before strip().
                        header[letter] = (value or "").strip()
                    elif value != "":
                        if useHeader and letter in header:
                            row[header[letter]] = value
                        else:
                            row[letter] = value
                    value = ""
                elif el.tag.endswith("}row"):
                    rows.append(row)
                    row = {}
    return [header, rows]
示例4: read_xlsx
def read_xlsx(file, **args):
    # type: (typing.Any, **typing.Any) -> typing.Tuple[typing.Dict[typing.Any, str], typing.List[typing.Dict[str, str]]]
    """Read one worksheet of an .xlsx archive.

    The cells of the first row are collected into ``header`` (column letter
    -> stripped text).  Every later row becomes a dict of its non-empty
    cells, keyed by column letter, or by the matching header text when
    ``header=True`` is passed.  Because the first row only feeds ``header``,
    the first entry of ``rows`` is always an empty dict.

    Args:
        file: Anything ``zipfile.ZipFile`` accepts.
        **args: ``sheet`` (worksheet number, default 1) and ``header``
            (must be exactly ``True`` to key rows by header text).

    Returns:
        The tuple ``(header, rows)``.
    """
    # from: Hooshmand zandi http://stackoverflow.com/a/16544219
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    is_header = args.get("header", False)
    rows = []  # type: typing.List[typing.Dict[str, str]]
    row = {}
    header = {}
    with zipfile.ZipFile(file) as z:
        # Get shared strings; ZipFile.open() raises KeyError when the archive
        # has no such member (no string cells at all).
        try:
            with z.open('xl/sharedStrings.xml') as shared:
                strings = [
                    el.text for _, el in iterparse(shared)
                    if el.tag.endswith('}t')
                ]  # type: typing.List[str]
        except KeyError:
            strings = []

        value = ''
        # Open specified worksheet
        with z.open('xl/worksheets/sheet%d.xml' % sheet) as worksheet:
            for _, el in iterparse(worksheet):
                if el.tag.endswith('}v'):  # <v>84</v>
                    # value, or index into the shared strings
                    value = el.text
                elif el.tag.endswith('}c'):  # <c r="A3" t="s"><v>84</v></c>
                    if el.attrib.get('t') == 's':
                        value = strings[int(value)]
                    # "AZ22" -> "AZ": keep only the column letter(s)
                    letter = el.attrib['r'].rstrip('0123456789')  # type: str
                    if not rows:
                        # First row: names that COULD be used as keys.  An
                        # empty <v/> yields None, hence the fallback.
                        header[letter] = (value or '').strip()
                    elif value != '':
                        if is_header is True and letter in header:
                            row[header[letter]] = value
                        else:
                            row[letter] = value
                    value = ''
                elif el.tag.endswith('}row'):
                    rows.append(row)
                    row = {}
    return header, rows
示例5: readXlsx
def readXlsx(fileName, **args):
    """Read the rows of one worksheet of an .xlsx archive.

    The cells of the first row are kept aside as header names (column
    letter -> text).  Every later row becomes a dict of its non-empty
    cells, keyed by column letter, or by the matching header text when
    ``header=True`` is passed.  Because the first row only feeds the header
    names, the first entry of the result is always an empty dict.

    Args:
        fileName: Anything ``zipfile.ZipFile`` accepts.
        **args: ``sheet`` (worksheet number, default 1) and ``header``
            (use first-row texts as keys, default False).

    Returns:
        The list of row dicts.
    """
    import zipfile
    from xml.etree.ElementTree import iterparse

    sheet = args.get("sheet", 1)
    # Same test as the historical ``isHeader == True`` comparison.
    useHeader = args.get("header", False) == True  # noqa: E712

    rows = []
    row = {}
    header = {}
    with zipfile.ZipFile(fileName) as z:
        # Get shared strings; ZipFile.open() raises KeyError when the archive
        # has no such member (no string cells at all).
        try:
            with z.open('xl/sharedStrings.xml') as shared:
                strings = [el.text for _, el in iterparse(shared) if el.tag.endswith('}t')]
        except KeyError:
            strings = []

        value = ''
        # Open specified worksheet
        with z.open('xl/worksheets/sheet%d.xml' % (sheet)) as worksheet:
            for _, el in iterparse(worksheet):
                if el.tag.endswith('}v'):  # <v>84</v>
                    # value, or index into the shared strings
                    value = el.text
                elif el.tag.endswith('}c'):  # <c r="A3" t="s"><v>84</v></c>
                    if el.attrib.get('t') == 's':
                        value = strings[int(value)]
                    # "AZ22" -> "AZ": keep only the column letter(s)
                    letter = el.attrib['r'].rstrip('0123456789')
                    if not rows:
                        # First row: names that COULD be used as keys.
                        header[letter] = value
                    elif value != '':
                        if useHeader and letter in header:
                            row[header[letter]] = value
                        else:
                            row[letter] = value
                    value = ''
                elif el.tag.endswith('}row'):
                    rows.append(row)
                    row = {}
    return rows
示例6: main
def main():
    """Load selected rows of ``Posts.xml`` and ``Users.xml`` into ``bigdata.db``.

    Creates table ``SO`` with one row per element of ``Posts.xml`` that has
    ``Title`` and ``OwnerUserId`` attributes and whose ``Tags`` attribute
    contains one of the tracked tag names, and table ``USERS`` with one row
    per element of ``Users.xml`` that has a ``Location`` attribute.  Both
    files are streamed with ``iterparse`` and the root element is cleared as
    elements complete so parsed rows do not accumulate in memory.
    """
    limited_tags = ['jquery', 'javascript', 'python']
    con = lite.connect('bigdata.db')
    try:
        # The first event is the start of the root element; keep a handle on
        # it so finished children can be discarded.  next() works on both
        # Python 2 and 3, unlike the iterator's .next() method.
        context = iter(iterparse('Posts.xml', events=("start", "end")))
        event, root = next(context)
        with con:  # commits on success, rolls back on error
            cur = con.cursor()
            cur.execute("CREATE TABLE SO(Id INTEGER PRIMARY KEY ASC, Tags TEXT, CreationDate TEXT, UserID INTEGER)")
            for event, child in context:
                if event != "end":
                    continue
                attrib = child.attrib
                if 'Title' in attrib and 'OwnerUserId' in attrib:
                    tags = attrib.get('Tags', '')
                    if any(tag in tags for tag in limited_tags):
                        sqlQuery = "INSERT INTO SO VALUES(?,?,?,?)"
                        cur.execute(sqlQuery, (attrib['Id'], attrib['Tags'], attrib['CreationDate'], attrib['OwnerUserId']))
                root.clear()

        context = iter(iterparse('Users.xml', events=("start", "end")))
        event, root = next(context)
        with con:
            cur = con.cursor()
            cur.execute("CREATE TABLE USERS(UserID INTEGER PRIMARY KEY ASC, Location TEXT)")
            for event, child in context:
                if event != "end":
                    continue
                if 'Location' in child.attrib:
                    sqlQuery = "INSERT INTO USERS VALUES(?,?)"
                    cur.execute(sqlQuery, (child.attrib['Id'], child.attrib['Location']))
                root.clear()
    finally:
        # "with con" only ends the transaction; the connection itself has to
        # be closed explicitly.
        con.close()
示例7: parse_and_remove
def parse_and_remove(filename, out):
    """Group the text of ``<message>`` elements by category and write a report.

    A message is used when its text contains ``QID``, ``TITLE``, ``BODY``
    and ``CATEGORY`` and a ``CATEGORY:`` marker; the category is whatever
    follows the last marker, stripped.  The per-category counts are printed
    and the messages are written to *out*, one ``***category***`` heading
    per category in sorted order followed by its messages.

    Args:
        filename: XML source accepted by ``iterparse``.
        out: Path of the report file to (over)write.
    """
    required = ('QID', 'TITLE', 'BODY', 'CATEGORY')
    marker = 'CATEGORY:'
    categories = {}
    questions = {}
    # Element text is only complete at the "end" event, so that is the only
    # event requested.
    for _, elem in iterparse(filename, ('end',)):
        if elem.tag != 'message':
            continue
        text = elem.text
        if text is None:
            # An empty <message/> has no text to inspect.
            continue
        if not all(field in text for field in required):
            continue
        start_ind = text.rfind(marker)
        if start_ind == -1:
            continue
        cat = text[start_ind + len(marker):].strip()
        categories[cat] = categories.get(cat, 0) + 1
        questions.setdefault(cat, []).append(text)
    print(categories)
    with open(out, 'w') as outfile:
        for cat in sorted(questions):
            outfile.write('***%s***\n' % cat)
            for q in questions[cat]:
                outfile.write('%s\n' % q)
示例8: unpack
def unpack(xml):
    """Recreate the directory tree described by *xml* with filler files.

    Walks the document with ``iterparse``.  A ROOT element removes any
    existing directory of that name, recreates it and enters it; a FOLDER
    element creates and enters a directory and is left again when the
    element ends; a FILE element appends SIZE bytes' worth of filler to the
    named file (whole copies of ``content`` plus ``"X"`` padding).

    Args:
        xml: XML source accepted by ``iterparse``.

    Returns:
        Always 0.
    """
    # NOTE(review): FOLDER, FILE, ROOT, NAME, SIZE, content and contentLength
    # come from the enclosing module; presumably contentLength == len(content)
    # - confirm.
    for (event, elem) in iterparse(xml, ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            if elem.tag == FOLDER:
                os.chdir(os.pardir)
        if event == 'start':
            # Single-argument form prints the same text on Python 2 and 3.
            print("working for ... %s" % elem.attrib[NAME])
            if elem.tag == FILE:
                size = int(elem.attrib[SIZE])
                # Floor division: "/" would give a float on Python 3 and
                # break range() below.
                block, remdr = divmod(size, contentLength)
                with open(elem.attrib[NAME], 'a') as out:
                    for _ in range(block):
                        out.write(content)
                    out.write("X" * remdr)
            if elem.tag == FOLDER:
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
            if elem.tag == ROOT:
                shutil.rmtree(elem.attrib[NAME], ignore_errors=True)
                os.mkdir(elem.attrib[NAME])
                os.chdir(elem.attrib[NAME])
    return 0
示例9: read_corpus
def read_corpus(corpus_file_path, sections=('text',)):
    """Yield ``(values, label)`` for every usable ``<item>`` of a corpus file.

    ``values`` holds the text of each child element named in *sections*.
    Items where any of those is missing or empty are skipped.  The label is
    derived from the item's ``<rating>`` (below 3 -> 0, otherwise 1) or,
    when there is no rating, from ``<polarity>`` (``"N"`` -> 0, anything
    else -> 1, no polarity element -> -1).

    Args:
        corpus_file_path: XML source accepted by ``iterparse``.
        sections: Names of the child elements whose text is collected.

    Yields:
        Tuples ``(list_of_section_texts, label)``.
    """
    for _, elem in iterparse(corpus_file_path):
        if elem.tag != 'item':
            continue
        # findtext() returns None for an absent child instead of raising,
        # so such items are filtered out by the all() check below.
        values = [elem.findtext(section) for section in sections]
        if not all(values):
            continue
        rating = elem.find('rating')
        if rating is not None:
            label = 0 if float(rating.text.strip()) < 3 else 1
        else:
            polarity = elem.find('polarity')
            if polarity is None:
                label = -1
            elif polarity.text.strip() == 'N':
                label = 0
            else:
                label = 1
        yield values, label
示例10: loadScheme
def loadScheme(self):
    """Populate the schema attributes from ``schemas/<scheme>/schema.xml``.

    Does nothing when the feed has no output scheme.  Otherwise every
    element without children is appended to ``self.schema`` with its tag,
    text and slash-joined path (root tag excluded); the root element is
    stored in ``self.schema_root`` and an element with children directly
    below the root in ``self.schema_container`` (paths include the root).
    """
    scheme = self.feed.output_scheme
    if scheme is None:
        return
    filepath = os.path.join("schemas", scheme, "schema.xml")
    que = []  # tags of the elements currently open, outermost first
    for (event, node) in iterparse(filepath, ['start', 'end']):
        if event == 'start':
            que.append(node.tag)
            continue
        # Classify at "end": iterparse does not guarantee that an element's
        # children or text are available at its "start" event.
        if not list(node):
            o = struct()
            o.xpath = "/".join(que[1:])
            o.tag = node.tag
            o.desc = node.text
            self.schema.append(o)
        elif len(que) == 1:
            o = struct()
            o.xpath = "/".join(que)
            o.tag = node.tag
            self.schema_root = o
        elif len(que) == 2:
            o = struct()
            o.xpath = "/".join(que)
            o.tag = node.tag
            self.schema_container = o
        que.pop()
示例11: wait_for_new_job
def wait_for_new_job(sasl_token):
    """Open an XMPP session on talk.google.com and return the next stanza.

    Authenticates with *sasl_token*, binds a resource, subscribes to the
    ``cloudprint.google.com`` push channel and then blocks until one more
    complete stanza arrives.

    Args:
        sasl_token: Token sent with the ``X-GOOGLE-TOKEN`` SASL mechanism.

    Returns:
        The element of the stanza received after subscribing.

    Raises:
        AssertionError: If any reply is a ``failure``/``error`` stanza or
            has ``type="error"``; the serialized stanza is the message.
    """
    # https://developers.google.com/cloud-print/docs/rawxmpp
    import socket
    import ssl
    from xml.etree.ElementTree import iterparse, tostring

    host = "talk.google.com"
    # ssl.wrap_socket() was removed in Python 3.12; wrapping through an
    # SSLContext is the supported replacement.
    # NOTE(review): the default context also verifies the server
    # certificate, which the old call did not - confirm this is acceptable.
    context = ssl.create_default_context()
    xmpp = context.wrap_socket(socket.socket(), server_hostname=host)
    try:
        xmpp.connect((host, 5223))
        parser = iterparse(xmpp, ("start", "end"))

        def msg(payload=" "):
            """Send *payload* and return the next complete top-level stanza."""
            # Sockets carry bytes; encode explicitly so this also works on
            # Python 3.
            xmpp.write(payload.encode("utf-8"))
            stack = 0
            for event, el in parser:
                # The enclosing <stream> element never closes while the
                # session is open, so it is not counted.
                if event == "start" and el.tag.endswith("stream"):
                    continue
                stack += 1 if event == "start" else -1
                if stack == 0:
                    assert (
                        not el.tag.endswith("failure") and not el.tag.endswith("error") and not el.get("type") == "error"
                    ), tostring(el)
                    return el

        msg('<stream to="gmail.com" version="1.0" xmlns="http://etherx.jabber.org/streams">')
        msg('<auth xmlns="urn:ietf:params:xml:ns:xmpp-sasl" mechanism="X-GOOGLE-TOKEN">%s</auth>' % sasl_token)
        msg('<s:stream to="gmail.com" version="1.0" xmlns:s="http://etherx.jabber.org/streams" xmlns="jabber:client">')
        iq = msg('<iq type="set"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind"><resource>Armooo</resource></bind></iq>')
        # The bound JID looks like "user@domain/resource"; keep the bare part.
        bare_jid = iq[0][0].text.split("/")[0]
        msg(
            '<iq type="set" to="%s"><subscribe xmlns="google:push"><item channel="cloudprint.google.com" from="cloudprint.google.com"/></subscribe></iq>'
            % bare_jid
        )
        return msg()
    finally:
        xmpp.close()
示例12: importXML
def importXML(path):
    """Extract the base URL and per-page word counts from an XML dump.

    The XML namespace is taken from the ``xmlns="..."`` attribute on the
    first line of the file.  For every ``page`` element that has text, the
    page title, a dict of lower-cased word -> occurrence count and the raw
    text are collected.  Pages without text are skipped.

    Args:
        path: Path of the XML file.

    Returns:
        ``(base, pages)`` where ``base`` is the text of the last ``base``
        element seen (``''`` if none) and ``pages`` is a list of
        ``(title, token_counts, text)`` tuples.
    """
    with open(path) as f:
        header = f.readline()
    start = header.find('xmlns=') + 7  # skip past 'xmlns="'
    NS = "{%s}" % header[start: header.find('\"', start)]
    base_tag = '{0}base'.format(NS)
    page_tag = '{0}page'.format(NS)
    title_path = "{0}title".format(NS)
    text_path = ".//{0}text".format(NS)

    allInfo = []  # to store all the condensed info
    myBase = ''  # to store the base web address
    tokenizer = None  # built on first use, then reused for every page
    with open(path) as f:
        for event, elem in iterparse(f):
            if elem.tag == base_tag:
                myBase = str(elem.text)
            if elem.tag == page_tag:
                title = elem.find(title_path)
                content = elem.find(text_path)
                if content is not None and content.text is not None:
                    if tokenizer is None:
                        # \w+ keeps words and drops punctuation
                        tokenizer = RegexpTokenizer(r'\w+')
                    token_dic = {}
                    for eachword in tokenizer.tokenize(content.text):
                        word = eachword.lower()
                        token_dic[word] = token_dic.get(word, 0) + 1
                    title_text = title.text if title is not None else None
                    allInfo.append((title_text, token_dic, content.text))
                # Release the finished page so the tree does not keep growing.
                elem.clear()
    return myBase, allInfo
示例13: show_all_event
def show_all_event():
    """Print every parse event of ``podcasts.opml``, indented by depth.

    Each line shows two dots per nesting level, the event name, padding so
    that the next column lines up, the element tag and ``id()`` of the
    element object.
    """
    from xml.etree.ElementTree import iterparse
    depth = 0
    prefix_width = 8
    prefix_dots = '.' * prefix_width
    line_template = '{prefix:<0.{prefix_len}}{event:<8}{suffix:<{suffix_len}} {node.tag:<12} {node_id}'
    # NOTE(review): for 'start-ns' / 'end-ns' events iterparse yields a
    # (prefix, uri) tuple / None instead of an element, so {node.tag} would
    # fail on a document that declares namespaces.
    for (event, node) in iterparse('podcasts.opml', ['start', 'end', 'start-ns', 'end-ns']):
        if event == 'end':
            depth -= 1
        prefix_len = depth * 2
        # Pad with whatever the dot prefix leaves of prefix_width so the tag
        # column stays aligned; never negative for deeply nested elements.
        suffix_len = max(prefix_width - prefix_len, 0)
        print(line_template.format(prefix=prefix_dots,
                                   prefix_len=prefix_len,
                                   suffix='',
                                   suffix_len=suffix_len,
                                   node=node,
                                   node_id=id(node),
                                   event=event))
        if event == 'start':
            depth += 1
示例14: parse_and_remove
def parse_and_remove(self, filename, path):
    """Yield the elements found at *path*, detaching each one afterwards.

    *path* is a ``/``-separated list of tags counted from just below the
    root element, e.g. ``"row"`` or ``"rows/row"``.  Once the consumer asks
    for the next element, the previous one is removed from its parent so
    the partially built tree does not grow with the input.  Progress is
    echoed with ``print`` as in the original implementation.

    Args:
        filename: XML source accepted by ``iterparse``.
        path: Tag path of the elements to yield.

    Yields:
        Each matching element, fully parsed.
    """
    print('********')
    from xml.etree.ElementTree import iterparse
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    print(path_parts)
    # Skip the root element, but keep it: it is the parent of any match
    # that sits directly below the root and is never pushed on elem_stack.
    _, root = next(doc)
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        print(event)
        print(elem)
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError as e:
                # Raised for the root's own "end" event: the stacks are
                # already empty because the root was skipped.
                print(e)
示例15: parse_and_remove
def parse_and_remove(filename, path):
    """Yield the elements found at *path*, detaching each one afterwards.

    *path* is a ``/``-separated list of tags counted from just below the
    root element, e.g. ``"row"`` or ``"rows/row"``.  Once the consumer asks
    for the next element, the previous one is removed from its parent so
    the partially built tree does not grow with the input.  The stacks and
    matches are echoed with ``print`` as in the original implementation.

    Args:
        filename: XML source accepted by ``iterparse``.
        path: Tag path of the elements to yield.

    Yields:
        Each matching element, fully parsed.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # Skip the root element, but keep it: it is the parent of any match
    # that sits directly below the root and is never pushed on elem_stack.
    _, root = next(doc)
    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
            print("start.\n")
            print("tag_stack:", tag_stack, "\n")
            print("elem_stack", elem_stack, "\n")
        elif event == 'end':
            if tag_stack == path_parts:
                print("end.\n")
                print("elem:", elem)
                yield elem
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                print("elem_stack[-2]", parent)
                parent.remove(elem)
            try:
                tag_stack.pop()
                elem_stack.pop()
            except IndexError:
                # Raised for the root's own "end" event: the stacks are
                # already empty because the root was skipped.
                pass