This page collects typical usage examples of the Python class Utils.ElementTreeUtils. If you are wondering what ElementTreeUtils is for or how to use it, the hand-picked examples below should help.
15 code examples of the ElementTreeUtils class are shown, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python code samples.
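All of the examples revolve around the same load, modify, write cycle: ETUtils.ETFromObj(...) parses a file path, ElementTree, Element, or open stream into a tree; the tree is edited in place; ETUtils.write(...) serializes the result. Below is a minimal sketch of that cycle; the import alias, attribute name, and file names are assumptions for illustration, not taken from any single example.

import Utils.ElementTreeUtils as ETUtils  # assumed import alias; the examples below refer to the module as ETUtils

corpusTree = ETUtils.ETFromObj("corpus.xml.gz")  # accepts a file path, an ElementTree, an Element, or an open stream
corpusRoot = corpusTree.getroot()
for document in corpusRoot.findall("document"):
    document.set("reviewed", "True")             # hypothetical in-place modification of the interaction XML
ETUtils.write(corpusRoot, "corpus-out.xml.gz")   # hypothetical output path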
Example 1: processCorpus
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
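For illustration, a hedged sketch of how processCorpus might be invoked. The shape of the rules argument is inferred from the split("|") handling above (element type, then attribute name, then a "|"-separated value list); the element types, attribute values, and file names are hypothetical.

rules = {"entity": {"type": "Protein|Gene_expression"}}   # inferred structure; each value string is split on "|"
processCorpus("corpus.xml.gz", "filtered.xml.gz", rules)  # hypothetical input and output files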
Example 2: catenateElements
def catenateElements(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example 3: parse
def parse(self, parserName, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None, action="convert", outputFormat=None, memory=None):
    #global stanfordParserDir, stanfordParserArgs
    assert action in ("convert", "penn", "dep")
    if stanfordParserDir == None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    # Run the parser process
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
    outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
    self.printStderr(outPath)
    # Insert the parses
    if action in ("convert", "dep"):
        #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
        self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
    elif action == "penn":
        self.insertPennTrees(outPath, corpusRoot, parserName)
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example 4: mergeAll
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType: origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType: origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example 5: classify
def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    model = self.openModel(model, "r")
    if parse == None: parse = self.getStr(self.tag+"parse", model)
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(data, model, None, workOutputTag,
        model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    if (validate):
        self.structureAnalyzer.load(model)
        self.structureAnalyzer.validate(xml)
        ETUtils.write(xml, output+"-pred.xml.gz")
    else:
        shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
        if stParams["evaluate"]: #self.stEvaluator != None:
            if task == None:
                task = self.getStr(self.tag+"task", model)
            self.stEvaluator.evaluate(output+"-events" + extension, task)
    self.deleteTempWorkDir()
    self.exitState()
Example 6: fixAltOffsets
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example 7: removeUnconnectedEntities
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example 8: findHeadsSyntactic
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Example 9: makeSubset
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >>sys.stderr, "====== Making subset ======"
    print >>sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    # print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >>sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
Example 10: run
def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
    """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
    multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
    multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
    the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
    print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
    tree=ETUtils.ETFromObj(inFile)
    if not ET.iselement(tree):
        assert isinstance(tree,ET.ElementTree)
        root=tree.getroot()
    else:
        root = tree
    if multiplier != -1:
        if binary:
            print >> sys.stderr, "Recall binary mode"
            classRanges = getClassRanges(root.getiterator("entity"))
            assert len(classRanges.keys()) in [0,2]
            if len(classRanges.keys()) == 0:
                print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
        else:
            print >> sys.stderr, "Recall multiclass mode"
            classRanges = None
        for entityNode in root.getiterator("entity"):
            adjustEntity(entityNode,targetLabel,multiplier,classRanges)
    if outFile:
        ETUtils.write(root,outFile)
    return tree
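Based on the docstring above, a hedged usage sketch; the assumption that run is exposed as a classmethod on a class named RecallAdjust comes only from the cls parameter, not from this page.

# multiplier < 1.0 lowers the confidence of the "neg" class, boosting non-negative predictions (per the docstring)
adjustedTree = RecallAdjust.run("predictions.xml.gz", multiplier=0.7, outFile="adjusted.xml.gz")  # hypothetical file names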
Example 11: processCorpus
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
Example 12: test
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Example 13: process
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Example 14: insertParses
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
    corpusTree, corpusRoot = self.getCorpus(input)
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    typeCounts = {x:defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example 15: processCorpus
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + str(k) + ":", countsByType[k]
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)