This page collects typical usage examples of the Python class Utils.ProgressCounter.ProgressCounter. If you are wondering what the ProgressCounter class is for, or how to use it in your own code, the curated class code examples below may help.
A total of 15 code examples of the ProgressCounter class are shown below, ordered by popularity by default.
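Before the individual examples, here is a minimal usage sketch of the ProgressCounter interface as it appears in the code below. The constructor arguments (a total count or None, plus a label), update(), the showMilliseconds attribute and endUpdate() are all taken from the examples themselves; the items list is only a placeholder, and anything not shown in the examples (such as the exact console output) is not assumed here.

items = ["doc1", "doc2", "doc3"]  # placeholder work items

# Known total: pass the item count and a label for the progress line.
from Utils.ProgressCounter import ProgressCounter
counter = ProgressCounter(len(items), "Documents")
for item in items:
    counter.update(1, "Processing (" + item + "): ")  # advance one step, with an optional status message
    # ... process the item here ...

# Unknown total: the examples pass None, optionally enable millisecond timing,
# and close the counter explicitly once the work is done.
counter = ProgressCounter(None, "Build examples")
counter.showMilliseconds = True
for item in items:
    counter.update()
counter.endUpdate()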
Example 1: processCorpus
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example 2: processCorpus
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for k in sorted(rules.keys()):
        countsByType[k] = 0
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example 3: fixAltOffsets
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example 4: findHeadsSyntactic
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Example 5: buildExamples
def buildExamples(exampleBuilder, sentences, options):
    print >> sys.stderr, "Defining predicted value range:",
    sentenceElements = []
    for sentence in sentences:
        sentenceElements.append(sentence[0].sentenceElement)
    exampleBuilder.definePredictedValueRange(sentenceElements, "entity")
    print >> sys.stderr, exampleBuilder.getPredictedValueRange()

    examples = []
    if hasattr(exampleBuilder, "styles") and "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        sentence[1] = exampleBuilder.buildExamples(sentence[0])
        examples.extend(sentence[1])
    print >> sys.stderr, "Examples built:", len(examples)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Preprocessing examples:"
    examples = exampleBuilder.preProcessExamples(examples)
    # Save examples
    # if options.output != None:
    #     print >> sys.stderr, "Saving examples to", options.output + "/examples.txt"
    #     commentLines = []
    #     commentLines.append("Input file: " + options.input)
    #     commentLines.append("Example builder: " + options.exampleBuilder)
    #     commentLines.append("Features:")
    #     commentLines.extend(exampleBuilder.featureSet.toStrings())
    #     Example.writeExamples(examples, options.output + "/examples.txt", commentLines)
    #examples = filterFeatures(exampleBuilder.featureSet, examples)
    #Example.normalizeFeatureVectors(examples)
    return examples
Example 6: buildExamplesForSentences
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
    examples = []
    counter = ProgressCounter(len(sentences), "Build examples")

    if append:
        outfile = open(output, "at")
    else:
        outfile = open(output, "wt")
    exampleCount = 0

    for i in range(len(sentences)):
        sentence = sentences[i]
        goldSentence = [None]
        if goldSentences != None:
            goldSentence = goldSentences[i]
        counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
        exampleCount += len(examples)
        examples = self.preProcessExamples(examples)
        ExampleUtils.appendExamples(examples, outfile)
    outfile.close()

    print >>sys.stderr, "Examples built:", exampleCount
    print >>sys.stderr, "Features:", len(self.featureSet.getNames())
    # IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    # ENDIF
    # Save Ids
    if idFileTag != None:
        print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
Example 7: insertParses
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
    corpusTree, corpusRoot = self.getCorpus(input)
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    typeCounts = {x:defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example 8: run
def run(
    cls,
    fileIn,
    fileOut=None,
    tokenization="split-Charniak-Lease",
    entityOffsetKey="charOffset",
    includeNeg=False,
    stem=False,
):
    """Builds the master gazetteer.
    fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
    fileOut: a string or None. If given, the resulting gazetteer will be written out
    tokenization: name of the tokenization to be used

    Produces a dictionary with...
    """
    print >>sys.stderr, "Building gazetteer"

    gztr = {}  # key: token value: dictionary (key: className, value count)
    root = ETUtils.ETFromObj(fileIn)
    if not ET.iselement(root):
        assert isinstance(root, ET.ElementTree)
        root = root.getroot()
    sentences = []
    for sNode in root.getiterator("sentence"):
        sentences.append(sNode)
    counter = ProgressCounter(len(sentences), "Build gazetteer")
    for sNode in sentences:
        counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
        for tokenizationNode in sNode.getiterator("tokenization"):
            if tokenizationNode.get("tokenizer") == tokenization:
                break
        else:
            assert False, "Did not find %s tokenization" % tokenization
        tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
        assert len(tClasses) == len(tokenizationNode)
        for tokIdx, tokNode in enumerate(tokenizationNode):
            gsClass = tClasses[tokIdx]
            b, e = charOffStr2tuple(tokNode.get("charOffset"))
            tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
            tokDict = gztr.setdefault(tokNodeTxt, {})
            tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
            # for multi-part texts, add collapsed and last token versions
            if tokNodeTxt.find("-") != -1:
                # collapsed
                text = tokNodeTxt.replace("-", "")
                if text != "":
                    tokDict = gztr.setdefault(text, {})
                    tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # last part
                text = tokNodeTxt.rsplit("-", 1)[-1]
                if text != "":
                    tokDict = gztr.setdefault(text, {})
                    tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
    if fileOut:
        Gazetteer.saveGztr(gztr, fileOut, includeNeg)
    return gztr
Example 9: processCorpus
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1

    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1

    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + str(k) + ":", countsByType[k]

    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Example 10: loadCorpus
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import cElementTreeUtils as ETUtils
    import sys
    sys.path.append("..")
    from Utils.ProgressCounter import ProgressCounter
    from InteractionXML.CorpusElements import CorpusElements

    # Corpus may be in file or not
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    # Use CorpusElements-class to access xml-tree
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"

    # Make sentence graphs
    duplicateInteractionEdgesRemoved = 0
    sentences = []
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # No tokens, no sentence. Also, no dependencies = no sentence.
        # Let's not remove them though, so that we don't lose sentences from input.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            #corpusElements.sentences.remove(sentence)
            sentence.sentenceGraph = None
            continue
        for pair in sentence.pairs:
            # gif-xml defines two closely related element types, interactions and
            # pairs. Pairs are like interactions, but they can also be negative (if
            # interaction-attribute == False). Sometimes pair-elements have been
            # (incorrectly) used without this attribute. To work around these issues
            # we take all pair-elements that define interaction and add them to
            # the interaction-element list.
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair)  # add to interaction-elements
                if pair.get("type") == None:  # type-attribute must be explicitly defined
                    pair.set("type", "undefined")
        # Construct the basic SentenceGraph (only syntactic information)
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        # Add semantic information, i.e. the interactions
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
        #graph.mapEntityHints()
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
Example 11: compareToBinary
def compareToBinary(complexSentencesById, classifications, exampleBuilder, options):
    # Load corpus and make sentence graphs
    print >> sys.stderr, "Calculating performance on binary corpus"
    classificationsBySentence = {}
    for classification in classifications:
        example = classification[0][0]
        sentenceId = example[0].rsplit(".",1)[0]
        sentenceOrigId = complexSentencesById[sentenceId].sentence.attrib["origId"]
        if not classificationsBySentence.has_key(sentenceOrigId):
            classificationsBySentence[sentenceOrigId] = []
        classificationsBySentence[sentenceOrigId].append(classification)

    print >> sys.stderr, "Loading Binary corpus"
    binaryCorpusElements = loadCorpus(options.binaryCorpus)
    binaryClassifications = []
    counter = ProgressCounter(len(binaryCorpusElements.sentences), "Build binary classifications")
    for binarySentence in binaryCorpusElements.sentences:
        counter.update(1, "Building binary classifications ("+binarySentence.sentence.attrib["id"]+"): ")
        if(classificationsBySentence.has_key(binarySentence.sentence.attrib["origId"])):
            complexClassificationGraph = NX.XGraph(multiedges = multiedges)
            for token in binarySentence.sentenceGraph.tokens:
                complexClassificationGraph.add_node(token)
            for classification in classificationsBySentence[binarySentence.sentence.attrib["origId"]]:
                if classification[1] > 0:
                    example = classification[0][0]
                    t1 = example[3]["t1"]
                    t2 = example[3]["t2"]
                    t1Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t1.attrib["charOffset"]:
                            t1Binary = token
                    t2Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t2.attrib["charOffset"]:
                            t2Binary = token
                    assert(t1Binary != None and t2Binary != None)
                    complexClassificationGraph.add_edge(t1Binary, t2Binary)
            paths = NX.all_pairs_shortest_path(complexClassificationGraph, cutoff=999)
            for pair in binarySentence.pairs:
                t1 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e1"]]
                t2 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e2"]]
                assert(pair.attrib["interaction"] == "True" or pair.attrib["interaction"] == "False")
                if pair.attrib["interaction"] == "True":
                    pairClass = 1
                else:
                    pairClass = -1
                extra = {"xtype":"edge","type":"i","t1":t1,"t2":t2}
                if paths.has_key(t1) and paths[t1].has_key(t2):
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], 1, "binary"] )
                else:
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], -1, "binary"] )
    print >> sys.stderr, "Evaluating binary classifications"
    evaluation = Evaluation(binaryClassifications, classSet=exampleBuilder.classSet)
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/binary_comparison_results.csv")
Example 12: processCorpora
def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=False, verbose=False):
    counts = defaultdict(int)
    entityExamples = []
    entityPredictions = []
    interactionExamples = []
    interactionPredictions = []
    eventExamples = []
    eventPredictions = []
    falseEntity = defaultdict(lambda: defaultdict(int))
    if not verbose:
        counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing")
    # Loop through the sentences and collect all predictions
    toCorpusSentences = None
    if toCorpus != None:
        toCorpusSentences = toCorpus.documentSentences
    for i in range(len(fromCorpus.documentSentences)):
        if len(fromCorpus.documentSentences[i]) > 0 and not verbose:
            counter.update(len(fromCorpus.documentSentences[i]), fromCorpus.documentSentences[i][0].sentence.get("id").rsplit(".", 1)[0])
        if toCorpusSentences != None:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], toCorpusSentences[i], target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        else:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], None, target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        entityExamples.extend(newEntityExPred[0])
        entityPredictions.extend(newEntityExPred[1])
        interactionExamples.extend(newInteractionExPred[0])
        interactionPredictions.extend(newInteractionExPred[1])
        eventExamples.extend(newEventExPred[0])
        eventPredictions.extend(newEventExPred[1])
        for k,v in sentFalseEntity.iteritems():
            falseEntity[k][0] += v[0]
            falseEntity[k][1] += v[1]

    # Process the predictions with an evaluator and print the results
    evaluator = None
    if len(entityPredictions) > 0:
        evaluator = EvaluatorClass(entityExamples, entityPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Entities")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    if len(interactionPredictions) > 0:
        evaluator = EvaluatorClass(interactionExamples, interactionPredictions, classSet=classSets["interaction"])
        print evaluator.toStringConcise(title="Interactions")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
        #print "Interactions (fp ent->fp int, fn-ent->fn-int )"
        #for key in sorted(falseEntity.keys()):
        #    print "", key, falseEntity[key][0], "/", falseEntity[key][1]
    if len(eventPredictions) > 0:
        evaluator = EvaluatorClass(eventExamples, eventPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Events")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    return evaluator
Example 13: processCorpus
def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True):
    # Create intermediate paths if needed
    if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))
    # Open output file
    openStyle = "wt"
    if append:
        #print "Appending examples"
        openStyle = "at"
    if output.endswith(".gz"):
        outfile = gzip.open(output, openStyle)
    else:
        outfile = open(output, openStyle)

    # Build examples
    self.exampleCount = 0
    if type(input) in types.StringTypes:
        self.elementCounts = self.getElementCounts(input)
        if self.elementCounts["sentences"] > 0:
            self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
        else:
            self.elementCounts = None
            self.progress = ProgressCounter(None, "Build examples")
    else:
        self.elementCounts = None
        self.progress = ProgressCounter(None, "Build examples")

    self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

    inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization)

    #goldIterator = []
    if gold != None:
        goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization)
        for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
            assert inputSentences != None
            assert goldSentences != None
            self.processDocument(inputSentences, goldSentences, outfile)
    else:
        for inputSentences in inputIterator:
            self.processDocument(inputSentences, None, outfile)
    outfile.close()
    self.progress.endUpdate()

    # Show statistics
    print >> sys.stderr, "Examples built:", self.exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()

    # Save Ids
    if allowNewIds:
        self.saveIds()
Example 14: processCorpus
def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    vocabulary = {"indices":{}, "vectors":[]}

    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary)  # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary)  # Out of vocabulary

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary
Example 15: findHeads
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml