Python ProgressCounter.ProgressCounter Class Code Examples

This article collects and summarizes typical usage examples of the Python class Utils.ProgressCounter.ProgressCounter. If you have been wondering what the ProgressCounter class is for, how it is used, or what real code that uses it looks like, the selected class code examples below should help.


The following shows 15 code examples of the ProgressCounter class, ordered by popularity by default.
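
Before walking through the individual examples, here is a minimal usage sketch distilled from them (not taken verbatim from any of the projects): the constructor appears to take the total number of steps (or None when the total is unknown) and a label, and update() advances the counter, optionally by an explicit increment together with a status-message prefix. The work items and message text below are placeholders.

import sys
sys.path.append("..") # assumption: make the Utils package importable, as Example 10 does
from Utils.ProgressCounter import ProgressCounter

items = ["doc1", "doc2", "doc3"] # placeholder work items
counter = ProgressCounter(len(items), "Documents") # total number of steps and a label for the progress output
for item in items:
    # advance by one step; the optional string is printed as a prefix of the progress message
    counter.update(1, "Processing (" + item + "): ")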

Example 1: processCorpus

def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Developer: ninjin, Project: TEES, Lines of code: 25, Source file: DeleteElements.py

Example 2: processCorpus

def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for k in sorted(rules.keys()):
        countsByType[k] = 0
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Developer: jbjorne, Project: CVSTransferTest, Lines of code: 26, Source file: DeleteElements.py

Example 3: fixAltOffsets

def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Developer: jbjorne, Project: CVSTransferTest, Lines of code: 34, Source file: FixAltOffsets.py

Example 4: findHeadsSyntactic

def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue # skip this sentence, otherwise the lookups below would fail on None
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Developer: ninjin, Project: TEES, Lines of code: 30, Source file: DetectHeads.py

Example 5: buildExamples

def buildExamples(exampleBuilder, sentences, options):
    print >> sys.stderr, "Defining predicted value range:",
    sentenceElements = []
    for sentence in sentences:
        sentenceElements.append(sentence[0].sentenceElement)
    exampleBuilder.definePredictedValueRange(sentenceElements, "entity")
    print >> sys.stderr, exampleBuilder.getPredictedValueRange()
    
    examples = []
    if hasattr(exampleBuilder, "styles") and "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    for sentence in sentences:
        counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
        sentence[1] = exampleBuilder.buildExamples(sentence[0])
        examples.extend(sentence[1])
    print >> sys.stderr, "Examples built:", len(examples)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Preprocessing examples:"
    examples = exampleBuilder.preProcessExamples(examples)
    # Save examples
#    if options.output != None:
#        print >> sys.stderr, "Saving examples to", options.output + "/examples.txt"
#        commentLines = []
#        commentLines.append("Input file: " + options.input)
#        commentLines.append("Example builder: " + options.exampleBuilder)
#        commentLines.append("Features:")
#        commentLines.extend(exampleBuilder.featureSet.toStrings())
#        Example.writeExamples(examples, options.output + "/examples.txt", commentLines)
    #examples = filterFeatures(exampleBuilder.featureSet, examples)
    #Example.normalizeFeatureVectors(examples)
    return examples
Developer: jbjorne, Project: Tdevel, Lines of code: 33, Source file: SplitAnalysis.py

Example 6: buildExamplesForSentences

    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()

        print >>sys.stderr, "Examples built:", exampleCount
        print >>sys.stderr, "Features:", len(self.featureSet.getNames())
        # IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # ENDIF
        # Save Ids
        if idFileTag != None:
            print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Developer: jbjorne, Project: Tdevel, Lines of code: 33, Source file: Round2TriggerExampleBuilder.py

Example 7: insertParses

 def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
     typeCounts = {x:defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                 matchFound = True
                 counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                     self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Developer: jbjorne, Project: TEES, Lines of code: 58, Source file: ParseConverter.py

Example 8: run

    def run(
        cls,
        fileIn,
        fileOut=None,
        tokenization="split-Charniak-Lease",
        entityOffsetKey="charOffset",
        includeNeg=False,
        stem=False,
    ):
        """Builds the master gazzeteer.
        fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
        fileOut: a string or None. If given, the resulting gazzetteer will be written out
        tokenization: name of the tokenization to be used

        Produces a dictionary with...
        """

        print >>sys.stderr, "Building gazetteer"

        gztr = {}  # key: token value: dictionary (key: className, value count)
        root = ETUtils.ETFromObj(fileIn)
        if not ET.iselement(root):
            assert isinstance(root, ET.ElementTree)
            root = root.getroot()
        sentences = []
        for sNode in root.getiterator("sentence"):
            sentences.append(sNode)
        counter = ProgressCounter(len(sentences), "Build gazetteer")
        for sNode in sentences:
            counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
            for tokenizationNode in sNode.getiterator("tokenization"):
                if tokenizationNode.get("tokenizer") == tokenization:
                    break
            else:
                assert False, "Did not find %s tokenization" % tokenization
            tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
            assert len(tClasses) == len(tokenizationNode)
            for tokIdx, tokNode in enumerate(tokenizationNode):
                gsClass = tClasses[tokIdx]
                b, e = charOffStr2tuple(tokNode.get("charOffset"))
                tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
                tokDict = gztr.setdefault(tokNodeTxt, {})
                tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # for multi-part texts, add collapsed and last token versions
                if tokNodeTxt.find("-") != -1:
                    # collapsed
                    text = tokNodeTxt.replace("-", "")
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                    # last part
                    text = tokNodeTxt.rsplit("-", 1)[-1]
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
        if fileOut:
            Gazetteer.saveGztr(gztr, fileOut, includeNeg)
        return gztr
Developer: jbjorne, Project: Tdevel, Lines of code: 58, Source file: Gazetteer.py

Example 9: processCorpus

def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Developer: jbjorne, Project: TEES, Lines of code: 57, Source file: DivideSets.py

Example 10: loadCorpus

def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import cElementTreeUtils as ETUtils
    import sys
    sys.path.append("..")
    from Utils.ProgressCounter import ProgressCounter
    from InteractionXML.CorpusElements import CorpusElements
    
    # Corpus may be in file or not
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Use CorpusElements-class to access xml-tree
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
    # Make sentence graphs
    duplicateInteractionEdgesRemoved = 0
    sentences = []
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # No tokens, no sentence. Likewise, no dependencies = no sentence.
        # Let's not remove them though, so that we don't lose sentences from input.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0: 
            #corpusElements.sentences.remove(sentence)
            sentence.sentenceGraph = None
            continue
        for pair in sentence.pairs:
            # gif-xml defines two closely related element types, interactions and
            # pairs. Pairs are like interactions, but they can also be negative (if
            # interaction-attribute == False). Sometimes pair-elements have been
            # (incorrectly) used without this attribute. To work around these issues
            # we take all pair-elements that define interaction and add them to
            # the interaction-element list.
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair) # add to interaction-elements
                if pair.get("type") == None: # type-attribute must be explicitly defined
                    pair.set("type", "undefined")
        # Construct the basic SentenceGraph (only syntactic information)
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        # Add semantic information, i.e. the interactions
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        
        graph.parseElement = sentence.parseElement
        
        #graph.mapEntityHints()
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
Developer: jbjorne, Project: Tdevel, Lines of code: 57, Source file: SentenceGraph.py

Example 11: compareToBinary

def compareToBinary(complexSentencesById, classifications, exampleBuilder, options):
    # Load corpus and make sentence graphs
    print >> sys.stderr, "Calculating performance on binary corpus"
    classificationsBySentence = {}
    for classification in classifications:
        example = classification[0][0]
        sentenceId = example[0].rsplit(".",1)[0]
        sentenceOrigId = complexSentencesById[sentenceId].sentence.attrib["origId"]
        if not classificationsBySentence.has_key(sentenceOrigId):
            classificationsBySentence[sentenceOrigId] = []
        classificationsBySentence[sentenceOrigId].append(classification)
    
    print >> sys.stderr, "Loading Binary corpus"
    binaryCorpusElements = loadCorpus(options.binaryCorpus)
    binaryClassifications = []
    counter = ProgressCounter(len(binaryCorpusElements.sentences), "Build binary classifications")
    for binarySentence in binaryCorpusElements.sentences:
        counter.update(1, "Building binary classifications ("+binarySentence.sentence.attrib["id"]+"): ")
        if(classificationsBySentence.has_key(binarySentence.sentence.attrib["origId"])):
            complexClassificationGraph = NX.XGraph(multiedges = multiedges)
            for token in binarySentence.sentenceGraph.tokens:
                complexClassificationGraph.add_node(token)
            for classification in classificationsBySentence[binarySentence.sentence.attrib["origId"]]:
                if classification[1] > 0:
                    example = classification[0][0]       
                    t1 = example[3]["t1"]
                    t2 = example[3]["t2"]
                    t1Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t1.attrib["charOffset"]:
                            t1Binary = token
                    t2Binary = None
                    for token in binarySentence.sentenceGraph.tokens:
                        if token.attrib["charOffset"] == t2.attrib["charOffset"]:
                            t2Binary = token
                    assert(t1Binary != None and t2Binary != None)
                    complexClassificationGraph.add_edge(t1Binary, t2Binary)
            paths = NX.all_pairs_shortest_path(complexClassificationGraph, cutoff=999)
            for pair in binarySentence.pairs:
                t1 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e1"]]
                t2 = binarySentence.sentenceGraph.entityHeadTokenByEntity[pair.attrib["e2"]]
                assert(pair.attrib["interaction"] == "True" or pair.attrib["interaction"] == "False")
                if pair.attrib["interaction"] == "True":
                    pairClass = 1
                else:
                    pairClass = -1
                extra = {"xtype":"edge","type":"i","t1":t1,"t2":t2}
                if paths.has_key(t1) and paths[t1].has_key(t2):
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], 1, "binary"] )
                else:
                    binaryClassifications.append( [[pair.attrib["id"], pairClass, None, extra], -1, "binary"] )
    print >> sys.stderr, "Evaluating binary classifications"
    evaluation = Evaluation(binaryClassifications, classSet=exampleBuilder.classSet) # evaluate the classifications collected above
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/binary_comparison_results.csv")                    
Developer: jbjorne, Project: Tdevel, Lines of code: 56, Source file: SplitAnalysis.py

Example 12: processCorpora

def processCorpora(EvaluatorClass, fromCorpus, toCorpus, target, classSets, negativeClassId, entityMatchFunction, errorMatrix=False, verbose=False):
    counts = defaultdict(int)
    entityExamples = []
    entityPredictions = []
    interactionExamples = []
    interactionPredictions = []
    eventExamples = []
    eventPredictions = []
    falseEntity = defaultdict(lambda: defaultdict(int))
    if not verbose:
        counter = ProgressCounter(len(fromCorpus.sentences), "Corpus Processing")
    # Loop through the sentences and collect all predictions
    toCorpusSentences = None
    if toCorpus != None:
        toCorpusSentences = toCorpus.documentSentences
    for i in range(len(fromCorpus.documentSentences)):
        if len(fromCorpus.documentSentences[i]) > 0 and not verbose:
            counter.update(len(fromCorpus.documentSentences[i]), fromCorpus.documentSentences[i][0].sentence.get("id").rsplit(".", 1)[0])
        if toCorpusSentences != None:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], toCorpusSentences[i], target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        else:
            newEntityExPred, newInteractionExPred, newEventExPred, sentFalseEntity = processDocument(fromCorpus.documentSentences[i], None, target, classSets, negativeClassId, entityMatchFunction, verbose=verbose, counts=counts)
        entityExamples.extend(newEntityExPred[0])
        entityPredictions.extend(newEntityExPred[1])
        interactionExamples.extend(newInteractionExPred[0])
        interactionPredictions.extend(newInteractionExPred[1])
        eventExamples.extend(newEventExPred[0])
        eventPredictions.extend(newEventExPred[1])
        for k,v in sentFalseEntity.iteritems():
            falseEntity[k][0] += v[0]
            falseEntity[k][1] += v[1]
    
    # Process the predictions with an evaluator and print the results
    evaluator = None
    if len(entityPredictions) > 0:
        evaluator = EvaluatorClass(entityExamples, entityPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Entities")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    if len(interactionPredictions) > 0:
        evaluator = EvaluatorClass(interactionExamples, interactionPredictions, classSet=classSets["interaction"])
        print evaluator.toStringConcise(title="Interactions")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
        #print "Interactions (fp ent->fp int, fn-ent->fn-int )"
        #for key in sorted(falseEntity.keys()):
        #    print "", key, falseEntity[key][0], "/", falseEntity[key][1]
    if len(eventPredictions) > 0:
        evaluator = EvaluatorClass(eventExamples, eventPredictions, classSet=classSets["entity"])
        print evaluator.toStringConcise(title="Events")
        if errorMatrix:
            print evaluator.matrixToString()
            print evaluator.matrixToString(True)
    return evaluator
Developer: jbjorne, Project: TEES, Lines of code: 56, Source file: EvaluateInteractionXML.py

Example 13: processCorpus

 def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True):
     # Create intermediate paths if needed
     if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)):
         os.makedirs(os.path.dirname(output))
     # Open output file
     openStyle = "wt"
     if append:
         #print "Appending examples"
         openStyle = "at"
     if output.endswith(".gz"):
         outfile = gzip.open(output, openStyle)
     else:
         outfile = open(output, openStyle)
     
     # Build examples
     self.exampleCount = 0
     if type(input) in types.StringTypes:
         self.elementCounts = self.getElementCounts(input)
         if self.elementCounts["sentences"] > 0:
             self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
         else:
             self.elementCounts = None
             self.progress = ProgressCounter(None, "Build examples")
     else:
         self.elementCounts = None
         self.progress = ProgressCounter(None, "Build examples")
     
     self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))
     
     inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization)            
     
     #goldIterator = []
     if gold != None:
         goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization)
         for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
             assert inputSentences != None
             assert goldSentences != None
             self.processDocument(inputSentences, goldSentences, outfile)
     else:
         for inputSentences in inputIterator:
             self.processDocument(inputSentences, None, outfile)
     outfile.close()
     self.progress.endUpdate()
     
     # Show statistics
     print >> sys.stderr, "Examples built:", self.exampleCount
     print >> sys.stderr, "Features:", len(self.featureSet.getNames())
     print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
     if self.exampleStats.getExampleCount() > 0:
         self.exampleStats.printStats()
 
     # Save Ids
     if allowNewIds:
         self.saveIds()
Developer: ninjin, Project: TEES, Lines of code: 54, Source file: ExampleBuilder.py

Example 14: processCorpus

def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    vocabulary = {"indices":{}, "vectors":[]}
    
    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1           
    
    print >> sys.stderr, "Counts:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary
Developer: jbjorne, Project: TEES, Lines of code: 54, Source file: MakeVocabulary.py

Example 15: findHeads

def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")                
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"    
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Developer: jbjorne, Project: Tdevel, Lines of code: 52, Source file: FindHeads.py


Note: The Utils.ProgressCounter.ProgressCounter class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and the copyright of the source code remains with the original authors; please refer to the license of the corresponding project before distributing or using it. Do not reproduce without permission.