

Python Utils.ElementTreeUtils Class Code Examples

This article collects typical usage examples of the Utils.ElementTreeUtils class in Python. If you have been wondering what the ElementTreeUtils class does and how to use it, the selected code examples below should help.


The 15 code examples of the ElementTreeUtils class shown below are sorted by popularity by default.

Example 1: processCorpus

def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Author: ninjin, Project: TEES, Lines: 25, Source: DeleteElements.py
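A minimal usage sketch (the file names and the rule contents are assumptions for illustration; processSentence, which performs the actual deletions, is not shown). Each rule value is a "|"-separated list of attribute values, which the function itself splits into a list:

rules = {"entity": {"type": "Protein|Gene"}}  # delete entities typed Protein or Gene
processCorpus("corpus.xml", "corpus-filtered.xml", rules)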

Example 2: catenateElements

def catenateElements(inputs, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    # the two input corpora are renumbered so that document ids do not collide
    c1 = RecalculateIds.recalculateIds(inputs[0], None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(inputs[1], None, False, numDocs)
    
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Author: DUT-LiuYang, Project: TEES, Lines: 35, Source: Catenate.py
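A hypothetical call with made-up file names; the two input corpora are merged document by document and the combined tree is also written to the output file:

merged = catenateElements(["corpus1.xml", "corpus2.xml"], "merged.xml")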

Example 3: parse

 def parse(self, parserName, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None, action="convert", outputFormat=None, memory=None):
     #global stanfordParserDir, stanfordParserArgs
     assert action in ("convert", "penn", "dep")
     if stanfordParserDir == None:
         stanfordParserDir = Settings.STANFORD_PARSER_DIR
     # Run the parser process
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
     outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
     self.printStderr(outPath)
     # Insert the parses    
     if action in ("convert", "dep"):
         #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
         self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
     elif action == "penn":
         self.insertPennTrees(outPath, corpusRoot, parserName)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Author: jbjorne, Project: TEES, Lines: 27, Source: StanfordParser.py
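A hedged usage sketch: the snippet shows only the method, so the constructor call below is an assumption; "McCC" as the parse name and the file paths are likewise illustrative.

parser = StanfordParser()  # hypothetical constructor; only parse() is shown above
parser.parse("McCC", "corpus.xml", output="parsed.xml", action="convert")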

Example 4: mergeAll

def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType: origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType: origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Author: DUT-LiuYang, Project: TEES, Lines: 25, Source: MergeDuplicateEntities.py
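A hypothetical call with made-up file names; with iterate=True the corpus is streamed one document at a time (bounding memory use), and the function then returns None instead of the loaded corpus:

mergeAll("corpus.xml", output="merged.xml", iterate=True)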

Example 5: classify

 def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
     model = self.openModel(model, "r")
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     if parse == None: parse = self.getStr(self.tag+"parse", model)
     workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
     xml = self.classifyToXML(data, model, None, workOutputTag, 
         model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output+"-pred.xml.gz")
     else:
         shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]: #self.useBioNLPSTFormat:
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz" 
         Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]: #self.stEvaluator != None:
             if task == None: 
                 task = self.getStr(self.tag+"task", model)
             self.stEvaluator.evaluate(output+"-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Author: jbjorne, Project: TEES, Lines: 28, Source: SingleStageDetector.py

Example 6: fixAltOffsets

def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Author: DUT-LiuYang, Project: TEES, Lines: 34, Source: FixAltOffsets.py
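The core of the fix is the offset arithmetic: a document-relative altOffset is made sentence-relative by subtracting the sentence's start offset. A self-contained illustration with made-up numbers:

sentOffset = (100, 180)   # sentence span within the document
altOffset = (112, 119)    # entity span, document-relative
relative = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
print relative            # (12, 19), now relative to the sentence start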

Example 7: removeUnconnectedEntities

def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Author: ninjin, Project: TEES, Lines: 35, Source: RemoveUnconnectedEntities.py
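A hypothetical call with made-up file names; every entity that is not an e1 or e2 endpoint of some interaction is removed, except named entities (isName == "True"), which are always preserved:

pruned = removeUnconnectedEntities("corpus.xml", "corpus-pruned.xml")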

Example 8: findHeadsSyntactic

def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Author: ninjin, Project: TEES, Lines: 30, Source: DetectHeads.py
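A hedged usage sketch; the parse and tokenization names are assumptions (TEES commonly uses the same name, e.g. "McCC", for both), as is the input file:

corpus = ETUtils.ETFromObj("parsed.xml").getroot()
counts = findHeadsSyntactic(corpus, "McCC", "McCC")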

Example 9: makeSubset

def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >>sys.stderr, "====== Making subset ======"
    print >>sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    # print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >>sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
Author: DUT-LiuYang, Project: TEES, Lines: 34, Source: MakeSubset.py
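The selection logic assigns each document one of 100 folds and keeps those whose fold index is below ratio * 100. A self-contained sketch of that check, with random.randint standing in for Core.Split.getFolds:

import random
rand = random.Random(0)
division = [rand.randint(0, 99) for x in range(10)]  # stand-in for Core.Split.getFolds
selectedFolds = 30                                   # i.e. ratio = 0.3
kept = [i for i, fold in enumerate(division) if fold <= selectedFolds - 1]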

Example 10: run

 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
Author: ninjin, Project: TEES, Lines: 28, Source: RecallAdjust.py
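A hypothetical invocation (run takes cls, so it is presumably declared as a classmethod; the class name is inferred from the source file name, and the file names are illustrative):

tree = RecallAdjust.run("predictions.xml.gz", multiplier=0.7, outFile="adjusted.xml.gz")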

Example 11: processCorpus

def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
Author: jbjorne, Project: TEES, Lines: 27, Source: MakeSets.py
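A hypothetical call illustrating the cutoff semantics: each document in the source set draws a random value in [0, 1) and is assigned to the first set whose cutoff it does not exceed, so the pairs below give roughly an 80/20 train/devel split:

newSets = [("train", 0.8), ("devel", 1.0)]
processCorpus("corpus.xml", "corpus-split.xml", "train", newSets, seed=1)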

Example 12: test

def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Author: jbjorne, Project: TEES, Lines: 8, Source: insertResources.py

Example 13: process

def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Author: jbjorne, Project: TEES, Lines: 8, Source: insertResources.py

Example 14: insertParses

 def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
     typeCounts = {x:defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                 matchFound = True
                 counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                     self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Author: jbjorne, Project: TEES, Lines: 58, Source: ParseConverter.py
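A hedged usage sketch; the constructor, the parse directory layout, and the extension names are assumptions (valid extensions are filtered against self.allExt, which is not shown):

converter = ParseConverter()  # hypothetical constructor
converter.insertParses("parses/", "corpus.xml", output="corpus-parsed.xml",
                       parseName="McCC", extensions="ptb,sd")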

Example 15: processCorpus

def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if "No set" not in countsByType:
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif docSet not in newCorpora:
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if tag not in newCorpora:
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Author: jbjorne, Project: TEES, Lines: 57, Source: DivideSets.py
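A hypothetical call: when stem is None, the last path component of outDir becomes the file-name stem, so the call below writes outdir/corpus-train.xml, outdir/corpus-devel.xml, and so on, plus a merged outdir/corpus-devel-and-train.xml (merged-set names are sorted alphabetically):

processCorpus("corpus.xml", "outdir/corpus", mergedSets=[["train", "devel"]])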


Note: The Utils.ElementTreeUtils class examples in this article were compiled by 纯净天空 from open source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open source projects contributed by their respective authors, and copyright remains with the original authors; consult the corresponding project's license before distributing or reusing the code. Do not reproduce this article without permission.