

Python Range.tuplesToCharOffset Method Code Examples

This article collects typical usage examples of the Python method Utils.Range.tuplesToCharOffset. If you are unsure what Range.tuplesToCharOffset does or how to call it, the curated examples below should help. You can also browse further usage examples for the containing module, Utils.Range.


Below are 11 code examples of the Range.tuplesToCharOffset method, collected from open-source projects and ordered by popularity.
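Judging from the examples, tuplesToCharOffset serializes character spans into the offset-string format used by TEES Interaction XML, and charOffsetToTuples parses such strings back. Here is a minimal sketch of the assumed round trip; the behavior is inferred from the usage below, not from Utils.Range documentation:

from Utils import Range

# Assumed behavior, inferred from the examples in this article:
# a span is a (begin, end) tuple, a disjoint offset is a list of spans.
offsets = [(0, 5), (10, 15)]
offsetString = Range.tuplesToCharOffset(offsets)  # presumably "0-5,10-15"
assert Range.charOffsetToTuples(offsetString) == offsets

# Several examples (updateXML, convertChemProt) pass a single
# (begin, end) pair instead of a list, so the function apparently
# accepts both forms.
Range.tuplesToCharOffset((0, 5))  # presumably "0-5"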

Example 1: fixAltOffsets

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Developer: DUT-LiuYang, Project: TEES, Lines: 36, Source: FixAltOffsets.py
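A minimal usage sketch for the function above, with hypothetical file names (fixAltOffsets accepts anything ETUtils.ETFromObj can load, writes the result when output is given, and always returns the ElementTree):

tree = fixAltOffsets("corpus.xml", output="corpus-fixed.xml")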

Example 2: makeEntityElement

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def makeEntityElement(ann, idCount, docEl):
    entEl = ET.Element("entity")
    entEl.set("type", ann.type)
    entEl.set("text", ann.text)
    # identifiers
    protId = docEl.get("id") + ".e" + str(idCount)
    entEl.set("id", protId)
    if ann.id != None:
        entEl.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets
    entEl.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altOffs = []
        for alternativeOffset in ann.alternativeOffsets:
            altOffs.append( str(alternativeOffset[0]) + "-" + str(alternativeOffset[1]-1) ) 
        entEl.set("altOffset", ",".join(altOffs))
    if ann.normalization != None:
        entEl.set("normalization", ann.normalization)
    addExtraToElement(entEl, ann.extra)
    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1": #protein.isName():
        entEl.set("given", "True")
    #else:
    #    entEl.set("given", "False")
    return entEl
Developer: ayoshiaki, Project: TEES, Lines: 28, Source: ConvertXML.py

Example 3: addParseElements

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def addParseElements(doc, docEl):
    if docEl.tag != "sentence":
        return
    sentAnalysesEl = ET.SubElement(docEl, "analyses")
    #parsesEl = ET.SubElement(sentAnalysesEl, "parses")
    parseEl = ET.SubElement(sentAnalysesEl, "parse")
    #tokenizationsEl = ET.SubElement(sentAnalysesEl, "tokenizations")
    tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    tokenMap = {}
    for word in doc.words:
        tokEl = ET.SubElement(tokenizationEl, "token")
        tokEl.set("id", word.id)
        tokEl.set("text", word.text)
        tokEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenMap[word.id] = tokEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        depEl.set("t1", dep.arguments[0].target.id)
        depEl.set("t2", dep.arguments[1].target.id)
        if dep.type.find(":") != -1:
            word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenMap[dep.arguments[0].target.id].set("POS", word1Type)
            tokenMap[dep.arguments[1].target.id].set("POS", word2Type)
Developer: ayoshiaki, Project: TEES, Lines: 33, Source: ConvertXML.py

Example 4: updateXML

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def updateXML(root, removeAnalyses=True):
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
Developer: jbjorne, Project: TEES, Lines: 45, Source: convertPPI.py

Example 5: processElements

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def processElements(xml):
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        updatedCharOffsets = []
        for charOffset in charOffsets:
            updatedCharOffsets.append( (charOffset[0], charOffset[1]+1) )
        entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
Developer: DUT-LiuYang, Project: TEES, Lines: 13, Source: convertDDI13.py
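For context, a hedged sketch of the conversion processElements performs: the DDI 13 source data appears to use inclusive end offsets with ";" between disjoint spans, while TEES 2.0+ uses exclusive ends and "," as the default separator:

# Input format assumed from the rangeSep=";" call above (inclusive ends).
offsets = Range.charOffsetToTuples("0-4;10-14", rangeSep=";")  # [(0, 4), (10, 14)]
updated = [(begin, end + 1) for begin, end in offsets]         # exclusive ends
Range.tuplesToCharOffset(updated)                              # presumably "0-5,10-15"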

Example 6: fixEntities

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0: # flag offsets whose (inclusive) length does not match the entity text
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text").lower(), charOffset[0]+i) # lowercase the needle too, or the search can never match mixed-case text
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset       
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text").lower(), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Developer: DUT-LiuYang, Project: TEES, Lines: 54, Source: convertDDI.py

Example 7: makeDDI13SubmissionFile

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1]-1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")    
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda:defaultdict(lambda:None))
            for interaction in sentence.findall("interaction"):
                # Make the mapping both ways to discard edge directionality. This isn't actually
                # needed, since MultiEdgeExampleBuilder builds entity pairs in the same order as
                # this function, but it does no harm and makes the check direction-independent.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities)-1):
                for j in range(i+1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
Developer: jbjorne, Project: TEES, Lines: 45, Source: DDITools.py
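Note how the entity branch above converts in the opposite direction to Example 5: exclusive-end TEES offsets go back to the DDI task's inclusive-end, ";"-separated format. A hedged sketch with hypothetical values (the field layout is taken from the write calls above):

offsets = Range.charOffsetToTuples("12-19")  # [(12, 19)], exclusive end
offsets = [(b, e - 1) for b, e in offsets]   # back to inclusive ends
# One entity line: sentenceId|offsets|text|type
line = "DDI-DrugBank.d0.s0" + "|" + Range.tuplesToCharOffset(offsets, rangeSep=";") + "|aspirin|drug"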

Example 8: convert

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase-elements
    """
    newMetamapEl = ET.Element("metamap") # make a new metamap element
    utteranceCount = 0
    for utterance in metamapEl.getiterator("Utterance"): # process all utterances (sentences)
        utteranceCount += 1
        #print "UT:", utterance.find("UttText").text
        uttOffsetBegin = int(utterance.find("UttStartPos").text)
        for phrase in utterance.getiterator("Phrase"): # process all phrases for each utterance
            #print "Phrase:", phrase.find("PhraseText").text
            phraseEl = ET.Element("phrase")
            phraseOffset = [int(phrase.find("PhraseStartPos").text), int(phrase.find("PhraseStartPos").text) + int(phrase.find("PhraseLength").text)]
            phraseOffset = [phraseOffset[0] - uttOffsetBegin, phraseOffset[1] - uttOffsetBegin]
            phraseEl.set("charOffset", Range.tuplesToCharOffset(phraseOffset))
            phraseEl.set("text", phrase.find("PhraseText").text)
            for candidate in phrase.getiterator("Candidate"): # process first candidate of each phrase
                phraseEl.set("score", candidate.find("CandidateScore").text)
                phraseEl.set("cui", candidate.find("CandidateCUI").text)
                phraseEl.set("matched", candidate.find("CandidateMatched").text)
                phraseEl.set("preferred", candidate.find("CandidatePreferred").text)
                semTypes = set()
                for semType in candidate.getiterator("SemType"):
                    semTypes.add(semType.text)
                phraseEl.set("semTypes", ",".join(sorted(list(semTypes))))
                sources = set()
                for source in candidate.getiterator("Source"):
                    sources.add(source.text)
                phraseEl.set("sources", ",".join(sorted(list(sources))))
                break
            if phraseEl.get("matched") != None: # include only matched phrases as new elements
                newMetamapEl.append(phraseEl)
            #print ET.tostring(phraseEl, "utf-8")
    
    if utteranceCount > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", utteranceCount, "utterances"
    return newMetamapEl
Developer: DUT-LiuYang, Project: TEES, Lines: 40, Source: MetaMap.py

Example 9: moveElements

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append(newOffset) # use the clamped offset validated above
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # Interactions always go to the sentence of e1, as that is the event they are an argument of.
        # For an intersentence interaction that is a relation, this choice shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
Developer: jbjorne, Project: TEES, Lines: 74, Source: GeniaSentenceSplitter.py
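moveElements assigns an entity to a sentence as soon as any of its spans overlaps the sentence span. A hedged sketch of the assumed Range.overlap semantics (an intersection test on (begin, end) pairs; not verified against Utils.Range):

sentenceOffset = Range.charOffsetToSingleTuple("100-180")
entityOffsets = Range.charOffsetToTuples("150-160,185-190")
# Presumably True here, since the first span lies inside the sentence.
overlaps = any(Range.overlap(sentenceOffset, off) for off in entityOffsets)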

Example 10: extend

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]

#......... some of this function's code is omitted here .........
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if                 
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"), 
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]], 
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
Developer: DUT-LiuYang, Project: TEES, Lines: 104, Source: ExtendTriggers.py

Example 11: convertChemProt

# Required import: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(Settings.URL["CP17_" + setName], downloadDir, currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source":corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id":corpusName + ".d" + str(counts["documents"]), "origId":row["id"], "set":dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get("origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get("origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id":document.get("id") + ".e" + str(len([x for x in document.findall("entity")]))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row["docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
#......... some of this function's code is omitted here .........
Developer: jbjorne, Project: TEES, Lines: 103, Source: convertChemProt.py


Note: The Utils.Range.tuplesToCharOffset examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from community-contributed open-source projects, and copyright remains with the original authors. Consult each project's license before redistributing or using the code; do not republish without permission.