当前位置: 首页>>代码示例>>Python>>正文


Python Range.charOffsetToTuples方法代码示例

本文整理汇总了Python中Utils.Range.charOffsetToTuples方法的典型用法代码示例。如果您正苦于以下问题:Python Range.charOffsetToTuples方法的具体用法?Python Range.charOffsetToTuples怎么用?Python Range.charOffsetToTuples使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Utils.Range的用法示例。


在下文中一共展示了Range.charOffsetToTuples方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: fixAltOffsets

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def fixAltOffsets(input, output=None):
    """
    Rebase entity altOffset attributes from document-level character
    coordinates to sentence-level ones (each sentence's own charOffset
    start becomes the new origin).

    @param input: corpus file name or ElementTree object
    @param output: optional file name for writing the modified corpus
    @return: the corpus ElementTree
    """
    print >> sys.stderr, "Loading corpus", input
    tree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    root = tree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = list(root.getiterator("sentence"))
    progress = ProgressCounter(len(sentences), "FixAltOffsets")
    numFixed = 0
    # Rebase the altOffset spans of every entity in every sentence
    for sentence in sentences:
        progress.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentStart = Range.charOffsetToSingleTuple(sentence.get("charOffset"))[0]
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            spans = Range.charOffsetToTuples(altOffsetString)
            assert len(spans) == 1
            # Shift every span so that it is relative to the sentence start
            shifted = [(begin - sentStart, end - sentStart) for (begin, end) in spans]
            entity.set("altOffset", Range.tuplesToCharOffset(shifted))
            numFixed += 1
    print >> sys.stderr, "Fixed", numFixed, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return tree
开发者ID:DUT-LiuYang,项目名称:TEES,代码行数:36,代码来源:FixAltOffsets.py

示例2: _markNamedEntities

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
 def _markNamedEntities(self):
     """
     Record, for every token, whether it lies inside an entity, whether
     that entity is a _named_ entity, and which entities it heads.
     Named entities are sometimes masked when testing learning of
     interactions, to prevent the system making a trivial decision based
     on commonly interacting names.
     """
     self.tokenIsName = {}
     self.tokenIsEntity = {}
     self.tokenIsEntityHead = {}
     # Start from a clean slate for every token
     for token in self.tokens:
         self.tokenIsName[token] = False
         self.tokenIsEntity[token] = False
         self.tokenIsEntityHead[token] = []
     for entity in self.entities:
         entitySpans = Range.charOffsetToTuples(entity.get("charOffset"))
         headSpan = Range.charOffsetToSingleTuple(entity.get("headOffset"))
         for token in self.tokens:
             tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
             if any(Range.overlap(span, tokenSpan) for span in entitySpans):
                 self.tokenIsEntity[token] = True
                 isName = entity.get("isName")
                 if isName == None:
                     # Entities without an explicit isName attribute count as names
                     entity.set("isName", "True")
                     self.tokenIsName[token] = True
                 elif isName == "True":
                     self.tokenIsName[token] = True
             if Range.overlap(headSpan, tokenSpan):
                 self.tokenIsEntityHead[token].append(entity)
开发者ID:ninjin,项目名称:TEES,代码行数:32,代码来源:SentenceGraph.py

示例3: getAttributes

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def getAttributes(element):
    """
    Copy an element's attributes into a dict, parsing every attribute
    whose name contains "offset" into offset tuples. A single-span offset
    is unwrapped from its list.

    @param element: an ElementTree element
    @return: dict of attribute name to value (offsets as tuples)
    """
    attributes = element.attrib.copy()
    for name in attributes:
        if "offset" not in name.lower():
            continue
        parsed = Range.charOffsetToTuples(attributes[name])
        attributes[name] = parsed[0] if len(parsed) == 1 else parsed
    return attributes
开发者ID:jbjorne,项目名称:TEES,代码行数:11,代码来源:JSONUtils.py

示例4: processElements

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def processElements(xml):
    """
    Normalize a DDI'13 corpus in place: rename "ddi" elements to
    "interaction", mark all entities as given, and convert their
    inclusive character offsets to the end-exclusive TEES 2.0+ format.

    @param xml: corpus element (or tree) to modify in place
    """
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # DDI offsets are inclusive and ";"-separated; TEES 2.0+ uses
        # end-exclusive offsets, so shift each span end forward by one.
        spans = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        spans = [(begin, end + 1) for (begin, end) in spans]
        entity.set("charOffset", Range.tuplesToCharOffset(spans))
开发者ID:DUT-LiuYang,项目名称:TEES,代码行数:13,代码来源:convertDDI13.py

示例5: getEntityHeadToken

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Choose the head token for an entity.

    The head token is searched for inside the entity's headOffset range if
    one is defined, otherwise inside its charOffset range. When several
    tokens overlap the range, findHeadToken disambiguates using the
    precomputed tokenHeadScores.

    @param entity: the entity element whose head token is determined
    @param tokens: the syntactic token elements of the sentence
    @param tokenHeadScores: scores used to pick among candidate tokens
    @return: the selected head token element
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens)==1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
        #if verbose:
        #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    # BUG FIX: the assert previously referenced the undefined name
    # 'entityElement', so a missing head token raised a NameError instead
    # of the intended AssertionError carrying the entity id.
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
开发者ID:ninjin,项目名称:TEES,代码行数:25,代码来源:DetectHeads.py

示例6: updateXML

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def updateXML(root, removeAnalyses=True):
    """
    Upgrade an Interaction XML corpus from the old format to the current one.

    Per sentence this: optionally strips old parse analyses, adds an
    artificial sentence charOffset so that sentences can be exported as a
    single document, converts entity offsets from inclusive (begin,end) to
    end-exclusive (begin,end+1), and turns positive "pair" elements into
    "interaction" elements.

    @param root: corpus root element, modified in place
    @param removeAnalyses: if True, remove old "sentenceanalyses" elements
    @return: the (modified) root element
    """
    stats = defaultdict(int)
    for document in root.findall("document"):
        cursor = 0 # running character position of the current sentence
        stats["documents"] += 1
        for sentence in document.findall("sentence"):
            stats["sentences"] += 1
            # Drop the original parses if requested
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                stats["analyses"] += 1
                if removeAnalyses:
                    stats["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Artificial sentence offset so sentences can be concatenated later
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((cursor, cursor + len(sentenceText))))
            # Update entity offsets from the old (begin,end) format to (begin,end+1)
            for entity in sentence.findall("entity"):
                stats["entities"] += 1
                spans = [(begin, end + 1) for (begin, end) in Range.charOffsetToTuples(entity.get("charOffset"))]
                remaining = entity.get("text")
                for span in spans:
                    stats["entity-offsets"] += 1
                    spanLength = span[1] - span[0]
                    spanText = sentenceText[span[0]:span[1]]
                    # Consume the entity text span by span and verify it matches the sentence
                    eaten, remaining = remaining[:spanLength].strip(), remaining[spanLength:].strip()
                    assert eaten == spanText, (spans, (entity.get("text"), spanText), (eaten, remaining), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(spans))
            # Positive pairs become interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                stats["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    stats["interactions"] += 1
            cursor += len(sentenceText) + 1 # plus one separator character between sentences
    print >> sys.stderr, "Updated Interaction XML format:", dict(stats)
    return root
开发者ID:jbjorne,项目名称:TEES,代码行数:45,代码来源:convertPPI.py

示例7: makeDDI13SubmissionFile

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    """
    Write a DDI'13 shared task submission file from an Interaction XML corpus.

    In "entities" mode one line per non-negative entity is written; in
    "interactions" mode one line per unordered entity pair, flagged with
    1 and the interaction type for predicted interactions, 0|null otherwise.

    @param input: corpus file name or ElementTree object
    @param output: path of the submission file to write
    @param mode: "entities" or "interactions"
    @param idfilter: if given, only sentences whose id contains it are used
    """
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") == "neg":
                    continue
                # DDI expects inclusive offsets, so shift each span end back by one
                spans = [(begin, end - 1) for (begin, end) in Range.charOffsetToTuples(entity.get("charOffset"))]
                fields = [sentenceId,
                          Range.tuplesToCharOffset(spans, rangeSep=";"),
                          entity.get("text"),
                          entity.get("type")]
                outFile.write("|".join(fields) + "\n")
        if mode == "interactions":
            # First determine which pairs interact. The mapping is made both
            # ways to discard edge directionality; MultiEdgeExampleBuilder
            # builds entity pairs in the same order as this function, so this
            # is not strictly needed, but it is harmless and makes the lookup
            # work regardless of pair direction.
            intMap = defaultdict(lambda:defaultdict(lambda:None))
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write one line per unordered entity pair
            entities = sentence.findall("entity")
            for first in range(0, len(entities)-1):
                for second in range(first+1, len(entities)):
                    firstId = entities[first].get("id")
                    secondId = entities[second].get("id")
                    outFile.write(sentenceId + "|" + firstId + "|" + secondId + "|")
                    interaction = intMap[firstId][secondId]
                    if interaction != None:
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
开发者ID:jbjorne,项目名称:TEES,代码行数:45,代码来源:DDITools.py

示例8: moveElements

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def moveElements(document):
    """
    Distribute document-level entity and interaction elements into the
    document's sentence elements (used after sentence splitting).

    Entities are placed into the first sentence whose span overlaps one of
    their character offsets; their ids become sentence-scoped and their
    charOffsets are rebased to the sentence start (the original offset is
    preserved in the "origOffset" attribute). Interactions follow their e1
    entity's sentence, and "siteOf" references are remapped to the new
    interaction ids.

    @param document: a document element, modified in place
    @raise Exception: if some entity overlaps no sentence at all
    """
    entMap = {}  # old entity id -> new sentence-scoped entity id
    entSentence = {}  # old entity id -> sentence element the entity moved into
    entSentenceIndex = {}  # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            # An entity belongs to this sentence if any of its spans overlaps it
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # The id already ends in an entity-style part: reuse it
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # Otherwise preserve the old id in "docId" and number the entity
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                # Rebase every span to the sentence start, clamping negatives to
                # zero and dropping spans that fall entirely outside the sentence
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    # Any entity left at document level overlapped no sentence
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    # Second pass: siteOf points at interaction ids, which have all changed
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
开发者ID:jbjorne,项目名称:TEES,代码行数:74,代码来源:GeniaSentenceSplitter.py

示例9: extend

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose: print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0,0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if                 
            # Insert changed charOffset
#.........这里部分代码省略.........
开发者ID:DUT-LiuYang,项目名称:TEES,代码行数:103,代码来源:ExtendTriggers.py

示例10: addEntitiesToSTDoc

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
def addEntitiesToSTDoc(doc, docElement, tMap, eMap, entityElementMap, useOrigIds=False):
    """
    Convert the entity elements of an Interaction XML document into Shared
    Task annotations and attach them to the ST document.

    Entities with given="True" become protein annotations; all others
    become trigger annotations (deduplicated), and those also flagged
    event="True" additionally produce events. Offsets of sentence-level
    entities are rebased from sentence to document coordinates.

    @param doc: the ST document receiving proteins/triggers/events
    @param docElement: the Interaction XML document element to read
    @param tMap: out-parameter, entity id -> protein/trigger Annotation
    @param eMap: out-parameter, entity id -> created event
    @param entityElementMap: out-parameter, entity id -> entity element
    @param useOrigIds: if True, reuse origId-derived annotation ids
    """
    # Entities can live directly under the document or inside its sentences
    containerElements = [docElement] + [x for x in docElement.getiterator("sentence")]
    for containerElement in containerElements:
        for entity in containerElement.findall("entity"):
            eType = entity.get("type")
            if eType == "neg": # skip negative predictions if they are present
                continue
            assert entity.get("id") != None
            entityElementMap[entity.get("id")] = entity
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1: # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".",1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E": # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            if entity.get("normalization") != None:
                ann.normalization = entity.get("normalization")
            #assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charOffsets = entityOffsets
            #ann.charBegin = entityOffset[0]
            #ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
            if containerElement.tag == "sentence": # entity offset is relative to the container element, and for sentences, they can be relative to the document
                sentenceOffset = Range.charOffsetToSingleTuple(containerElement.get("charOffset"))
                for i in range(len(ann.charOffsets)):
                    ann.charOffsets[i] = (ann.charOffsets[i][0] + sentenceOffset[0], ann.charOffsets[i][1] + sentenceOffset[0]) 
                #ann.charBegin += sentenceOffset[0]
                #ann.charEnd += sentenceOffset[0]
#            idStem = entity.get("id").split(".e", 1)[0]
#            if sentenceOffsets.has_key(idStem):
#                sentenceOffset = sentenceOffsets[idStem]
#                ann.charBegin += sentenceOffset[0]
#                ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            ann.extra = getExtraFromElement(entity) # add all scores and extra data
            if entity.get("given") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    # Sanity-check that the id follows the T-number style: one
                    # uppercase letter followed by digits only
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                doc.proteins.append(ann)
                tMap[entity.get("id")] = ann
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done 
                #if entity.get("origId") != None:
                #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #        ann.id = nonNamedEntityOrigId
                #stDoc.proteins.append(ann)
            else: # a predicted protein or trigger
                duplicateAnn = findDuplicateForSTTrigger(ann, doc.triggers)
                if duplicateAnn == None:
                    doc.triggers.append(ann)
                    tMap[entity.get("id")] = ann
                    # Add confidence scores
                    #ann.extra = getExtraFromElement(entity, ["conf"])
                    #ann.triggerScores = entity.get("predictions")
                    #ann.unmergingScores = entity.get("umStrength")
                    #ann.speculationScores = entity.get("modPred")
                    #ann.negationScores = entity.get("modPred")
                    # Events with 0 interactions (such as some Process-type events) would not be formed when constructing events based on interactions
                    if entity.get("event") == "True":
                        event = makeSTEvent(ann, entityElementMap[entity.get("id")])
                        eMap[entity.get("id")] = event
                        doc.events.append(event)
                else: # a duplicate trigger already exists
                    tMap[entity.get("id")] = duplicateAnn
开发者ID:ayoshiaki,项目名称:TEES,代码行数:82,代码来源:ConvertXML.py

示例11: mapEntity

# 需要导入模块: from Utils import Range [as 别名]
# 或者: from Utils.Range import charOffsetToTuples [as 别名]
 def mapEntity(self, entityElement, verbose=False):
     """
     Determine the head token for a named entity or trigger. The head token is the token closest
     to the root for the subtree of the dependency parse spanned by the text of the element.
     As a side effect, the entity element's "headOffset" attribute is set
     (or updated) to the chosen token's charOffset, and the token is
     registered in self.entitiesByToken.
     
     @param entityElement: a semantic node (trigger or named entity)
     @type entityElement: cElementTree.Element
     @param verbose: Print selected head tokens on screen
     @param verbose: boolean
     @return: the chosen head token element, or None if no token matched
     """
     headOffset = None
     if entityElement.get("headOffset") != None:
         headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
     if entityElement.get("charOffset") != "":
         charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
     else:
         charOffsets = []
     # Each entity can consist of multiple syntactic tokens, covered by its
     # charOffset-range. One of these must be chosen as the head token.
     headTokens = [] # potential head tokens
     for token in self.tokens:
         #print token.attrib["id"], token.attrib["charOffset"]
         tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
         if headOffset != None and entityElement.get("type") != "Binding":
             # A head token can already be defined in the headOffset-attribute.
             # However, depending on the tokenization, even this range may
             # contain multiple tokens. Still, it can always be assumed that
             # if headOffset is defined, the corret head token is in this range.
             if Range.overlap(headOffset,tokenOffset):
                 headTokens.append(token)
         else:
             for offset in charOffsets:
                 if Range.overlap(offset,tokenOffset):
                     headTokens.append(token)
     if len(headTokens)==1: # An unambiguous head token was found
         token = headTokens[0]
     else: # One head token must be chosen from the candidates
         selHead = None
         if entityElement.get("type") == "Binding":
             # For Binding events, prefer a candidate whose text suggests
             # binding (e.g. "bind", "complex") over the parse-based choice
             for t in headTokens:
                 compText = t.get("text").lower()
                 if compText.find("bind") != -1 or compText.find("complex") != -1:
                     selHead = t
                     #print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
                     entityElement.set("headOffset", selHead.get("charOffset"))
                     break
         if selHead == None: 
             token = self.findHeadToken(headTokens)
         else:
             token = selHead
         if verbose:
             print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
     #assert token != None, entityElement.get("id")
     if token != None:
         # The ElementTree entity-element is modified by setting the headOffset attribute
         if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
             entityElement.set("headOffset", token.get("charOffset"))
         if not self.entitiesByToken.has_key(token):
             self.entitiesByToken[token] = []
         self.entitiesByToken[token].append(entityElement)
     else:
         print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
     return token
开发者ID:ninjin,项目名称:TEES,代码行数:65,代码来源:SentenceGraph.py


注:本文中的Utils.Range.charOffsetToTuples方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。