This article collects typical usage examples of the Python method Utils.Range.charOffsetToTuples. If you are unsure what Range.charOffsetToTuples does, how to use it, or want to see it in real code, the curated method examples below may help. You can also read further about its containing module, Utils.Range.
The following presents 11 code examples of the Range.charOffsetToTuples method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
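Before the examples, here is a minimal sketch of what Range.charOffsetToTuples and its inverse Range.tuplesToCharOffset appear to do, inferred from how the examples below call them (an assumption, not the actual TEES source): they convert between a character-offset string such as "12-18,22-25" and a list of (begin, end) integer tuples, with an optional rangeSep argument for alternate separators such as ";".

def charOffsetToTuples(charOffset, rangeSep=","):
    # Sketch only; assumes non-negative offsets. "12-18,22-25" -> [(12, 18), (22, 25)]
    tuples = []
    for span in charOffset.split(rangeSep):
        begin, end = span.split("-")
        tuples.append((int(begin), int(end)))
    return tuples

def tuplesToCharOffset(tuples, rangeSep=","):
    # Sketch only. [(12, 18), (22, 25)] -> "12-18,22-25"; some call sites below
    # (see updateXML in Example 6) pass a single (begin, end) tuple, so accept that too.
    if len(tuples) > 0 and isinstance(tuples[0], int):
        tuples = [tuples]
    return rangeSep.join(str(t[0]) + "-" + str(t[1]) for t in tuples)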
Example 1: fixAltOffsets
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def fixAltOffsets(input, output=None):
print >> sys.stderr, "Loading corpus", input
corpusTree = ETUtils.ETFromObj(input)
print >> sys.stderr, "Corpus file loaded"
corpusRoot = corpusTree.getroot()
docCount = 0
sentencesCreated = 0
sentences = [x for x in corpusRoot.getiterator("sentence")]
counter = ProgressCounter(len(sentences), "FixAltOffsets")
fixCount = 0
# fix spans
for sentence in sentences:
counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
for entity in sentence.findall("entity"):
altOffsetString = entity.get("altOffset")
if altOffsetString == None:
continue
#print altOffsetString
altOffsets = Range.charOffsetToTuples(altOffsetString)
assert len(altOffsets) == 1
for i in range(len(altOffsets)):
altOffset = altOffsets[i]
altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
fixCount += 1
print >> sys.stderr, "Fixed", fixCount, "altOffsets"
if output != None:
print >> sys.stderr, "Writing output to", output
ETUtils.write(corpusRoot, output)
return corpusTree
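To make the offset arithmetic above concrete, here is the same altOffset fix on hypothetical values: subtracting the sentence's document-level start position turns a document-relative offset into a sentence-relative one.

sentOffset = (100, 150)   # sentence.get("charOffset") == "100-150"
altOffset = (110, 120)    # entity.get("altOffset") == "110-120"
fixed = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
assert fixed == (10, 20)  # now relative to the sentence start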
Example 2: _markNamedEntities
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def _markNamedEntities(self):
"""
This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing the learning of interactions, to
    prevent the system from making trivial decisions based on commonly interacting names.
"""
self.tokenIsName = {}
self.tokenIsEntity = {}
self.tokenIsEntityHead = {}
# Initialize the dictionaries
for token in self.tokens:
self.tokenIsName[token] = False
self.tokenIsEntity[token] = False
self.tokenIsEntityHead[token] = []
for entity in self.entities:
entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
for token in self.tokens:
tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
for entityOffset in entityOffsets:
if Range.overlap(entityOffset, tokenOffset):
self.tokenIsEntity[token] = True
if entity.get("isName") != None:
if entity.get("isName") == "True":
self.tokenIsName[token] = True
else:
entity.set("isName", "True")
self.tokenIsName[token] = True
if Range.overlap(entityHeadOffset, tokenOffset):
self.tokenIsEntityHead[token].append(entity)
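Range.overlap is used here and in several later examples to match entities to tokens. A plausible sketch of its semantics for (begin, end) spans is below; this is an assumption rather than the verified TEES implementation, and the exact boundary test depends on whether offsets are end-inclusive (pre-2.0 format) or end-exclusive (2.0+).

def overlap(a, b):
    # True if spans a and b share at least one character position,
    # assuming end-exclusive spans; an end-inclusive format would use <=.
    return a[0] < b[1] and b[0] < a[1]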
Example 3: getAttributes
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def getAttributes(element):
attrib = element.attrib.copy()
#attrib[TAGKEY] = element.tag
for key in attrib:
if "offset" in key.lower():
attrib[key] = Range.charOffsetToTuples(attrib[key])
if len(attrib[key]) == 1:
attrib[key] = attrib[key][0]
return attrib
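Hypothetical usage, for an element like <entity id="x.e0" charOffset="5-9" headOffset="5-9,12-15"/>: every attribute whose name contains "offset" is parsed into tuples, and a single-span offset is unwrapped from its one-element list.

attrib = getAttributes(entity)
# {"id": "x.e0", "charOffset": (5, 9), "headOffset": [(5, 9), (12, 15)]}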
Example 4: processElements
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def processElements(xml):
for ddi in xml.getiterator("ddi"):
ddi.tag = "interaction"
for entity in xml.getiterator("entity"):
entity.set("given", "True")
# Reformat disjoint character offsets and update character range format for TEES 2.0+
charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
updatedCharOffsets = []
for charOffset in charOffsets:
updatedCharOffsets.append( (charOffset[0], charOffset[1]+1) )
entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
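A worked illustration of the update above: the pre-2.0 format used ";"-separated, end-inclusive spans, while the 2.0+ format is end-exclusive, so each end position is incremented by one (assuming "," is the default output separator).

old = Range.charOffsetToTuples("0-4;10-14", rangeSep=";")  # [(0, 4), (10, 14)], end-inclusive
new = [(begin, end + 1) for (begin, end) in old]           # [(0, 5), (10, 15)], end-exclusive
print Range.tuplesToCharOffset(new)                        # "0-5,10-15"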
Example 5: getEntityHeadToken
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def getEntityHeadToken(entity, tokens, tokenHeadScores):
if entity.get("headOffset") != None:
charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
elif entity.get("charOffset") != "":
charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
else:
charOffsets = []
# Each entity can consist of multiple syntactic tokens, covered by its
# charOffset-range. One of these must be chosen as the head token.
headTokens = [] # potential head tokens
for token in tokens:
tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
for offset in charOffsets:
if Range.overlap(offset, tokenOffset):
headTokens.append(token)
if len(headTokens)==1: # An unambiguous head token was found
selectedHeadToken = headTokens[0]
else: # One head token must be chosen from the candidates
selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
#if verbose:
# print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    assert selectedHeadToken != None, entity.get("id")
return selectedHeadToken
Example 6: updateXML
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def updateXML(root, removeAnalyses=True):
counts = defaultdict(int)
for document in root.findall("document"):
sentencePos = 0
counts["documents"] += 1
for sentence in document.findall("sentence"):
counts["sentences"] += 1
# Remove the original parses
analyses = sentence.find("sentenceanalyses")
if analyses != None:
counts["analyses"] += 1
if removeAnalyses:
counts["removed-analyses"] += 1
sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
sentenceText = sentence.get("text")
sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
# Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
for entity in sentence.findall("entity"):
counts["entities"] += 1
offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
entityText = entity.get("text")
for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
counts["entity-offsets"] += 1
lenOffset = offset[1] - offset[0]
offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
entity.set("charOffset", Range.tuplesToCharOffset(offsets))
# Convert positive pairs into interaction elements
numInteractions = 0
for pair in sentence.findall("pair"):
counts["pairs"] += 1
sentence.remove(pair)
if pair.get("interaction") == "True":
del pair.attrib["interaction"]
pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
pair.set("type", "PPI")
ET.SubElement(sentence, "interaction", pair.attrib)
numInteractions += 1
counts["interactions"] += 1
sentencePos += len(sentenceText) + 1
print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
return root
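The span-by-span verification inside the entity loop above can be illustrated with hypothetical values: each disjoint offset is checked against the sentence text while the entity text is consumed from the front.

sentenceText = "AB and CD interact"
offsets = [(0, 2), (7, 9)]   # a disjoint two-span entity
entityText = "AB CD"         # entity.get("text")
for offset in offsets:
    entitySpan = sentenceText[offset[0]:offset[1]]
    lenOffset = offset[1] - offset[0]
    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
    assert offsetText == entitySpan  # "AB" == "AB", then "CD" == "CD"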
Example 7: makeDDI13SubmissionFile
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
xml = ETUtils.ETFromObj(input)
outFile = open(output, "wt")
for sentence in xml.getiterator("sentence"):
sentenceId = sentence.get("id")
if idfilter != None and idfilter not in sentenceId:
continue
# Output entities
if mode == "entities":
for entity in sentence.findall("entity"):
if entity.get("type") != "neg":
outFile.write(sentenceId)
offsets = Range.charOffsetToTuples(entity.get("charOffset"))
for i in range(len(offsets)):
offsets[i] = (offsets[i][0], offsets[i][1]-1)
outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
outFile.write("|" + entity.get("text"))
outFile.write("|" + entity.get("type"))
outFile.write("\n")
if mode == "interactions":
# First determine which pairs interact
intMap = defaultdict(lambda:defaultdict(lambda:None))
for interaction in sentence.findall("interaction"):
                # Map both ways to discard edge directionality. This isn't strictly needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this
                # function, but it does no harm and makes the lookup work regardless of pair direction.
if interaction.get("type") != "neg" and interaction.get("given") != "True":
intMap[interaction.get("e1")][interaction.get("e2")] = interaction
intMap[interaction.get("e2")][interaction.get("e1")] = interaction
# Then write all pairs to the output file
entities = sentence.findall("entity")
for i in range(0, len(entities)-1):
for j in range(i+1, len(entities)):
eIId = entities[i].get("id")
eJId = entities[j].get("id")
outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
if intMap[eIId][eJId] != None:
interaction = intMap[eIId][eJId]
assert interaction.get("type") != "neg"
outFile.write("1|" + interaction.get("type") + "\n")
else:
outFile.write("0|null\n")
outFile.close()
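For reference, the entity mode above emits one pipe-separated line per entity (with end-inclusive, ";"-separated offsets), and the interaction mode one line per entity pair; with hypothetical identifiers the output would look like:

DDI-DrugBank.d0.s0|0-6|aspirin|drug
DDI-DrugBank.d0.s0|DDI-DrugBank.d0.s0.e0|DDI-DrugBank.d0.s0.e1|1|effect
DDI-DrugBank.d0.s0|DDI-DrugBank.d0.s0.e0|DDI-DrugBank.d0.s0.e2|0|null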
Example 8: moveElements
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def moveElements(document):
entMap = {}
entSentence = {}
entSentenceIndex = {}
sentences = document.findall("sentence")
sentenceCount = 0
for sentence in sentences:
sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
# Move entities
entCount = 0
for entity in document.findall("entity"):
entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
overlaps = False
for entityOffset in entityOffsets:
if Range.overlap(sentenceOffset, entityOffset):
overlaps = True
break
if overlaps:
document.remove(entity)
sentence.append(entity)
entityId = entity.get("id")
entityIdLastPart = entityId.rsplit(".", 1)[-1]
if entityIdLastPart.startswith("e"):
entity.set("id", sentence.get("id") + "." + entityIdLastPart)
entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
else:
entity.set("docId", entityId)
entity.set("id", sentence.get("id") + ".e" + str(entCount))
entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
entSentence[entityId] = sentence
entSentenceIndex[entityId] = sentenceCount
#newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
newEntityOffsets = []
for entityOffset in entityOffsets:
newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
if newOffset != (0, 0):
assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
entity.set("origOffset", entity.get("charOffset"))
#entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
entCount += 1
sentenceCount += 1
if len([x for x in document.findall("entity")]) != 0:
raise Exception("Sentence splitting does not cover the entire document")
# Move interactions
intCount = 0
interactions = []
interactionOldToNewId = {}
for interaction in document.findall("interaction"):
interactions.append(interaction)
#if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
# targetSentence = entSentence[interaction.get("e1")]
#else:
# targetSentence = entSentence[interaction.get("e2")]
        # Interactions are always assigned to a sentence via e1, as that is the event
        # they are an argument of. For intersentence interactions that are relations,
        # the choice of sentence shouldn't matter.
targetSentence = entSentence[interaction.get("e1")]
document.remove(interaction)
targetSentence.append(interaction)
newId = targetSentence.get("id") + ".i" + str(intCount)
interactionOldToNewId[interaction.get("id")] = newId
interaction.set("id", newId)
interaction.set("e1", entMap[interaction.get("e1")])
interaction.set("e2", entMap[interaction.get("e2")])
intCount += 1
for interaction in interactions:
if interaction.get("siteOf") != None:
interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
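The id remapping performed above can be traced with hypothetical ids:

# before: entity id = "Doc.d0.e5",   sentence id = "Doc.d0.s1"
# after:  entity id = "Doc.d0.s1.e5" (the "e5" part is kept because it starts with "e")
# entMap["Doc.d0.e5"] == "Doc.d0.s1.e5", so interaction e1/e2 references can be rewritten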
Example 9: extend
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
if not (ET.iselement(input) and input.tag == "sentence"):
print >> sys.stderr, "Loading corpus file", input
corpusTree = ETUtils.ETFromObj(input)
corpusRoot = corpusTree.getroot()
bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
if not (ET.iselement(input) and input.tag == "sentence"):
sentences = corpusRoot.getiterator("sentence")
else:
sentences = [input]
counts = defaultdict(int)
for sentence in sentences:
incorrectCount = 0
sentenceText = sentence.get("text")
tokens = tokenize(sentenceText)
for entity in sentence.findall("entity"):
counts["all-entities"] += 1
if entity.get("type") not in entityTypes:
continue
headOffset = entity.get("headOffset")
if headOffset == None:
if verbose: print "WARNING, no head offset for entity", entity.get("id")
headOffset = entity.get("charOffset")
headOffset = Range.charOffsetToTuples(headOffset)[0]
charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no character offset for entity " + str(entity.get("id"))
charOffset = Range.charOffsetToTuples(charOffset)[0]
tokPos = [0,0]
tokIndex = None
# find main token
for i in range(len(tokens)):
token = tokens[i]
tokPos[1] = tokPos[0] + len(token) # - 1
if Range.overlap(headOffset, tokPos):
tokIndex = i
break
tokPos[0] += len(token)
assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
skip = False
if tokPos[0] < headOffset[0]:
tokPos = headOffset
skip = True
if not skip:
# Extend before
beginIndex = tokIndex
for i in range(tokIndex-1, -1, -1):
token = tokens[i]
if token.isspace():
continue
if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
beginIndex = i + 1
break
if i == 0:
beginIndex = i
while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
beginIndex += 1
if beginIndex >= tokIndex:
beginIndex = tokIndex
break
# Extend after
endIndex = tokIndex
if tokens[tokIndex][-1] != ",":
endIndex = tokIndex
for i in range(tokIndex+1, len(tokens)):
token = tokens[i]
if token.isspace():
continue
if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
endIndex = i - 1
break
if i == len(tokens) - 1:
endIndex = i
while tokens[endIndex].isspace():
endIndex -= 1
# Modify range
if tokIndex > beginIndex:
for token in reversed(tokens[beginIndex:tokIndex]):
tokPos[0] -= len(token)
if tokIndex < endIndex:
for token in tokens[tokIndex+1:endIndex+1]:
tokPos[1] += len(token)
# Attempt to remove trailing periods and commas
while not sentenceText[tokPos[1] - 1].isalnum():
tokPos[1] -= 1
if tokPos[1] < tokPos[0] + 1:
tokPos[1] = tokPos[0] + 1
break
while not sentenceText[tokPos[0]].isalnum():
tokPos[0] += 1
if tokPos[0] >= tokPos[1]:
tokPos[0] = tokPos[1] - 1
break
# Split merged names
#newPos = [tokPos[0], tokPos[1]]
#for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
# newPos[0] += len(split)
# if
# Insert changed charOffset
#......... some of this function's code is omitted here .........
Example 10: addEntitiesToSTDoc
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def addEntitiesToSTDoc(doc, docElement, tMap, eMap, entityElementMap, useOrigIds=False):
containerElements = [docElement] + [x for x in docElement.getiterator("sentence")]
for containerElement in containerElements:
for entity in containerElement.findall("entity"):
eType = entity.get("type")
if eType == "neg": # skip negative predictions if they are present
continue
assert entity.get("id") != None
entityElementMap[entity.get("id")] = entity
entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
ann = Annotation()
ann.type = eType
if useOrigIds:
entityOrigId = entity.get("origId")
if entityOrigId != None and entityOrigId.find(".") != -1: # fix gluing of doc and ann id
entityOrigId = entityOrigId.rsplit(".",1)[-1]
if entityOrigId != None:
if entityOrigId[0] == "E": # a special id denoting a numbered, but triggerless event
ann.eventId = entityOrigId
ann.id = None
else:
ann.id = entityOrigId
ann.text = entity.get("text")
if entity.get("normalization") != None:
ann.normalization = entity.get("normalization")
#assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
ann.charOffsets = entityOffsets
#ann.charBegin = entityOffset[0]
#ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
            if containerElement.tag == "sentence": # entity offsets are relative to their container element; for sentences, convert them to document-level offsets
sentenceOffset = Range.charOffsetToSingleTuple(containerElement.get("charOffset"))
for i in range(len(ann.charOffsets)):
ann.charOffsets[i] = (ann.charOffsets[i][0] + sentenceOffset[0], ann.charOffsets[i][1] + sentenceOffset[0])
#ann.charBegin += sentenceOffset[0]
#ann.charEnd += sentenceOffset[0]
# idStem = entity.get("id").split(".e", 1)[0]
# if sentenceOffsets.has_key(idStem):
# sentenceOffset = sentenceOffsets[idStem]
# ann.charBegin += sentenceOffset[0]
# ann.charEnd += sentenceOffset[0]
if entity.get("speculation") == "True":
ann.speculation = True
if entity.get("negation") == "True":
ann.negation = True
ann.extra = getExtraFromElement(entity) # add all scores and extra data
if entity.get("given") == "True":
# Remember to use original id for names!
if entity.get("origId") != None:
ann.id = entity.get("origId").rsplit(".", 1)[-1]
assert ann.id[0].isupper(), ann.id
for c in ann.id[1:]:
assert c.isdigit(), ann.id
doc.proteins.append(ann)
tMap[entity.get("id")] = ann
                # The commented-out code below is risky; incompatibilities should instead
                # be handled by not converting to the shared task format at all when the
                # conversion cannot be done correctly
#if entity.get("origId") != None:
# # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
# nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
# if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
# ann.id = nonNamedEntityOrigId
#stDoc.proteins.append(ann)
else: # a predicted protein or trigger
duplicateAnn = findDuplicateForSTTrigger(ann, doc.triggers)
if duplicateAnn == None:
doc.triggers.append(ann)
tMap[entity.get("id")] = ann
# Add confidence scores
#ann.extra = getExtraFromElement(entity, ["conf"])
#ann.triggerScores = entity.get("predictions")
#ann.unmergingScores = entity.get("umStrength")
#ann.speculationScores = entity.get("modPred")
#ann.negationScores = entity.get("modPred")
# Events with 0 interactions (such as some Process-type events) would not be formed when constructing events based on interactions
if entity.get("event") == "True":
event = makeSTEvent(ann, entityElementMap[entity.get("id")])
eMap[entity.get("id")] = event
doc.events.append(event)
else: # a duplicate trigger already exists
tMap[entity.get("id")] = duplicateAnn
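The Annotation objects collected into doc.proteins and doc.triggers correspond to BioNLP Shared Task standoff entries; for illustration (hypothetical values), a protein line in the .a1 file and a trigger line in the .a2 file would look like:

T1	Protein 0 5	BMP-2
T7	Positive_regulation 27 36	activates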
Example 11: mapEntity
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import charOffsetToTuples [as alias]
def mapEntity(self, entityElement, verbose=False):
"""
Determine the head token for a named entity or trigger. The head token is the token closest
to the root for the subtree of the dependency parse spanned by the text of the element.
@param entityElement: a semantic node (trigger or named entity)
@type entityElement: cElementTree.Element
@param verbose: Print selected head tokens on screen
    @type verbose: boolean
"""
headOffset = None
if entityElement.get("headOffset") != None:
headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
if entityElement.get("charOffset") != "":
charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
else:
charOffsets = []
# Each entity can consist of multiple syntactic tokens, covered by its
# charOffset-range. One of these must be chosen as the head token.
headTokens = [] # potential head tokens
for token in self.tokens:
#print token.attrib["id"], token.attrib["charOffset"]
tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
if headOffset != None and entityElement.get("type") != "Binding":
            # A head token may already be defined in the headOffset attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
if Range.overlap(headOffset,tokenOffset):
headTokens.append(token)
else:
for offset in charOffsets:
if Range.overlap(offset,tokenOffset):
headTokens.append(token)
if len(headTokens)==1: # An unambiguous head token was found
token = headTokens[0]
else: # One head token must be chosen from the candidates
selHead = None
if entityElement.get("type") == "Binding":
for t in headTokens:
compText = t.get("text").lower()
if compText.find("bind") != -1 or compText.find("complex") != -1:
selHead = t
#print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
entityElement.set("headOffset", selHead.get("charOffset"))
break
if selHead == None:
token = self.findHeadToken(headTokens)
else:
token = selHead
if verbose:
print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
#assert token != None, entityElement.get("id")
if token != None:
# The ElementTree entity-element is modified by setting the headOffset attribute
if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
entityElement.set("headOffset", token.get("charOffset"))
if not self.entitiesByToken.has_key(token):
self.entitiesByToken[token] = []
self.entitiesByToken[token].append(entityElement)
else:
print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
return token