This article collects typical usage examples of the Python method Utils.Range.tuplesToCharOffset. If you are wondering what Range.tuplesToCharOffset does, how to use it, or what real calls to it look like, the curated examples below should help. You can also explore further usage examples from the containing module, Utils.Range.
The following presents 11 code examples of the Range.tuplesToCharOffset method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
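Before the examples, here is a minimal sketch of the two helpers the examples revolve around. This is not the TEES source; it is an approximation inferred from the call sites below: a charOffset string is a list of "begin-end" spans joined by rangeSep ("," by default), and tuplesToCharOffset accepts either a single (begin, end) pair or a sequence of pairs.
def tuplesToCharOffset(offsets, rangeSep=","):
    # Accept either a single (begin, end) pair or a sequence of pairs
    if len(offsets) > 0 and isinstance(offsets[0], int):
        offsets = [offsets]
    return rangeSep.join("%d-%d" % (begin, end) for (begin, end) in offsets)

def charOffsetToTuples(charOffset, rangeSep=","):
    # Inverse of tuplesToCharOffset: "0-5,10-14" -> [(0, 5), (10, 14)]
    return [tuple(int(x) for x in span.split("-")) for span in charOffset.split(rangeSep)]
For example, tuplesToCharOffset([(0, 5), (10, 14)]) returns "0-5,10-14", and tuplesToCharOffset((3, 9)) returns "3-9".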
Example 1: fixAltOffsets
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
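A concrete trace of the offset arithmetic above, with made-up values: for a sentence whose charOffset is "100-150" and an entity whose altOffset is "110-120", the loop rewrites the altOffset into sentence-relative coordinates.
# Hypothetical values, following the code above
sentOffset = Range.charOffsetToSingleTuple("100-150")   # (100, 150)
altOffsets = Range.charOffsetToTuples("110-120")        # [(110, 120)]
altOffsets[0] = (altOffsets[0][0] - sentOffset[0], altOffsets[0][1] - sentOffset[0])
print Range.tuplesToCharOffset(altOffsets)              # prints "10-20"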
Example 2: makeEntityElement
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def makeEntityElement(ann, idCount, docEl):
    entEl = ET.Element("entity")
    entEl.set("type", ann.type)
    entEl.set("text", ann.text)
    # identifiers
    protId = docEl.get("id") + ".e" + str(idCount)
    entEl.set("id", protId)
    if ann.id != None:
        entEl.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets
    entEl.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altOffs = []
        for alternativeOffset in ann.alternativeOffsets:
            altOffs.append( str(alternativeOffset[0]) + "-" + str(alternativeOffset[1]-1) )
        entEl.set("altOffset", ",".join(altOffs))
    if ann.normalization != None:
        entEl.set("normalization", ann.normalization)
    addExtraToElement(entEl, ann.extra)
    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1": #protein.isName():
        entEl.set("given", "True")
    #else:
    #    entEl.set("given", "False")
    return entEl
Example 3: addParseElements
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def addParseElements(doc, docEl):
    if docEl.tag != "sentence":
        return
    sentAnalysesEl = ET.SubElement(docEl, "analyses")
    #parsesEl = ET.SubElement(sentAnalysesEl, "parses")
    parseEl = ET.SubElement(sentAnalysesEl, "parse")
    #tokenizationsEl = ET.SubElement(sentAnalysesEl, "tokenizations")
    tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    tokenMap = {}
    for word in doc.words:
        tokEl = ET.SubElement(tokenizationEl, "token")
        tokEl.set("id", word.id)
        tokEl.set("text", word.text)
        tokEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenMap[word.id] = tokEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        depEl.set("t1", dep.arguments[0].target.id)
        depEl.set("t2", dep.arguments[1].target.id)
        if dep.type.find(":") != -1:
            word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenMap[dep.arguments[0].target.id].set("POS", word1Type)
            tokenMap[dep.arguments[1].target.id].set("POS", word2Type)
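The last three lines above recover part-of-speech tags from the gold dependency type string. With a hypothetical type such as "dep:NN-VB(extra)" (the real strings come from the corpus's gold parses), the split chain behaves like this:
depType = "dep:NN-VB(extra)"                            # hypothetical gold dependency type
base = depType.split("(")[0]                            # "dep:NN-VB"
word1Type, word2Type = base.split(":")[-1].split("-")   # "NN", "VB"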
Example 4: updateXML
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def updateXML(root, removeAnalyses=True):
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
Example 5: processElements
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def processElements(xml):
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        updatedCharOffsets = []
        for charOffset in charOffsets:
            updatedCharOffsets.append( (charOffset[0], charOffset[1]+1) )
        entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
Example 6: fixEntities
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Example 7: makeDDI13SubmissionFile
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1]-1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda:defaultdict(lambda:None))
            for interaction in sentence.findall("interaction"):
                # Make the mapping both ways to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it shouldn't hurt to include it, and this way the lookup works regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities)-1):
                for j in range(i+1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
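In "entities" mode each output row is pipe-separated: sentence id, ";"-joined offsets converted back to inclusive ends, entity text, and entity type; in "interactions" mode each candidate pair ends in either "1|<type>" or "0|null". A row of the first kind, with invented ids and spans, would look like:
DDI-DrugBank.d10.s0|0-8|ibuprofen|drug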
Example 8: convert
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase elements
    """
    newMetamapEl = ET.Element("metamap") # make a new metamap element
    utteranceCount = 0
    for utterance in metamapEl.getiterator("Utterance"): # process all utterances (sentences)
        utteranceCount += 1
        #print "UT:", utterance.find("UttText").text
        uttOffsetBegin = int(utterance.find("UttStartPos").text)
        for phrase in utterance.getiterator("Phrase"): # process all phrases for each utterance
            #print "Phrase:", phrase.find("PhraseText").text
            phraseEl = ET.Element("phrase")
            phraseOffset = [int(phrase.find("PhraseStartPos").text), int(phrase.find("PhraseStartPos").text) + int(phrase.find("PhraseLength").text)]
            phraseOffset = [phraseOffset[0] - uttOffsetBegin, phraseOffset[1] - uttOffsetBegin]
            phraseEl.set("charOffset", Range.tuplesToCharOffset(phraseOffset))
            phraseEl.set("text", phrase.find("PhraseText").text)
            for candidate in phrase.getiterator("Candidate"): # process only the first candidate of each phrase
                phraseEl.set("score", candidate.find("CandidateScore").text)
                phraseEl.set("cui", candidate.find("CandidateCUI").text)
                phraseEl.set("matched", candidate.find("CandidateMatched").text)
                phraseEl.set("preferred", candidate.find("CandidatePreferred").text)
                semTypes = set()
                for semType in candidate.getiterator("SemType"):
                    semTypes.add(semType.text)
                phraseEl.set("semTypes", ",".join(sorted(list(semTypes))))
                sources = set()
                for source in candidate.getiterator("Source"):
                    sources.add(source.text)
                phraseEl.set("sources", ",".join(sorted(list(sources))))
                break
            if phraseEl.get("matched") != None: # include only matched phrases as new elements
                newMetamapEl.append(phraseEl)
                #print ET.tostring(phraseEl, "utf-8")
    if utteranceCount > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", utteranceCount, "utterances"
    return newMetamapEl
Example 9: moveElements
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions always go to a sentence by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
Example 10: extend
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
#.........part of the code omitted here.........
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
            # Attempt to remove trailing periods and commas
            while not sentenceText[tokPos[1] - 1].isalnum():
                tokPos[1] -= 1
                if tokPos[1] < tokPos[0] + 1:
                    tokPos[1] = tokPos[0] + 1
                    break
            while not sentenceText[tokPos[0]].isalnum():
                tokPos[0] += 1
                if tokPos[0] >= tokPos[1]:
                    tokPos[0] = tokPos[1] - 1
                    break
            # Split merged names
            #newPos = [tokPos[0], tokPos[1]]
            #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
            #    newPos[0] += len(split)
            #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
    return corpusTree
Example 11: convertChemProt
# Required imports: from Utils import Range [as alias]
# Or: from Utils.Range import tuplesToCharOffset [as alias]
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(Settings.URL["CP17_" + setName], downloadDir, currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source":corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id":corpusName + ".d" + str(counts["documents"]), "origId":row["id"], "set":dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get("origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get("origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id":document.get("id") + ".e" + str(len([x for x in document.findall("entity")]))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row["docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
#.........part of the code omitted here.........