本文整理汇总了Python中Document.Document.addRef方法的典型用法代码示例。如果您正苦于以下问题:Python Document.addRef方法的具体用法?Python Document.addRef怎么用?Python Document.addRef使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Document.Document的用法示例。
在下文中一共展示了Document.addRef方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from Document import Document [as 别名]
# 或者: from Document.Document import addRef [as 别名]
def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
#print "* Parsing cluster " + str(clusterNum)
self.clusterNum = clusterNum
self.references = defaultdict(list) # ref_id -> [m_id1, m_id2, ... ] which spans all docs in the cluster
self.docs = defaultdict(Document) # doc_id -> Document
# NOTE: the point of this variable is just print stats and see how many
# Mentions are 'singletons' (not encompassed by any Ref)
self.mentions = defaultdict(list) # m_id -> [ref_id1, ref_id2, ... ] # which spans all docs in the cluster
self.headDoc = str(clusterNum) + "_1ecbplus.xml"
makeGoldTruth = False
# data directories
self.dataDir = dataDir + str(clusterNum) + '/'
self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
self.outputDir = outputDir# + 'clusterMentions/'
self.numPairs = 0
self.numMentions = 0
# makes the gold truth files
self.mentionsList = []
# iterates through each file in the given dir
for f in glob(self.dataDir + '*plus.xml'):
#print "file: " + str(f)
doc_id = f[f.rfind("/") + 1:]
doc = Document(doc_id)
tokenIDs = defaultdict(str)
# gets the contents of the file
with open (f, "r") as myfile:
fileContents=myfile.read().replace('\n', ' ')
# reads <tokens>
it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
for match in it:
t_id = int(match.group(1))
token = match.group(3)
tokenIDs[t_id] = token
# reads <markers>
regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
it = tuple(re.finditer(regex, markables))
for match in it:
isPred = False
if "ACTION" in match.group(1):
isPred = True
m_id = int(match.group(2))
# gets the token IDs
regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
it2 = tuple(re.finditer(regex2, match.group(3)))
curTokenIDs = []
text = ""
for match2 in it2:
tokenID = int(match2.group(1))
curTokenIDs.append(tokenID)
text = text + str(tokenIDs[tokenID]) + " "
text = text.rstrip()
# constructs the Mention
mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)
# adds to the Doc and stores it (we will update the Doc w/ ref info below)
doc.addMention(mention)
# reads <relations>
relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
it = tuple(re.finditer(regex, relations))
for match in it:
ref_id = match.group(1)
regex2 = r"<source m_id=\"(\d+)\".*?/>"
it2 = tuple(re.finditer(regex2, match.group(2)))
for match2 in it2:
m_id = int(match2.group(1))
doc.mentions[m_id].addReference(ref_id)
if doc.mentions[m_id] in self.references[ref_id]:
print "** we already have the mention added to the ref!"
exit(1)
else:
self.references[ref_id].append(doc.mentions[m_id])
# adds the current ref_id to the Doc
doc.addRef(ref_id)
self.docs[doc_id] = doc # stores the Doc object locally
# now let's read the lemmas, provided by Sergio's StanfordNLP-parsed files
f_lemma = open(self.lemmaDir + doc_id, 'r')
fileContents = f_lemma.read().replace('\n', ' ')
lemmaContent = fileContents[fileContents.find("<lemmas>")+8:fileContents.find("</lemmas>")]
regex = r"<span m\_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>" #(.*?)?</.*?>"
it = tuple(re.finditer(regex, lemmaContent))
for match in it:
#.........这里部分代码省略.........