当前位置: 首页>>代码示例>>Python>>正文


Python Document.addRef方法代码示例

本文整理汇总了 Python 中 Document.Document.addRef 方法的典型用法代码示例。如果您正苦于以下问题：Python Document.addRef 方法的具体用法？Python Document.addRef 怎么用？Python Document.addRef 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 Document.Document 的用法示例。


下文共展示了 Document.addRef 方法的 1 个代码示例，示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: __init__

# 需要导入模块: from Document import Document [as 别名]
# 或者: from Document.Document import addRef [as 别名]
	def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
		#print "* Parsing cluster " + str(clusterNum)
		self.clusterNum = clusterNum
		self.references = defaultdict(list) # ref_id -> [m_id1, m_id2, ... ] which spans all docs in the cluster
		self.docs = defaultdict(Document) # doc_id -> Document

		# NOTE: the point of this variable is just print stats and see how many
		#       Mentions are 'singletons' (not encompassed by any Ref)
		self.mentions = defaultdict(list) # m_id -> [ref_id1, ref_id2, ... ] # which spans all docs in the cluster
		self.headDoc = str(clusterNum) + "_1ecbplus.xml"
		makeGoldTruth = False
		# data directories
		self.dataDir = dataDir + str(clusterNum) + '/'
		self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
		self.outputDir = outputDir# + 'clusterMentions/'
		self.numPairs = 0
		self.numMentions = 0
		# makes the gold truth files
		self.mentionsList = []

		# iterates through each file in the given dir
		for f in glob(self.dataDir + '*plus.xml'):

			#print "file: " + str(f)
			doc_id = f[f.rfind("/") + 1:]
			doc = Document(doc_id)

			tokenIDs = defaultdict(str)

			# gets the contents of the file
			with open (f, "r") as myfile:
				fileContents=myfile.read().replace('\n', ' ')

			# reads <tokens>
			it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
			for match in it:
				t_id = int(match.group(1))
				token = match.group(3)
				tokenIDs[t_id] = token

			# reads <markers>
			regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
			markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
			it = tuple(re.finditer(regex, markables))
			for match in it:
				isPred = False
				if "ACTION" in match.group(1):
					isPred = True
				m_id = int(match.group(2))

				# gets the token IDs
				regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(3)))
				curTokenIDs = []
				text = ""
				for match2 in it2:
					tokenID = int(match2.group(1))
					curTokenIDs.append(tokenID)
					text = text + str(tokenIDs[tokenID]) + " "
				text = text.rstrip()

				# constructs the Mention
				mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

				# adds to the Doc and stores it (we will update the Doc w/ ref info below)
				doc.addMention(mention)

			# reads <relations>
			relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
			regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
			it = tuple(re.finditer(regex, relations))
			for match in it:
				ref_id = match.group(1)
				regex2 = r"<source m_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(2)))

				for match2 in it2:
					m_id = int(match2.group(1))

					doc.mentions[m_id].addReference(ref_id)
					if doc.mentions[m_id] in self.references[ref_id]:
						print "** we already have the mention added to the ref!"
						exit(1)
					else:
						self.references[ref_id].append(doc.mentions[m_id])

					

				# adds the current ref_id to the Doc
				doc.addRef(ref_id)

			self.docs[doc_id] = doc # stores the Doc object locally

			# now let's read the lemmas, provided by Sergio's StanfordNLP-parsed files
			f_lemma = open(self.lemmaDir + doc_id, 'r')
			fileContents = f_lemma.read().replace('\n', ' ')
			lemmaContent = fileContents[fileContents.find("<lemmas>")+8:fileContents.find("</lemmas>")]
			regex = r"<span m\_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>" #(.*?)?</.*?>"
			it = tuple(re.finditer(regex, lemmaContent))
			for match in it:
#.........这里部分代码省略.........
开发者ID:chriswtanner,项目名称:PredArgAlignment,代码行数:103,代码来源:ParseCluster.py


注：本文中的 Document.Document.addRef 方法示例由纯净天空整理自 GitHub/MSDocs 等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的 License；未经允许，请勿转载。