

Python Document.addMention Method Code Examples

This article collects typical usage examples of the Python method Document.Document.addMention. If you are wondering what Document.addMention does, how to call it, or what it looks like in practice, the curated examples here may help. You can also explore further usage examples of the containing class, Document.Document.


Two code examples of the Document.addMention method are shown below, sorted by popularity by default. You can upvote examples you find useful; your ratings help the system recommend better Python examples.
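
The examples rely on a small API on the Document and Mention classes that this page never shows: Document(doc_id), doc.addMention(mention), doc.addRef(ref_id), doc.mentions[m_id], and Mention(m_id, text, tokenIDs, isPred, doc_id) with mention.addReference(ref_id). Below is a minimal sketch of what those classes plausibly look like, inferred only from these calls; the attribute names and method bodies are assumptions, not the project's actual source.

# Minimal sketch of the assumed interface; inferred from the calls in the
# examples below, NOT taken from the PredArgAlignment repository.
class Mention:
	def __init__(self, m_id, text, tokenIDs, isPred, doc_id):
		self.m_id = m_id
		self.text = text            # surface text of the mention
		self.tokenIDs = tokenIDs    # t_ids of the tokens the mention spans
		self.isPred = isPred        # True if the markable is an ACTION (predicate)
		self.doc_id = doc_id
		self.refs = []              # coreference chain ids this mention belongs to

	def addReference(self, ref_id):
		self.refs.append(ref_id)

class Document:
	def __init__(self, doc_id):
		self.doc_id = doc_id
		self.mentions = {}          # m_id -> Mention, so <relations> parsing can look mentions up
		self.refs = []              # cross-doc coreference chain ids seen in this doc

	def addMention(self, mention):
		self.mentions[mention.m_id] = mention

	def addRef(self, ref_id):
		self.refs.append(ref_id)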

Example 1: __init__

# Required import: from Document import Document [as alias]
# Or: from Document.Document import addMention [as alias]
	def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
		#print "* Parsing cluster " + str(clusterNum)
		self.clusterNum = clusterNum
		self.references = defaultdict(list) # ref_id -> [m_id1, m_id2, ... ] which spans all docs in the cluster
		self.docs = defaultdict(Document) # doc_id -> Document

		# NOTE: the point of this variable is just to print stats and see how many
		#       Mentions are 'singletons' (not encompassed by any Ref)
		self.mentions = defaultdict(list) # m_id -> [ref_id1, ref_id2, ... ] which spans all docs in the cluster
		self.headDoc = str(clusterNum) + "_1ecbplus.xml"
		makeGoldTruth = False
		# data directories
		self.dataDir = dataDir + str(clusterNum) + '/'
		self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
		self.outputDir = outputDir# + 'clusterMentions/'
		self.numPairs = 0
		self.numMentions = 0
		# makes the gold truth files
		self.mentionsList = []

		# iterates through each file in the given dir
		for f in glob(self.dataDir + '*plus.xml'):

			#print "file: " + str(f)
			doc_id = f[f.rfind("/") + 1:]
			doc = Document(doc_id)

			tokenIDs = defaultdict(str)

			# gets the contents of the file
			with open(f, "r") as myfile:
				fileContents = myfile.read().replace('\n', ' ')

			# reads <tokens>
			it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
			for match in it:
				t_id = int(match.group(1))
				token = match.group(3)
				tokenIDs[t_id] = token

			# reads <markers>
			regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
			markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
			it = tuple(re.finditer(regex, markables))
			for match in it:
				isPred = False
				if "ACTION" in match.group(1):
					isPred = True
				m_id = int(match.group(2))

				# gets the token IDs
				regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(3)))
				curTokenIDs = []
				text = ""
				for match2 in it2:
					tokenID = int(match2.group(1))
					curTokenIDs.append(tokenID)
					text = text + str(tokenIDs[tokenID]) + " "
				text = text.rstrip()

				# constructs the Mention
				mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

				# adds to the Doc and stores it (we will update the Doc w/ ref info below)
				doc.addMention(mention)

			# reads <relations>
			relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
			regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
			it = tuple(re.finditer(regex, relations))
			for match in it:
				ref_id = match.group(1)
				regex2 = r"<source m_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(2)))

				for match2 in it2:
					m_id = int(match2.group(1))

					doc.mentions[m_id].addReference(ref_id)
					if doc.mentions[m_id] in self.references[ref_id]:
						print "** we already have the mention added to the ref!"
						exit(1)
					else:
						self.references[ref_id].append(doc.mentions[m_id])

				# adds the current ref_id to the Doc
				doc.addRef(ref_id)

			self.docs[doc_id] = doc # stores the Doc object locally

			# now let's read the lemmas, provided by Sergio's StanfordNLP-parsed files
			f_lemma = open(self.lemmaDir + doc_id, 'r')
			fileContents = f_lemma.read().replace('\n', ' ')
			lemmaContent = fileContents[fileContents.find("<lemmas>")+8:fileContents.find("</lemmas>")]
			regex = r"<span m\_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>" #(.*?)?</.*?>"
			it = tuple(re.finditer(regex, lemmaContent))
			for match in it:
#......... the rest of the code is omitted .........
Developer: chriswtanner, Project: PredArgAlignment, Lines: 103, Source file: ParseCluster.py
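
For orientation, a hypothetical call to the constructor above might look like this; the class name ParseCluster is assumed from the source file name, and all paths are placeholders rather than values from the project.

# Hypothetical usage; the class name and all paths are assumptions, not project values.
parser = ParseCluster(1, 'data/ECB+/', 'data/lemmas/', 'output/')
print("cluster %d: %d docs, %d refs" % (parser.clusterNum, len(parser.docs), len(parser.references)))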

Example 2: createSemanticSpaceSimVectors

# Required import: from Document import Document [as alias]
# Or: from Document.Document import addMention [as alias]
	def createSemanticSpaceSimVectors(self, outPickle, outFile, N, W, sliceNum, totalSlices):
		print "* creating semantic space vectors"
		fullWindowSize = W*2 + 1

		outPickleFile = self.outputDir + outPickle
		outputFile = self.outputDir + outFile
		mentionTypes = [] # stores the tokens found within Mentions (non-stopwords and > 1 in length)

		# gets the N most popular words (non-stopwords and > 1 in length)
		print "* gathering most popular " + str(N) + " words"
		sys.stdout.flush()

		wordCounts = defaultdict(int)
		docs = []
		for clusterNum in self.validClusters:

			# iterates through each file in the given dir/cluster
			for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):

				doc_id = f[f.rfind("/") + 1:]
				doc = Document(doc_id)

				tokenIDs = defaultdict(str)
				# gets the contents of the file
				with open(f, "r") as myfile:
					fileContents = myfile.read().replace('\n', ' ')

				# reads <tokens>
				it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
				for match in it:
					t_id = int(match.group(1))
					sent_num = int(match.group(2))
					token = match.group(3).lower()
					tokenIDs[t_id] = token
					if sent_num > 0 and token not in self.stopwords and len(token) > 1:
						wordCounts[token] = wordCounts[token] + 1
						if token not in mentionTypes:
							mentionTypes.append(token)


				# reads <markers>
				regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
				markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
				it = tuple(re.finditer(regex, markables))
				for match in it:
					isPred = False
					if "ACTION" in match.group(1):
						isPred = True
					m_id = int(match.group(2))

					# gets the token IDs
					regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
					it2 = tuple(re.finditer(regex2, match.group(3)))
					curTokenIDs = []
					text = ""
					for match2 in it2:
						tokenID = int(match2.group(1))
						curTokenIDs.append(tokenID)
						text = text + str(tokenIDs[tokenID]) + " "
					text = text.rstrip()

					# constructs the Mention
					mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

					# adds to the Doc and stores it (we will update the Doc w/ ref info below)
					doc.addMention(mention)

				docs.append(doc)
		
		print "* there were " + str(len(docs)) + " unique docs"
		sys.stdout.flush()

		# puts the top N word types into 'commonTypes'
		sorted_wordCounts = sorted(wordCounts.items(), key=operator.itemgetter(1), reverse=True)
		commonTypes = [x[0] for x in sorted_wordCounts][0:N]

		print "# unique mention tokens: " + str(len(mentionTypes))

		# goes through all docs again, this time doing the sliding window
		# in order to calculate PMI1 and PMI2, where
		# PMI1 = freq(p,c) / (freq(p) * freq(c))
		# PMI2 = log(prob(p,c) / (prob(p) * prob(c)))
		# (a worked sketch of these two formulas follows after this example)
		mentionCounts = defaultdict(int)
		commonWordsCounts = defaultdict(int)
		mentionAndCommonCounts = defaultdict(int)
		print "* calculating PMI counts for all Mentions across all clusters of docs"
		for clusterNum in self.validClusters:
			# iterates through each file in the given dir/cluster
			for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):

				#docTokens = []
				mentionLocations = defaultdict(list)
				commonWordsLocations = defaultdict(list)

				# gets the contents of the file
				with open(f, "r") as myfile:
					fileContents = myfile.read().replace('\n', ' ')

				# reads <tokens>
				it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
#......... the rest of the code is omitted .........
Developer: chriswtanner, Project: PredArgAlignment, Lines: 103, Source file: FeatureGeneratorSS.py
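
The PMI counting that Example 2 sets up (and the truncation cuts off) can be illustrated with a small self-contained sketch. This is an assumed reconstruction of the scoring step, not the project's code: given co-occurrence counts gathered from the W-sized sliding windows, it computes the two PMI variants named in the comments above.

import math
from collections import defaultdict

# Toy counts, as if gathered from W-sized sliding windows over the corpus;
# all numbers here are made up for illustration.
mentionCounts = defaultdict(int, {"crash": 8})          # freq(p): mention-token occurrences
commonWordsCounts = defaultdict(int, {"car": 20})       # freq(c): common-word occurrences
mentionAndCommonCounts = defaultdict(int, {("crash", "car"): 5})  # freq(p,c): co-occurrences
totalWindows = 1000.0

def pmi_scores(p, c):
	# PMI1 = freq(p,c) / (freq(p) * freq(c)) -- the raw count ratio from the comment above
	pmi1 = mentionAndCommonCounts[(p, c)] / float(mentionCounts[p] * commonWordsCounts[c])
	# PMI2 = log(prob(p,c) / (prob(p) * prob(c))) -- standard pointwise mutual information
	probJoint = mentionAndCommonCounts[(p, c)] / totalWindows
	pmi2 = math.log(probJoint / ((mentionCounts[p] / totalWindows) * (commonWordsCounts[c] / totalWindows)))
	return pmi1, pmi2

print("PMI1=%.5f PMI2=%.3f" % pmi_scores("crash", "car"))  # PMI1=0.03125 PMI2=3.442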


Note: The Document.Document.addMention examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.