当前位置: 首页>>代码示例>>Python>>正文


Python Parser.tokenise方法代码示例

本文整理汇总了Python中Parser.Parser.tokenise方法的典型用法代码示例。如果您正苦于以下问题:Python Parser.tokenise方法的具体用法?Python Parser.tokenise怎么用?Python Parser.tokenise使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Parser.Parser的用法示例。


在下文中一共展示了Parser.tokenise方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from Parser import Parser [as 别名]
# 或者: from Parser.Parser import tokenise [as 别名]
class VectorSpaces:
	"""An algebraic model for representing text documents as vectors of identifiers.

	A document is represented as a vector. Each dimension of the vector
	corresponds to a separate term. If a term occurs in the document, the
	value in that dimension is non-zero (simple term-count / bag-of-words).
	"""

	def __init__(self, documents=None):
		"""Build the vector space from an optional list of document strings.

		Uses None instead of a mutable default list so the default is not
		shared between instances.
		"""
		# Collection of document term vectors, one per input document.
		self.documentVectors = []
		# Mapping of keyword -> vector dimension index.
		self.vectorKeywordIndex = {}
		self.parser = Parser()
		if documents:
			self.build(documents)

	def removeDuplicates(self, items):
		"""Return the unique items as a set (order is not preserved).

		Uses the builtin set; the original `Set` came from the removed
		Python 2 `sets` module.
		"""
		return set(items)

	def cosineSimilarity(self, vector1, vector2):
		"""Calculate cosine similarity between two document vectors:
		cosine = (V1 . V2) / (||V1|| * ||V2||)

		Returns 0.0 when either vector has zero magnitude (e.g. a query
		sharing no terms with the vocabulary) instead of dividing by zero.
		"""
		denominator = norm(vector1) * norm(vector2)
		if denominator == 0:
			return 0.0
		return float(dot(vector1, vector2) / denominator)

	def build(self, documents):
		"""Create the vector space for the input documents."""
		self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
		self.documentVectors = [self.createVector(document) for document in documents]

	def getVectorKeywordIndex(self, documentList):
		"""Map each vocabulary keyword to its dimension in the document vectors."""
		# Collapse all documents into a single word string.
		vocabularyString = " ".join(documentList)

		vocabularyList = self.parser.tokenise(vocabularyString)
		# Remove common stop words which have no search value.
		vocabularyList = self.parser.removeStopWords(vocabularyList)
		uniqueVocabularyList = self.removeDuplicates(vocabularyList)

		# Associate each keyword with a position (its vector dimension).
		return {word: offset for offset, word in enumerate(uniqueVocabularyList)}

	def createVector(self, wordString):
		"""Turn a word string into a term-count vector.

		@pre: unique(vectorIndex)
		"""
		# Initialize vector with 0's.
		vector = [0.0] * len(self.vectorKeywordIndex)
		wordList = self.parser.tokenise(wordString)
		wordList = self.parser.removeStopWords(wordList)
		for word in wordList:
			# Words outside the vocabulary are silently ignored.
			if word in self.vectorKeywordIndex:
				vector[self.vectorKeywordIndex[word]] += 1.0  # simple Term Count Model (bag of words)
		return vector

	def buildQueryVector(self, termList):
		"""Convert a list of query terms into a term vector."""
		return self.createVector(" ".join(termList))

	def search(self, searchList):
		"""Rank documents against the query terms.

		Returns document indices sorted by descending cosine similarity.
		"""
		queryVector = self.buildQueryVector(searchList)

		rankings = [self.cosineSimilarity(queryVector, documentVector) for documentVector in self.documentVectors]
		rankIndices = [i for i, _ in sorted(enumerate(rankings), key=lambda x: x[1], reverse=True)]

		return rankIndices
开发者ID:navink,项目名称:Kaggle-Best_Buy_Hackathon,代码行数:85,代码来源:VectorSpaces.py

示例2: __init__

# 需要导入模块: from Parser import Parser [as 别名]
# 或者: from Parser.Parser import tokenise [as 别名]
class VectorSpace:
	"""An algebraic model for representing text documents as vectors of identifiers.

	A document is represented as a vector. Each dimension of the vector
	corresponds to a separate term. If a term occurs in the document, the
	value in that dimension is non-zero (simple term-count model).
	"""

	def __init__(self, documents=None):
		"""Build the vector space from an optional list of document strings.

		Uses None instead of a mutable default list so the default is not
		shared between instances.
		"""
		# Collection of document term vectors, one per input document.
		self.documentVectors = []
		# Mapping of keyword -> vector dimension index.
		self.vectorKeywordIndex = {}
		# Tidies terms (tokenising / stop-word removal).
		self.parser = Parser()
		if documents:
			self.build(documents)

	def build(self, documents):
		"""Create the vector space for the passed document strings."""
		self.vectorKeywordIndex = self.getVectorKeywordIndex(documents)
		self.documentVectors = [self.makeVector(document) for document in documents]

	def getVectorKeywordIndex(self, documentList):
		"""Map each vocabulary keyword to its dimension in the document vectors."""
		# Collapse all documents into a single word string.
		vocabularyString = " ".join(documentList)

		vocabularyList = self.parser.tokenise(vocabularyString)
		# Remove common words which have no search value.
		vocabularyList = self.parser.removeStopWords(vocabularyList)
		uniqueVocabularyList = util.removeDuplicates(vocabularyList)

		# Associate each keyword with a position (its vector dimension).
		vectorIndex = {}
		for offset, word in enumerate(uniqueVocabularyList):
			vectorIndex[word] = offset
		return vectorIndex  # (keyword: position)

	def makeVector(self, wordString):
		"""Turn a word string into a term-count vector.

		Words not in the vocabulary are ignored instead of raising KeyError
		(queries may contain terms never seen in the corpus).

		@pre: unique(vectorIndex)
		"""
		# Initialise vector with 0's.
		vector = [0] * len(self.vectorKeywordIndex)
		wordList = self.parser.tokenise(wordString)
		wordList = self.parser.removeStopWords(wordList)
		for word in wordList:
			if word in self.vectorKeywordIndex:
				vector[self.vectorKeywordIndex[word]] += 1  # simple Term Count Model
		return vector

	def buildQueryVector(self, termList):
		"""Convert a list of query terms into a term vector."""
		return self.makeVector(" ".join(termList))

	def related(self, documentId):
		"""Rate all documents against the document at the given index.

		Returns cosine similarities sorted in descending order.
		"""
		ratings = [util.cosine(self.documentVectors[documentId], documentVector) for documentVector in self.documentVectors]
		ratings.sort(reverse=True)
		return ratings

	def search(self, searchList):
		"""Rate all documents against a list of query terms.

		Returns one cosine similarity per document, in document order.
		"""
		queryVector = self.buildQueryVector(searchList)

		ratings = [util.cosine(queryVector, documentVector) for documentVector in self.documentVectors]
		return ratings
开发者ID:apassant,项目名称:motools,代码行数:84,代码来源:VectorSpace.py

示例3: open

# 需要导入模块: from Parser import Parser [as 别名]
# 或者: from Parser.Parser import tokenise [as 别名]
from Parser import Parser
import util

# Tokenise the source corpus and write a whitespace-separated token stream,
# suitable as input for LDA (Gibbs sampling).
parser = Parser()

# Context managers guarantee both files are closed even if an error occurs;
# iterating the file object reads line by line without a manual sentinel loop.
with open('data/gq.txt', 'r') as source, open('data-lda-gibbs.txt', 'w') as otp:
	for raw_line in source:
		tokens = parser.removeStopWords(parser.tokenise(raw_line))
		for word in tokens:
			otp.write(word + ' ')
开发者ID:apassant,项目名称:motools,代码行数:23,代码来源:createDataLDA-gibbs.py


注:本文中的Parser.Parser.tokenise方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。