

Python util.ngrams Function Code Examples

This article collects typical usage examples of the Python function nltk.util.ngrams. If you are wondering what exactly ngrams does, how to call it, or what real-world uses look like, the curated examples below should help.


The sections below present 15 code examples of the ngrams function, drawn from open-source projects and ordered by popularity.
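
Before the numbered examples, here is a minimal sketch of the function itself; the token list is invented for illustration. nltk.util.ngrams takes any sequence and an order n, and lazily yields the successive n-tuples of a sliding window over that sequence.

from nltk.util import ngrams

tokens = "the quick brown fox".split()

# ngrams() returns a generator of tuples; materialize it with list().
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]

# Any sequence works, including plain strings, giving character n-grams.
print(list(ngrams("fox", 2)))
# [('f', 'o'), ('o', 'x')]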

Example 1: modified_precision

    def modified_precision(candidate, references, n):
        # Collect the candidate's n-grams into a list so they can be counted.
        candidate_ngrams = list(ngrams(candidate, n))
        if len(candidate_ngrams) == 0:
            return 0

        # BLEU-style clipping: each candidate n-gram type is credited at most
        # as many times as it appears in the most favourable reference.
        clipped_count = 0
        for gram in set(candidate_ngrams):
            count_w = candidate_ngrams.count(gram)
            count_max = 0
            for reference in references:
                count = list(ngrams(reference, n)).count(gram)
                if count > count_max:
                    count_max = count
            clipped_count += min(count_w, count_max)

        return clipped_count / float(len(candidate_ngrams))
Developer: ab93, Project: Text-Summarization, Lines: 34, Source: evalSummary.py

Example 2: getTrainData

def getTrainData(corpus, embedsize, ngramsize, m):
	f = open(corpus)
	datap = []
	for line in f:
		data = line.strip().split('\t')
		s1 = data[0]
		s2 = data[1]
		label = data[2]
		s1ng = set(ngrams(s1.split(' '), ngramsize))
		s2ng = set(ngrams(s2.split(' '), ngramsize))
		# Train on every n-gram that appears in either sentence.
		allng = s1ng.union(s2ng)
		for ng in allng:
			datap.append([ng, label])
	f.close()
	X = np.zeros((len(datap), ngramsize, embedsize))
	Y = np.zeros((len(datap), 3))
	wildcard = np.array([0.0]*embedsize)
	for i in range(0, len(datap)):
		item = datap[i]
		ngram = item[0]
		label = item[1]
		vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
		labels = getLabels(label)
		X[i] = vectors
		Y[i] = labels
	return X, Y
Developer: ghpaetzold, Project: phd-backup, Lines: 28, Source: Run_NN_MLP.py

Example 3: str_common_grams

def str_common_grams(str1, str2, length=3):
    '''Return how many times the character n-grams (of the given length)
    of str1 appear in str2.
    '''
    grams1 = list(ngrams(str1, length))
    grams2 = list(ngrams(str2, length))
    return sum(grams2.count(gram) for gram in grams1)
Developer: KhaoticMind, Project: kaggle-homedepot, Lines: 7, Source: homedepot.py
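
As a quick illustration (the product strings below are invented, not taken from the kaggle-homedepot data), the call counts how often each character trigram of the first string occurs in the second:

# Trigrams such as ('d', 'e', 'c') and ('e', 'c', 'k') are counted in str2.
print(str_common_grams('deck screw', 'wood deck screws', length=3))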

Example 4: getTestData

def getTestData(corpus, embedsize, ngramsize, m):
	f = open(corpus)
	datap = []
	for line in f:
		data = line.strip().split('\t')
		s1 = data[0]
		s2 = data[1]
		label = data[2]
		s1ng = set(ngrams(s1.split(' '), ngramsize))
		s2ng = set(ngrams(s2.split(' '), ngramsize))
		# Keep every n-gram that appears in either sentence.
		allng = s1ng.union(s2ng)
		datap.append(list(allng))
	f.close()
	Xs = []
	wildcard = np.array([0.0]*embedsize)
	for ngs in datap:
		X = np.zeros((len(ngs), ngramsize, embedsize))
		for i in range(0, len(ngs)):
			ngram = ngs[i]
			vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
			X[i] = vectors
		Xs.append(X)
	return Xs
Developer: ghpaetzold, Project: phd-backup, Lines: 25, Source: Run_NN_MLP.py

Example 5: extract_terms_features

    def extract_terms_features(terms, separateGrams=False):
        # Drop empty tokens before counting.
        while '' in terms:
            terms.remove('')

        # Join word bigrams and trigrams into space-separated strings.
        g2j = [' '.join(gterms) for gterms in ngrams(terms, 2)]
        g3j = [' '.join(gterms) for gterms in ngrams(terms, 3)]

        vec1 = {}
        vec2 = {}
        vec3 = {}

        # Count unigrams, bigrams and trigrams in separate dictionaries.
        for t in terms:
            vec1[t] = vec1.get(t, 0) + 1
        for t in g2j:
            vec2[t] = vec2.get(t, 0) + 1
        for t in g3j:
            vec3[t] = vec3.get(t, 0) + 1

        # Merge the three count dictionaries into one feature vector.
        vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))
        if separateGrams:
            return (vector, vec1, vec2, vec3)
        else:
            return vector
Developer: klyc0k, Project: EDSFilter, Lines: 60, Source: twitter_methods.py
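
A hypothetical invocation on a small token list (assuming the function is reachable at module level) shows the separate gram counters:

vector, vec1, vec2, vec3 = extract_terms_features(
    ['new', 'york', 'new', 'york', 'city'], separateGrams=True)
# vec1['new'] == 2            (unigram counts)
# vec2['new york'] == 2       (joined bigram counts)
# vec3['york new york'] == 1  (joined trigram counts)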

Example 6: format_text

def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries).decode('utf-8')
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	# .npy files that store dicts need allow_pickle=True on modern NumPy.
	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy', allow_pickle=True).item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy', allow_pickle=True).item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy', allow_pickle=True).item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
Developer: mit-teaching-systems-lab, Project: threeflows, Lines: 60, Source: calculate_emotion.py

Example 7: getNgramProbs

def getNgramProbs(path):
	f = open(path, 'r')
	unigramList = []
	for word in f.read().split():
		unigramList.append(word)
	f.close()

	bigramList = ngrams(unigramList, 2)
	trigramList = ngrams(unigramList, 3)

	#dictionaries of unigrams, bigrams, trigrams
	unigramDict = dict()
	bigramDict = dict()
	trigramDict = dict()

	#counts for unigrams
	countUni = 0
	for item in unigramList:
		countUni += 1
		if item not in unigramDict:
			unigramDict[item] = 1
		else:
			unigramDict[item] += 1

	#counts for bigrams
	for item in bigramList:
		if item not in bigramDict:
			bigramDict[item] = 1
		else:
			bigramDict[item] += 1

	#counts for trigrams
	for item in trigramList:
		if item not in trigramDict:
			trigramDict[item] = 1
		else:
			trigramDict[item] += 1

	#maximum-likelihood probabilities: trigram count over its bigram prefix count
	for key in trigramDict:
		trigramDict[key] /= float(bigramDict[(key[0], key[1])])

	#bigram count over its unigram prefix count
	for key in bigramDict:
		bigramDict[key] /= float(unigramDict[key[0]])

	#unigram count over the total number of tokens
	for key in unigramDict:
		unigramDict[key] /= float(countUni)

	return [unigramDict, bigramDict, trigramDict]
Developer: soumyasanyal, Project: NLPTermProject, Lines: 60, Source: kldiv.py

Example 8: scoreScopeOverlap

	def scoreScopeOverlap(self, scopeHyp, scopeRef):

		totalScore = 0

		for scope_h in scopeHyp:
			bestScore = 0
			for scope_r in scopeRef:

				if scope_r == [] or scope_h == []:
					partialScore = 0
					if partialScore > bestScore: bestScore = partialScore
				else:
					ngram_range = range(1, len(scope_h) + 1)
					logging.info("ngram_range")
					logging.info(ngram_range)
					# Weight each n-gram order i by i / (1 + 2 + ... + N).
					total = sum(ngram_range)
					score_weights = [round(float(i) / total, 4) for i in ngram_range]
					logging.info(score_weights)

					partialScore = float()
					for i in ngram_range:
						hyp = ngrams(scope_h, i)
						ref = ngrams(scope_r, i)
						partialScore += len(set(hyp).intersection(set(ref))) * score_weights[i - 1]
					logging.info("partialScore")
					logging.info(partialScore)
					if partialScore > bestScore: bestScore = partialScore

			totalScore += bestScore
			logging.info("totalScore")
			logging.info(totalScore)

		return totalScore
Developer: wilkeraziz, Project: chisel-features, Lines: 32, Source: main.py

Example 9: create_candidate_list

def create_candidate_list(sentence):
    tokens = nltk.tokenize.word_tokenize(sentence)

    candidates_lists = create_candidates_lists(tokens)

    # Create list of 1-grams.
    candidates = []
    for l in candidates_lists:
        candidates += l

    # Remove irrelevant stop words in 1-grams.
    res = [token for token in candidates
           if token not in ENGLISH_STOPWORDS]

    # Create list of bigrams.
    bigrams = []
    for l in candidates_lists:
        bigrams += ngrams(l, 2)

    # Create list of trigrams.
    trigrams = []
    for l in candidates_lists:
        trigrams += ngrams(l, 3)

    # Create list of 4-grams.
    fourgrams = []
    for l in candidates_lists:
        fourgrams += ngrams(l, 4)

    res += [' '.join(a) for a in bigrams]
    res += [' '.join(a) for a in trigrams]
    res += [' '.join(a) for a in fourgrams]

    return res
Developer: srom, Project: ensu, Lines: 34, Source: select_aliases.py

Example 10: calc_ngram

def calc_ngram(htokens, etokens):
    # n-gram precision, recall and F1 features for n = 1..4.
    features = []
    for n in range(1,5):
        hgrams = nltk.FreqDist(ngrams(htokens,n))
        egrams = nltk.FreqDist(ngrams(etokens,n))
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        features.append(calc_f1(prec,recall))
    return features
Developer: da03, Project: sp2016.11-731, Lines: 25, Source: generate_feature.py

Example 11: rouge_s

    def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):

        rouge_s_list = []
        k_c = len(candidate) if d_skip is None else d_skip
        cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate),
                              n=2, k=k_c))
        for ref in references:
            k_ref = len(ref) if d_skip is None else d_skip
            ref_skip_list = list(skipgrams(tokenizer.tokenize(ref),
                                 n=2, k=k_ref))
            count = 0
            for bigram in cand_skip_list:
                if bigram in ref_skip_list:
                    count = count+1
            if not smoothing:
                r_skip = count/len(ref_skip_list)
                p_skip = count/len(cand_skip_list)
            else:
                cand_ungm = list(ngrams(tokenizer.tokenize(candidate),
                                      n=1))
                ref_ungm = list(ngrams(tokenizer.tokenize(ref),
                                     n=1))
                for ungm in cand_ungm:
                    if ungm in ref_ungm:
                        count += 1
                r_skip = count/(len(ref_skip_list)+len(ref_ungm))
                p_skip = count/(len(cand_skip_list)+len(cand_ungm))
            score = Rouge.get_score(r_skip, p_skip, beta)           
            rouge_s_list.append(score)
        return Rouge.jacknifing(rouge_s_list, averaging=averaging)
Developer: 53X, Project: NLP-Metrics, Lines: 30, Source: rouge.py

Example 12: char_ngram_similarity

def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score of two documents with respect to their
    top character n-gram distributions. If the value is 0 the documents are
    identical (or at least share an identical top character n-gram
    distribution).
    :param doc1: first document (a string).
    :param doc2: second document (a string).
    :param n: the n-gram length.
    :param top: only use the `top` most frequent n-grams from each document.
    :return: a positive dissimilarity score; 0 means identical profiles.
    """

    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))

    profile1 = [gram for gram, _ in ngrams1.most_common(top)]
    profile2 = [gram for gram, _ in ngrams2.most_common(top)]

    # normalise the two n-gram distributions
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1

    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2

    # calculate global dissimilarity score
    score = 0
    for gram in set(profile1 + profile2):
        f1 = ngrams1[gram]
        f2 = ngrams2[gram]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score
Developer: rug-compling, Project: glad, Lines: 34, Source: glad-main.py
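
An illustrative comparison with invented strings (not from the glad project): identical documents score 0, and the score grows as the character n-gram profiles diverge.

a = "the cat sat on the mat"
b = "the cat sat on the hat"
c = "completely different text"
print(char_ngram_similarity(a, a, 3))                                   # 0.0
print(char_ngram_similarity(a, b, 3) < char_ngram_similarity(a, c, 3))  # True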

Example 13: jaccardIdx

def jaccardIdx(w1, w2):
    # Jaccard distance over character bigrams: 0.0 for identical
    # bigram sets, 1.0 for completely disjoint ones.
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))

    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)

    return 1.0 - float(len(intersect)) / float(len(union))
Developer: weezel, Project: ITIS13, Lines: 8, Source: russiannames.py
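
A quick hypothetical call on two surnames (any pair of strings works):

print(jaccardIdx("smith", "smithe"))  # ~0.2, most bigrams shared
print(jaccardIdx("smith", "jones"))   # 1.0, no bigrams shared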

Example 14: count_word

    def count_word(self, doc, unigram=True, bigram=False, binary=False):
        tokens = word_tokenize(self.remove_non_ascii(doc))
        doc_voc = {}
        if unigram:
            uni = ngrams(tokens, 1)
            self.count_word_sub(doc_voc, uni, binary)

        if bigram:
            bi = ngrams(tokens, 2)
            self.count_word_sub(doc_voc, bi, binary)

        # Return the per-document counts accumulated by count_word_sub.
        return doc_voc
Developer: akshaynavada, Project: NLP, Lines: 10, Source: NB.py

Example 15: trainModel

	def trainModel(self, listOfFilenames):
		#dictionary of unigrams, bigrams, trigrams
		unigramDict = dict()
		bigramDict = dict()
		trigramDict = dict()

		#total count of unigrams, bigrams, trigrams
		countUni = 0
		countBi = 0
		countTri = 0

		i = 1
		#iterate over list of files
		for fileName in listOfFilenames:
			print("Reading", i)
			i += 1
			stag = STagger(fileName)
			stag.find_unigrams(True, False)
			for item in stag.unigrams:
				countUni += 1
				if item not in unigramDict:
					unigramDict[item] = 1
				else:
					unigramDict[item] += 1
			codeBigrams = ngrams(stag.unigrams, 2)
			codeTrigrams = ngrams(stag.unigrams, 3)
			for item in codeBigrams:
				countBi += 1
				if item not in bigramDict:
					bigramDict[item] = 1
				else:
					bigramDict[item] += 1
			for item in codeTrigrams:
				countTri += 1
				if item not in trigramDict:
					trigramDict[item] = 1
				else:
					trigramDict[item] += 1

		
		#write the ngrams to the file
		outputFile = open('corpus.txt', 'w')
		outputFile.write(str(countUni) + "\n")
		for key, x in unigramDict.items():
			outputFile.write(str(key) + " " + str(x) + "\n")

		outputFile.write(str(countBi) + "\n")
		for key, x in bigramDict.items():
			outputFile.write(str(key[0]) + " "  + str(key[1]) + " " + str(x) + "\n")

		outputFile.write(str(countTri) + "\n")
		for key, x in trigramDict.items():
			outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")

		outputFile.close()
Developer: soumyasanyal, Project: NLPTermProject, Lines: 55, Source: extract2.py


Note: the nltk.util.ngrams examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects and remain the copyright of their original authors; consult each project's license before using or distributing the code, and do not republish without permission.