

Python nltk.trigrams Method Code Examples

This article collects and summarizes typical usage examples of the nltk.trigrams method in Python. If you are wondering what exactly nltk.trigrams does, how to call it, or what real-world examples look like, the curated code samples below should help. You can also explore further usage examples from the nltk package that provides this method.


Six code examples of the nltk.trigrams method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
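
Before the examples, here is a minimal sketch of what nltk.trigrams does by itself: given any sequence of tokens, it yields consecutive 3-token tuples (the sample sentence is purely illustrative).

import nltk

# Split a short sentence and iterate over its trigrams.
tokens = "the quick brown fox jumps over the lazy dog".split()
for w1, w2, w3 in nltk.trigrams(tokens):
    print(w1, w2, w3)            # e.g. "the quick brown", then "quick brown fox", ...

# With padding enabled, boundary trigrams such as (None, None, 'the') are also
# produced, which several of the language-model examples below rely on.
padded = list(nltk.trigrams(tokens, pad_left=True, pad_right=True))
print(len(padded))               # len(tokens) + 2 trigrams when padded on both sides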

Example 1: test

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def test():
    lm1 = pickle.load(open("lm.bin", 'rb'))

    tweets_list = deviger.load_dataset('test.txt')

    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            word_trigrams = nltk.trigrams(words)
            sum_log_prob = 0
            for trigram in word_trigrams:
                logprob = lm1.log_prob(trigram)
                sum_log_prob += logprob
            print("(", sum_log_prob / len(words), ")") 
Developer ID: iorch, Project: jakaton_feminicidios, Lines of code: 18, Source file: lang_model_2.py

Example 2: create_qb_tokenizer

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            # regex_pattern is assumed to be defined at module level in the source project
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
Developer ID: Pinafore, Project: qb, Lines of code: 29, Source file: dataset.py
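
A brief, hypothetical usage sketch of the factory above (strip_qb_patterns is disabled here because regex_pattern is defined elsewhere in the qb project; the question text is just an illustrative input):

# Build a tokenizer that emits unigrams, bigrams ('w0++w1'), and trigrams ('w0++w1++w2').
tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, trigrams=True,
                               strip_qb_patterns=False)
print(tokenize("name this author of pride and prejudice"))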

Example 3: train_trigram

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def train_trigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model

# Total Sum Of Trigram Probability Of A Sentence [Returns Float] (header for a follow-up function not included in this snippet):
Developer ID: GauravBh1010tt, Project: DeepLearn, Lines of code: 15, Source file: lex_sem_ft.py
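
The trailing comment in Example 3 refers to a sentence-scoring function that is not shown in the snippet. A hypothetical sketch of such a function, assuming the nested-defaultdict model returned by train_trigram above, might look like this:

from nltk import trigrams

def sentence_trigram_score(model, sentence):
    # Sum the trigram probabilities of a sentence under the trained model.
    score = 0.0
    for w1, w2, w3 in trigrams(sentence.split(), pad_left=True, pad_right=True):
        score += model[(w1, w2)][w3]   # the defaultdict yields 0 for unseen trigrams
    return score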

Example 4: trigram_counts

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def trigram_counts(word_list):
	tgs = nltk.trigrams(word_list)
	fdist = nltk.FreqDist(tgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
Developer ID: yyht, Project: BERT, Lines of code: 9, Source file: utils.py

Example 5: tokenize

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        text = text.decode('utf-8', 'replace')     # bytes input
    except AttributeError:
        pass                                       # already a str
    text = text.encode('ascii', 'replace').decode('ascii').strip().lower()
    # porter is assumed to be an nltk PorterStemmer created at module level in the source project
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]   # split punctuation but keep single quotes inside words like don't
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
Developer ID: sriniiyer, Project: codenn, Lines of code: 13, Source file: SVM.py

Example 6: get_collocations

# Required imports: import nltk [as alias]
# Or: from nltk import trigrams [as alias]
def get_collocations(words):
	# returns n-grams up to trigrams that appear at least 3 times, with pruning of grams that are redundant
	minimum_frequency = 3
	ngrams = {"_".join(ngram): frequency/len(words) for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items() if frequency > minimum_frequency}
	collocations = dict(ngrams)
	for ngram, likelihood in dict(ngrams).items():
		grams = ngram.split("_")
		if len(grams) != 1:
			gram_likelihoods = [ngrams[gram] for gram in grams]
			if likelihood < 0.5 * np.prod(gram_likelihoods)**(1 / len(grams)):
				collocations.pop(ngram, None)
			else:
				for gram in grams:
					collocations.pop(gram, None)
	return sorted(collocations.items(), key=itemgetter(1), reverse=True) 
Developer ID: overlap-ai, Project: words2map, Lines of code: 17, Source file: words2map.py
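
A hypothetical usage sketch; the imports below are the module-level dependencies the snippet above assumes, and the word list is repeated only so that some n-grams clear the minimum frequency of 3:

import numpy as np
from operator import itemgetter
from nltk import FreqDist
from nltk.util import everygrams

words = "new york city is in new york state".split() * 4
print(get_collocations(words))
# e.g. [('new_york', 0.25), ...] -- collocations sorted by relative frequency, highest first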


Note: The nltk.trigrams method examples in this article were compiled by 純淨天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by many developers, and the source code copyright remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish without permission.