本文整理汇总了Python中nltk.trigrams方法的典型用法代码示例。如果您正苦于以下问题:Python nltk.trigrams方法的具体用法?Python nltk.trigrams怎么用?Python nltk.trigrams使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk
的用法示例。
在下文中一共展示了nltk.trigrams方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def test():
    """Score test tweets with a pickled trigram language model.

    Loads a language model from ``lm.bin``, reads tweets from
    ``test.txt`` via ``deviger.load_dataset``, and prints each
    sentence's length-normalized log-probability under the model.

    Fixes vs. original: the pickle file handle is now closed via a
    ``with`` block, and empty sentences no longer raise
    ZeroDivisionError.
    """
    # Original used pickle.load(open(...)) which never closed the file.
    with open("lm.bin", "rb") as model_file:
        lm1 = pickle.load(model_file)
    tweets_list = deviger.load_dataset('test.txt')
    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            if not words:
                # Guard: an empty token list would divide by zero below.
                continue
            sum_log_prob = 0
            for trigram in nltk.trigrams(words):
                # NOTE(review): assumes the pickled model exposes
                # log_prob(trigram) -> float; confirm against the trainer.
                sum_log_prob += lm1.log_prob(trigram)
            print("(", sum_log_prob / len(words), ")")
示例2: create_qb_tokenizer
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    """Build a tokenizer closure that emits word n-grams.

    Args:
        unigrams: include single tokens in the output.
        bigrams: include 2-grams joined with ``++``.
        trigrams: include 3-grams joined with ``++``.
        zero_length_token: placeholder returned when tokenization
            yields nothing, so downstream code never sees an empty list.
        strip_qb_patterns: if True, strip quiz-bowl boilerplate matched
            by the module-level ``regex_pattern`` before tokenizing.

    Returns:
        A function ``tokenizer(text) -> list[str]``.
    """
    def tokenizer(text):
        if strip_qb_patterns:
            # Remove boilerplate, then collapse runs of whitespace.
            # BUG FIX: '\s+' must be a raw string — as a plain string it
            # is an invalid escape sequence (warning, future error).
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])
            # All n-gram kinds disabled: still return a non-empty list.
            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams
    return tokenizer
示例3: train_trigram
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def train_trigram(lst):
    """Train a maximum-likelihood trigram model from raw sentences.

    Args:
        lst: iterable of sentence strings; each is whitespace-split.

    Returns:
        dict-like mapping ``(w1, w2) -> {w3: P(w3 | w1, w2)}`` where the
        inner values are normalized conditional probabilities.  Sentence
        boundaries are represented by ``None`` padding tokens.
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            # BUG FIX: count the *third* word of the trigram; the
            # original incremented model[(w1, w2)][w2], which records
            # the context word instead of the continuation.
            model[(w1, w2)][w3] += 1
    # Normalize raw counts into conditional probabilities.
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    # BUG FIX: the trained model was never returned by the original.
    return model
# Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
示例4: trigram_counts
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def trigram_counts(word_list):
    """Return a Counter mapping each word trigram to its frequency.

    Args:
        word_list: sequence of tokens.

    Returns:
        collections.Counter keyed by ``(w1, w2, w3)`` tuples.
    """
    # Counter counts the trigram tuples directly — the original built an
    # nltk.FreqDist and then hand-copied every item into a Counter, which
    # is the same result with an extra O(n) pass.
    return Counter(nltk.trigrams(word_list))
示例5: tokenize
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def tokenize(text):
    """Lowercase, ASCII-fold, and Porter-stem *text* into word tokens.

    Args:
        text: input text (``str`` on py3, ``str``/``unicode`` on py2).

    Returns:
        list of stemmed tokens; punctuation marks are kept as separate
        tokens, but in-word apostrophes/hyphens stay attached (don't).
    """
    # text = NB.remove_punctuation(text)
    # Normalize to lowercase ASCII.  BUG FIX: the bare ``except:`` hid
    # every possible error; only the py2/py3 decode differences are
    # expected here — ``str.decode`` is missing on py3 (AttributeError)
    # and non-UTF-8 bytes raise UnicodeDecodeError on py2.
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except (AttributeError, UnicodeDecodeError):
        text = text.encode('ascii', 'replace').strip().lower()
    # Split punctuation but don't split single quotes for words like don't.
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]    # NOTE(review): unused
    triword = [t for t in nltk.trigrams(word)]  # NOTE(review): unused
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
示例6: get_collocations
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def get_collocations(words):
    """Return n-grams (up to trigrams) that behave like collocations.

    Keeps grams appearing more than ``minimum_frequency`` times, then
    prunes: a multi-word gram is dropped when its likelihood is below
    half the geometric mean of its components' likelihoods; otherwise
    the redundant component unigrams are dropped instead.

    Args:
        words: sequence of tokens.

    Returns:
        list of ``(gram, relative_frequency)`` pairs, most frequent first;
        multi-word grams are joined with ``_``.
    """
    # returns n-grams up to trigrams that appear at least 3 times, with
    # pruning of grams that are redundant
    minimum_frequency = 3
    ngrams = {"_".join(ngram): frequency / len(words)
              for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items()
              if frequency > minimum_frequency}
    collocations = dict(ngrams)
    # BUG FIX: dict.iteritems() is Python 2 only — this file already uses
    # f-strings (Python 3), so iterate .items() on a snapshot copy while
    # mutating ``collocations``.
    for ngram, likelihood in dict(ngrams).items():
        grams = ngram.split("_")
        if len(grams) != 1:
            # NOTE(review): a component unigram below the frequency cut
            # would KeyError here — pre-existing behavior, left as-is.
            gram_likelihoods = [ngrams[gram] for gram in grams]
            # Compare against half the geometric mean of the parts.
            if likelihood < 0.5 * np.prod(gram_likelihoods) ** (1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)