当前位置: 首页>>代码示例>>Python>>正文


Python nltk.trigrams方法代码示例

本文整理汇总了Python中nltk.trigrams方法的典型用法代码示例。如果您正苦于以下问题:Python nltk.trigrams方法的具体用法?Python nltk.trigrams怎么用?Python nltk.trigrams使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk的用法示例。


在下文中一共展示了nltk.trigrams方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def test():
    """Score every tweet in test.txt with a pickled trigram language model.

    For each sentence of each tweet, sums the trigram log-probabilities and
    prints the average log-probability per word (length-normalised score).
    """
    # Use a context manager so the pickle file handle is closed (the
    # original leaked it). NOTE(review): "lm.bin" is assumed to hold an
    # object exposing log_prob(trigram) -- confirm against the trainer.
    with open("lm.bin", 'rb') as f:
        lm1 = pickle.load(f)

    tweets_list = deviger.load_dataset('test.txt')

    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            word_trigrams = nltk.trigrams(words)
            # BUG FIX: this initialisation was over-indented in the
            # original, which made the function an IndentationError.
            sum_log_prob = 0
            for trigram in word_trigrams:
                logprob = lm1.log_prob(trigram)
                sum_log_prob += logprob
            # Normalise by sentence length so long sentences aren't penalised.
            print("(", sum_log_prob / len(words), ")")
开发者ID:iorch,项目名称:jakaton_feminicidios,代码行数:18,代码来源:lang_model_2.py

示例2: create_qb_tokenizer

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    """Build a tokenizer producing unigram/bigram/trigram string features.

    Args:
        unigrams/bigrams/trigrams: which n-gram orders to emit; bigrams and
            trigrams are joined with '++' (e.g. 'w0++w1').
        zero_length_token: placeholder returned when tokenization yields
            nothing, so downstream vectorizers never see an empty document.
        strip_qb_patterns: if True, remove `regex_pattern` matches and
            collapse whitespace before tokenizing.

    Returns:
        A callable tokenizer(text) -> list[str].
    """
    def tokenizer(text):
        if strip_qb_patterns:
            # Raw string r'\s+' fixes the invalid escape sequence '\s'
            # (a DeprecationWarning, and a SyntaxWarning on Python 3.12+).
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if not tokens:
            return [zero_length_token]
        ngrams = []
        if unigrams:
            ngrams.extend(tokens)
        if bigrams:
            ngrams.extend(f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens))
        if trigrams:
            ngrams.extend(f'{w0}++{w1}++{w2}'
                          for w0, w1, w2 in nltk.trigrams(tokens))
        # All n-gram orders disabled (or all filtered out): still emit
        # the placeholder so the document is never empty.
        if not ngrams:
            ngrams.append(zero_length_token)
        return ngrams

    return tokenizer
开发者ID:Pinafore,项目名称:qb,代码行数:29,代码来源:dataset.py

示例3: train_trigram

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def train_trigram(lst):
    """Train a maximum-likelihood trigram model from a list of sentences.

    Args:
        lst: iterable of sentence strings; each is whitespace-split.

    Returns:
        Nested mapping model[(w1, w2)][w3] -> P(w3 | w1, w2), where
        padding introduces None at sentence boundaries.
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        tokens = sent.split()
        # pad_left/pad_right add None boundary markers so sentence-initial
        # and sentence-final trigrams are counted too.
        for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
            # BUG FIX: the original incremented model[(w1, w2)][w2],
            # counting the context word instead of the continuation w3,
            # which made every conditional distribution degenerate.
            model[(w1, w2)][w3] += 1
    # Normalise raw counts into conditional probabilities.
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    # BUG FIX: the original trained the model and then dropped it.
    return model

#Total Sum Of Trigram Probablity Of A Sentence[Returns Float]: 
开发者ID:GauravBh1010tt,项目名称:DeepLearn,代码行数:15,代码来源:lex_sem_ft.py

示例4: trigram_counts

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def trigram_counts(word_list):
    """Return a Counter mapping each trigram of word_list to its frequency."""
    frequency_dist = nltk.FreqDist(nltk.trigrams(word_list))
    counts = Counter()
    for gram, occurrences in frequency_dist.items():
        counts[gram] = occurrences
    return counts
开发者ID:yyht,项目名称:BERT,代码行数:9,代码来源:utils.py

示例5: tokenize

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def tokenize(text):
    """Stem-tokenize text, splitting punctuation but keeping in-word quotes.

    Returns the list of stemmed unigrams. Bigrams/trigrams are computed but
    deliberately unused (see the trailing comment on the return).
    """
    # text = NB.remove_punctuation(text)
    # Python 2 bytes take the decode path; on Python 3 str has no .decode
    # (AttributeError) and falls through. BUG FIX: the bare `except:` also
    # swallowed KeyboardInterrupt/SystemExit -- narrowed to the failures
    # this fallback is actually meant to handle.
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except (AttributeError, UnicodeDecodeError):
        text = text.encode('ascii', 'replace').strip().lower()
    # Split punctuation into separate tokens, but keep apostrophes/hyphens
    # inside words so contractions like don't survive intact.
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
开发者ID:sriniiyer,项目名称:codenn,代码行数:13,代码来源:SVM.py

示例6: get_collocations

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import trigrams [as 别名]
def get_collocations(words):
    """Return collocations (n-grams up to trigrams) sorted by likelihood.

    Keeps n-grams seen more than 3 times; a multi-word gram survives only
    if its likelihood beats half the geometric mean of its parts, in which
    case the redundant component grams are pruned instead.

    Returns:
        list of (ngram, relative_frequency) pairs, highest frequency first;
        multi-word grams are '_'-joined.
    """
    minimum_frequency = 3
    # Relative frequency of every 1-3 gram clearing the threshold.
    ngrams = {
        "_".join(gram): frequency / len(words)
        for gram, frequency in FreqDist(everygrams(words, max_len=3)).items()
        if frequency > minimum_frequency
    }
    collocations = dict(ngrams)
    # BUG FIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; .items() is the portable equivalent.
    for ngram, likelihood in dict(ngrams).items():
        grams = ngram.split("_")
        if len(grams) != 1:
            # NOTE(review): assumes every component gram also cleared the
            # frequency threshold; a rare component would raise KeyError
            # here -- confirm upstream guarantees before relying on this.
            gram_likelihoods = [ngrams[gram] for gram in grams]
            # Prune the multi-gram unless it beats half the geometric mean
            # of its components; otherwise prune the components as redundant.
            if likelihood < 0.5 * np.prod(gram_likelihoods) ** (1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
开发者ID:overlap-ai,项目名称:words2map,代码行数:17,代码来源:words2map.py


注:本文中的nltk.trigrams方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。