本文整理汇总了Python中nltk.collocations.TrigramCollocationFinder类的典型用法代码示例。如果您正苦于以下问题:Python TrigramCollocationFinder类的具体用法?Python TrigramCollocationFinder怎么用?Python TrigramCollocationFinder使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了TrigramCollocationFinder类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: ngram_collocation
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find the ``topK`` best n-gram collocations in ``words``, scored by PMI.

    For ``n == 2`` and ``n == 3`` the matching NLTK collocation finder is
    used directly.  For ``n >= 4`` trigram collocations are computed first
    and then handed, together with ``sents``, to ``NgramCollocationExtender``.

    Args:
        words: Token sequence passed to the finder's ``from_words``.
        sents: Sentences passed to ``NgramCollocationExtender``; only used
            when ``n >= 4``.
        n: Requested n-gram order; must be 2, 3, or at least 4.
        support: Minimum frequency passed to ``apply_freq_filter``.
        topK: Number of n-grams requested from ``nbest``.

    Returns:
        The selected n-grams (the same object that is passed to
        ``print_ngrams``).

    Raises:
        ValueError: If ``n`` is not 2, 3, or >= 4.  The original code fell
            through with ``finder`` unbound and died with UnboundLocalError.
    """
    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        # The current collocation measure is PMI.
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        # NOTE(review): `support/3` is integer division on Python 2 and true
        # division on Python 3 -- confirm which one the extender expects.
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support/3, 0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    elif n == 3:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
    else:
        raise ValueError(
            "unsupported n-gram order %r: expected 2, 3, or >= 4" % (n,))

    finder.apply_freq_filter(support)
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
    print_ngrams(pmi_ngrams)
    return pmi_ngrams
示例2: collocations
def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams) from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words
    top_n: int
        Maximum number of bigrams and of trigrams to retrieve.  Trigrams are
        ranked by chi-squared score and bigrams by PMI.  Default is 10000
    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.
    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each an alternation of the space-joined n-grams.
    """
    tcf = TrigramCollocationFinder.from_words(stream)
    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    def compile_alternation(phrases):
        # With no phrases the naive pattern would be '()', which matches the
        # empty string at every position; use a pattern that never matches.
        if not phrases:
            return re.compile(r'(?!)', re.UNICODE)
        # Escape each phrase so tokens such as "c++" or "(" are matched
        # literally instead of being interpreted as regex syntax.
        escaped = '|'.join(re.escape(phrase) for phrase in phrases)
        return re.compile('(%s)' % escaped, re.UNICODE)

    bigrams_patterns = compile_alternation(bigrams)
    trigrams_patterns = compile_alternation(trigrams)
    return bigrams_patterns, trigrams_patterns
示例3: trigramFeats
def trigramFeats(thesewords, n=100):
    """Build a ``{feature: True}`` dict from word triples and their collocations.

    Items of ``thesewords`` are glued together three at a time into
    space-separated strings (a short final group is padded with empty
    strings).  Those strings are fed to ``TrigramCollocationFinder`` and the
    best collocations by likelihood ratio are added as extra features.

    Note that ``n`` is used both as the minimum frequency for
    ``apply_freq_filter`` and as the number of collocations requested.

    Returns:
        dict mapping every glued string and every selected collocation tuple
        to ``True``.
    """
    word_iter = iter(thesewords)
    triples = []
    for head in word_iter:
        # Pull the next two items from the same iterator; '' pads the tail.
        second = next(word_iter, '')
        third = next(word_iter, '')
        triples.append(head + " " + second + " " + third)

    finder = TrigramCollocationFinder.from_words(triples)
    finder.apply_freq_filter(n)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, n)

    features = {}
    for feature in itertools.chain(triples, best):
        features[feature] = True
    return features
示例4: get_frequencies
def get_frequencies(self, desc):
    """Print and return the most salient words, bigrams and trigrams of ``desc``.

    ``desc`` is tokenized with ``word_tokenize``.  Tokens shorter than three
    characters or contained in the English stop-word list are ignored for the
    single-word counts and rejected by the word filter of both collocation
    finders.

    Args:
        desc: Text to analyse.

    Returns:
        list: ``[single, bigrm, trigrm]`` where ``single`` holds the 20 most
        common ``(word, count)`` pairs, ``bigrm`` the 15 best bigrams and
        ``trigrm`` the 10 best trigrams (both ranked by likelihood ratio).
    """
    stopset = set(stopwords.words('english'))

    def filter_stops(w):
        return len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('------gram--------')
    words_to_count = [word for word in words if not filter_stops(word)]
    single = Counter(words_to_count).most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    return [single, bigrm, trigrm]
示例5: trigram
def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500, freq=1):
    """Return the best trigram collocations of ``words`` as a dict.

    Args:
        words: Sized collection of hashable string tokens.
        score_fn: Association measure passed to ``nbest``; defaults to the
            likelihood ratio.
        n: Maximum number of trigrams requested from ``nbest``.
        freq: Minimum frequency passed to ``apply_freq_filter``.

    Returns:
        dict: Maps the three words of each selected trigram, concatenated
        without a separator, to the number of selected trigrams that produce
        that key.  Empty when ``words`` is empty or contains fewer than three
        distinct tokens.
    """
    if len(words) == 0:
        return {}
    # At least three distinct tokens are required before scoring trigrams.
    if len(set(words)) < 3:
        return {}

    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_freq_filter(freq)
    # Keep the n highest-scoring trigrams according to score_fn.
    trigrams = trigram_finder.nbest(score_fn, n)

    res = {}
    for first, second, third in trigrams:
        # Different trigrams can concatenate to the same key, hence the count.
        # dict.get replaces dict.has_key, which no longer exists on Python 3.
        key = first + second + third
        res[key] = res.get(key, 0) + 1
    return res
示例6: best_ngrams
def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very scalable!

    Trigrams are ranked by chi-squared score and bigrams by PMI.

    Return the detected ngrams as a pair of compiled regular expressions
    `(pat_gram2, pat_gram3)`, for their faster detection later on. Each
    n-gram is regex-escaped, and an empty n-gram list yields a pattern that
    never matches.
    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    def compile_alternation(phrases):
        # '()' (no alternatives) would match the empty string everywhere.
        if not phrases:
            return re.compile(r'(?!)', re.UNICODE)
        # Escape so that punctuation tokens are matched literally.
        escaped = '|'.join(re.escape(phrase) for phrase in phrases)
        return re.compile('(%s)' % escaped, re.UNICODE)

    pat_gram2 = compile_alternation(bigrams)
    pat_gram3 = compile_alternation(trigrams)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(pat_gram2)
    return pat_gram2, pat_gram3
示例7: create_tri_collocations
def create_tri_collocations(features_words, document_preprocess):
    """Return preprocessed trigram collocations of the movie-reviews corpus.

    The 1000 best trigrams by PMI (minimum frequency 3) are taken from
    ``movie_reviews.words()``.  Each callable in ``document_preprocess`` is
    then applied in turn to every word of every trigram; a trigram is dropped
    as soon as a callable returns a falsy value for any of its words.

    ``features_words`` is accepted but not used by this function.
    """
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    tricoll = finder.nbest(trigram_measures.pmi, 1000)

    for transform in document_preprocess:
        kept = []
        for first, second, third in tricoll:
            if not (transform(first) and transform(second) and transform(third)):
                continue
            kept.append((transform(first), transform(second), transform(third)))
        tricoll = kept
    return tricoll
示例8: __init__
def __init__(self, words, sentences, language):
    """Compute basic statistics, frequent words and collocations of a corpus.

    Args:
        words: Tokens of the corpus.
        sentences: Sentences of the corpus; each one is joined with spaces
            when stored in ``stcs_width_words``.
        language: Language name passed to ``stopwords.words``.

    Raises:
        ZeroDivisionError: If ``words`` or ``sentences`` is empty.
    """
    self.num_words = len(words)
    self.unique_words = len(set(words))
    self.num_sentences = len(sentences)
    self.average_sentence_length = round(self.num_words / self.num_sentences)
    self.lexical_diversity = round(self.num_words / self.unique_words)

    fdist = FreqDist(words)
    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(stopwords.words(language))
    not_stopwords = [w for w in words if w not in stop_words]
    fdist2 = FreqDist(not_stopwords)
    self.fifty_first_words = fdist.most_common(50)
    self.hundreds_nsw = fdist2.most_common(300)

    # Bigram and trigram collocations seen at least 10 times, ranked by PMI.
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(10)
    self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(10)
    self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

    # Sentences mentioning "malheureusement" (case-insensitive).
    matching = []
    for sent in sentences:
        joined = ' '.join(sent)
        # A token list has no .lower(); search its joined text instead.
        # Sentences that already are strings are searched as before.
        text = sent if hasattr(sent, 'lower') else joined
        if "malheureusement" in text.lower():
            matching.append(joined)
    self.stcs_width_words = matching
示例9: extract_trigrams
def extract_trigrams(self, sent):
    """Return the known trigrams found in ``sent`` as a ``{trigram: True}`` dict.

    ``sent`` is first run through ``self._preprocess_sent``.  Up to 10000
    trigrams ranked by PMI are joined with spaces, and only those also present
    in ``self._trigrams_set`` are kept.
    """
    tokens = self._preprocess_sent(sent)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    candidates = {' '.join(gram) for gram in finder.nbest(measures.pmi, 10000)}
    known = candidates & self._trigrams_set
    return dict.fromkeys(known, True)
示例10: set_trigramas
def set_trigramas(self, freq=2, best=20):
    """Store the best trigram collocations of ``self.palavras`` in ``self.trigramas``.

    The word filter rejects tokens shorter than three characters or contained
    in the Portuguese stop-word list.  ``freq`` is the minimum frequency
    passed to ``apply_freq_filter`` and ``best`` the number of trigrams
    requested, ranked by PMI.
    """
    finder = TrigramCollocationFinder.from_words(self.palavras)
    stopset = set(stopwords.words('portuguese'))

    def is_noise(token):
        if len(token) < 3:
            return True
        return token in stopset

    finder.apply_word_filter(is_noise)
    finder.apply_freq_filter(freq)
    self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
示例11: calc_trigrams
def calc_trigrams(text, min_freq=50):
    """Append the trigram frequencies of ``text`` to ``trigram_list`` and return it.

    Every item of ``text`` is lower-cased before the trigrams are counted,
    and ``min_freq`` is passed to ``apply_freq_filter``.

    NOTE(review): ``trigram_list`` is not defined inside this function;
    presumably it is a module-level list -- confirm against the rest of the
    file.
    """
    lowered = []
    for token in text:
        lowered.append(token.lower())

    finder = TrigramCollocationFinder.from_words(lowered)
    finder.apply_freq_filter(min_freq)
    trigram_list.append(finder.ngram_fd.items())
    return trigram_list
示例12: trigram_word_feats
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Build a ``{feature: True}`` dict from ``words`` and their best trigrams.

    Args:
        words: Tokens; each becomes a feature.
        score_fn: Association measure passed to ``nbest``; defaults to
            chi-squared.
        n: Number of trigrams requested from ``nbest``.

    Returns:
        dict mapping every word and every selected trigram tuple to ``True``.
        If scoring the trigrams fails, a message is printed and only the
        words are used as features.
    """
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        # Best effort: fall back to plain word features.  Catching Exception
        # rather than using a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate.
        # %-formatting keeps the output identical on Python 2 and Python 3.
        print("lost trigrams %s" % (words,))
        return {word: True for word in words}
    return {ngram: True for ngram in itertools.chain(words, trigrams)}
示例13: getTrigrams
def getTrigrams(self):
    """Return the six best trigram collocations of ``self.text``.

    The text is tokenized with ``nltk.word_tokenize`` and lower-cased.  The
    word filter rejects tokens shorter than three characters or contained in
    the English stop-word list; trigrams are ranked by likelihood ratio.
    """
    tokens = []
    for token in nltk.word_tokenize(self.text):
        tokens.append(token.lower())

    finder = TrigramCollocationFinder.from_words(tokens)
    stopset = set(stopwords.words('english'))

    def is_noise(token):
        if len(token) < 3:
            return True
        return token in stopset

    finder.apply_word_filter(is_noise)
    finder.apply_freq_filter(1)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
    return best
示例14: getTrigram
def getTrigram(haystack):
    """Return the four best trigram collocations of the text ``haystack``.

    The text is split with ``WordPunctTokenizer``.  The word filter rejects
    tokens shorter than three characters or contained in the English
    stop-word list; trigrams are ranked by likelihood ratio.
    """
    tokens = WordPunctTokenizer().tokenize(haystack)
    finder = TrigramCollocationFinder.from_words(tokens)
    stopset = set(stopwords.words('english'))

    def is_noise(token):
        if len(token) < 3:
            return True
        return token in stopset

    finder.apply_word_filter(is_noise)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
    return best
示例15: best_n_trigrams
def best_n_trigrams(self, n, method="pmi"):
    """Return the ``n`` best trigrams of ``self.get_word_lst()``.

    ``method`` selects the association measure: ``"pmi"`` or ``"raw_freq"``.
    Any other value falls through both checks and nothing is returned by the
    code shown here.
    """
    measures = TrigramAssocMeasures()
    word_list = self.get_word_lst()
    finder = TrigramCollocationFinder.from_words(word_list)
    for measure_name in ("pmi", "raw_freq"):
        if method == measure_name:
            return finder.nbest(getattr(measures, measure_name), n)