本文整理汇总了Python中nltk.collocations.BigramCollocationFinder.from_words方法的典型用法代码示例。如果您正苦于以下问题:Python BigramCollocationFinder.from_words方法的具体用法?Python BigramCollocationFinder.from_words怎么用?Python BigramCollocationFinder.from_words使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.collocations.BigramCollocationFinder的用法示例。
在下文中一共展示了BigramCollocationFinder.from_words方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: collocations
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    The ranked bigrams are cached on the instance as ``_collocations`` and
    are rebuilt only when ``num`` or ``window_size`` differs from the values
    stored alongside the cached result.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        from nltk.corpus import stopwords
        # A frozenset keeps the per-word membership test in the filter
        # below constant-time instead of scanning the whole stopword list.
        ignored_words = frozenset(stopwords.words('english'))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        ranked = finder.nbest(bigram_measures.likelihood_ratio, num)
        # Store the cache keys only after the ranking succeeded; writing
        # them first would make a failed rebuild leave the old
        # ``_collocations`` looking valid for the new parameters.
        self._collocations = ranked
        self._num = num
        self._window_size = window_size
    colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
示例2: test_bigram2
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram2(self):
    # Builds a finder with the default window over a sentence in which every
    # word appears twice in a row, then checks bigram counts, word counts and
    # PMI scores against fixed expectations.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent)

    pairs = [
        ('a', 'a'),
        ('a', 'test'),
        ('is', 'a'),
        ('is', 'is'),
        ('test', 'test'),
        ('this', 'is'),
        ('this', 'this'),
    ]
    expected_counts = [(pair, 1) for pair in pairs]
    expected_pmi = [(pair, 1.0) for pair in pairs]
    expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]

    # Sorted lists are compared because python 2.6 lacks
    # assertItemsEqual / assertListEqual.
    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

    token_total = len(sent)
    word_total = sum(finder.word_fd.values())
    ngram_total = sum(finder.ngram_fd.values())
    self.assertTrue(token_total == word_total == ngram_total + 1)

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(expected_pmi)))
示例3: test_bigram3
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram3(self):
    # Same sentence as the default-window test, but with window_size=3:
    # checks bigram counts, word counts and PMI scores.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent, window_size=3)

    mixed_pairs = [('a', 'test'), ('is', 'a'), ('this', 'is')]
    repeated_pairs = [('a', 'a'), ('is', 'is'), ('test', 'test'), ('this', 'this')]

    expected_counts = [(pair, 3) for pair in mixed_pairs]
    expected_counts += [(pair, 1) for pair in repeated_pairs]
    expected_pmi = [(pair, 1.584962500721156) for pair in mixed_pairs]
    expected_pmi += [(pair, 0.0) for pair in repeated_pairs]
    expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

    token_total = len(sent)
    word_total = sum(finder.word_fd.values())
    ngram_total = sum(finder.ngram_fd.values())
    self.assertTrue(token_total == word_total == (ngram_total + 2 + 1) / 2.0)

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(expected_pmi)))
示例4: test_bigram5
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram5(self):
    # Same sentence again with window_size=5: checks bigram counts, word
    # counts and PMI scores.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent, window_size=5)

    # Pairs are grouped by the count/score they are expected to share.
    near_pairs = [('a', 'test'), ('is', 'a'), ('this', 'is')]
    far_pairs = [('is', 'test'), ('this', 'a')]
    repeated_pairs = [('a', 'a'), ('is', 'is'), ('test', 'test'), ('this', 'this')]

    expected_counts = (
        [(pair, 4) for pair in near_pairs]
        + [(pair, 3) for pair in far_pairs]
        + [(pair, 1) for pair in repeated_pairs]
    )
    expected_pmi = (
        [(pair, 1.0) for pair in near_pairs]
        + [(pair, 0.5849625007211562) for pair in far_pairs]
        + [(pair, -1.0) for pair in repeated_pairs]
    )
    expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]

    self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
    self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

    token_total = len(sent)
    word_total = sum(finder.word_fd.values())
    ngram_total = sum(finder.ngram_fd.values())
    self.assertTrue(
        token_total == word_total == (ngram_total + 4 + 3 + 2 + 1) / 4.0
    )

    scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(scored, sorted(expected_pmi)))
示例5: calculate_ngram_diversity
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled

    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score

        Both scores are 0.0 when the corpus is empty.
    """
    # With no tokens the divisions below would raise ZeroDivisionError, so
    # report zero diversity instead of crashing.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    # NOTE(review): the denominator is the finder's ``N`` attribute; confirm
    # that it counts what distinct-2 should be normalised by.
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
示例6: collocations
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    The ranked bigrams are cached on the instance as ``_collocations`` and
    are rebuilt only when ``num`` or ``window_size`` differs from the values
    stored alongside the cached result.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_valid = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        # Single-argument print(...) produces the same output whether it is
        # parsed as the Python 2 statement or the Python 3 function.
        print("Building collocations list")
        from nltk.corpus import stopwords
        # A frozenset keeps the per-word membership test in the filter
        # below constant-time instead of scanning the whole stopword list.
        ignored_words = frozenset(stopwords.words('english'))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        ranked = finder.nbest(bigram_measures.likelihood_ratio, num)
        # Store the cache keys only after the ranking succeeded; writing
        # them first would make a failed rebuild leave the old
        # ``_collocations`` looking valid for the new parameters.
        self._collocations = ranked
        self._num = num
        self._window_size = window_size
    colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
示例7: cal_Distinct
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled

    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score

        Both scores are 0.0 when the corpus is empty.
    """
    # With no tokens the divisions below would raise ZeroDivisionError, so
    # report zero diversity instead of crashing.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    # NOTE(review): the denominator is the finder's ``N`` attribute; confirm
    # that it counts what distinct-2 should be normalised by.
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
示例8: bigrams
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Find the best n bigrams of a text by means of a given measure.

    Returns a dict whose keys are every token of ``text`` followed by the
    selected bigrams, each mapped to ``True``.
    '''
    # NOTE(review): ``words`` is consumed twice (by the finder and by the
    # chain below); this assumes tokenize() returns a re-iterable sequence
    # rather than a one-shot generator -- confirm.
    words = tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(words)
    # Named so the local no longer shadows this function's own name.
    best_bigrams = bigram_finder.nbest(score_fn, n)
    return dict.fromkeys(itertools.chain(words, best_bigrams), True)
示例9: stopword_filtered_bigrams
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Removes the stopwords and computes the best bigrams.

    Returns a dict whose keys are every remaining (non-stopword) token of
    ``text`` followed by the selected bigrams, each mapped to ``True``.
    '''
    stopset = set(stopwords.words('english'))
    words = [word for word in tokenize(text) if word not in stopset]
    bigram_finder = BigramCollocationFinder.from_words(words)
    best_bigrams = bigram_finder.nbest(score_fn, n)
    return dict.fromkeys(itertools.chain(words, best_bigrams), True)
示例10: collocation_list
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

    The ranked bigrams are cached on the instance as ``_collocations`` and
    are rebuilt only when ``num`` or ``window_size`` differs from the values
    stored alongside the cached result.

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    :return: Each cached bigram joined into a single space-separated string.
    :rtype: list(str)
    """
    cache_is_valid = (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_valid:
        from nltk.corpus import stopwords

        # A frozenset keeps the per-word membership test in the filter
        # below constant-time instead of scanning the whole stopword list.
        ignored_words = frozenset(stopwords.words("english"))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        ranked = finder.nbest(bigram_measures.likelihood_ratio, num)
        # Store the cache keys only after the ranking succeeded; writing
        # them first would make a failed rebuild leave the old
        # ``_collocations`` looking valid for the new parameters.
        self._collocations = ranked
        self._num = num
        self._window_size = window_size
    return [w1 + " " + w2 for w1, w2 in self._collocations]
示例11: test_bigram2
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram2(self):
    # Default-window finder over a sentence of doubled words: verifies the
    # bigram counts, the word counts and the PMI score of every bigram.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent)

    expected_counts = {
        ('a', 'a'): 1,
        ('a', 'test'): 1,
        ('is', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'is'): 1,
        ('this', 'this'): 1,
    }
    expected_words = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    expected_pmi = {
        ('a', 'a'): 1.0,
        ('a', 'test'): 1.0,
        ('is', 'a'): 1.0,
        ('is', 'is'): 1.0,
        ('test', 'test'): 1.0,
        ('this', 'is'): 1.0,
        ('this', 'this'): 1.0,
    }

    # Sorted item lists are compared because python 2.6 does not have
    # assertItemsEqual or assertListEqual.
    self.assertEqual(
        sorted(finder.ngram_fd.items()), sorted(expected_counts.items())
    )
    self.assertEqual(
        sorted(finder.word_fd.items()), sorted(expected_words.items())
    )

    n_tokens = len(sent)
    n_words = sum(finder.word_fd.values())
    n_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(n_tokens == n_words == n_bigrams + 1)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi.items())))
示例12: test_bigram3
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram3(self):
    # window_size=3 finder over a sentence of doubled words: verifies the
    # bigram counts, the word counts and the PMI score of every bigram.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent, window_size=3)

    expected_counts = {
        ('a', 'test'): 3,
        ('is', 'a'): 3,
        ('this', 'is'): 3,
        ('a', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'this'): 1,
    }
    expected_words = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    expected_pmi = {
        ('a', 'test'): 1.584962500721156,
        ('is', 'a'): 1.584962500721156,
        ('this', 'is'): 1.584962500721156,
        ('a', 'a'): 0.0,
        ('is', 'is'): 0.0,
        ('test', 'test'): 0.0,
        ('this', 'this'): 0.0,
    }

    self.assertEqual(
        sorted(finder.ngram_fd.items()), sorted(expected_counts.items())
    )
    self.assertEqual(
        sorted(finder.word_fd.items()), sorted(expected_words.items())
    )

    n_tokens = len(sent)
    n_words = sum(finder.word_fd.values())
    n_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(n_tokens == n_words == (n_bigrams + 2 + 1) / 2.0)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi.items())))
示例13: test_bigram5
# 需要导入模块: from nltk.collocations import BigramCollocationFinder [as 别名]
# 或者: from nltk.collocations.BigramCollocationFinder import from_words [as 别名]
def test_bigram5(self):
    # window_size=5 finder over a sentence of doubled words: verifies the
    # bigram counts, the word counts and the PMI score of every bigram.
    sent = 'this this is is a a test test'.split()
    finder = BigramCollocationFinder.from_words(sent, window_size=5)

    expected_counts = {
        ('a', 'test'): 4,
        ('is', 'a'): 4,
        ('this', 'is'): 4,
        ('is', 'test'): 3,
        ('this', 'a'): 3,
        ('a', 'a'): 1,
        ('is', 'is'): 1,
        ('test', 'test'): 1,
        ('this', 'this'): 1,
    }
    expected_words = {'a': 2, 'is': 2, 'test': 2, 'this': 2}
    expected_pmi = {
        ('a', 'test'): 1.0,
        ('is', 'a'): 1.0,
        ('this', 'is'): 1.0,
        ('is', 'test'): 0.5849625007211562,
        ('this', 'a'): 0.5849625007211562,
        ('a', 'a'): -1.0,
        ('is', 'is'): -1.0,
        ('test', 'test'): -1.0,
        ('this', 'this'): -1.0,
    }

    self.assertEqual(
        sorted(finder.ngram_fd.items()), sorted(expected_counts.items())
    )
    self.assertEqual(
        sorted(finder.word_fd.items()), sorted(expected_words.items())
    )

    n_tokens = len(sent)
    n_words = sum(finder.word_fd.values())
    n_bigrams = sum(finder.ngram_fd.values())
    self.assertTrue(n_tokens == n_words == (n_bigrams + 4 + 3 + 2 + 1) / 4.0)

    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi.items())))