This article collects typical usage examples of the Python method nltk.corpus.brown.words. If you have been wondering how exactly to use brown.words, or what real calls to it look like, the curated code examples below should help. You can also explore further usage of its home module, nltk.corpus.brown.
The following presents 15 code examples of the brown.words method, sorted by popularity by default. You can upvote any example you like or find useful; your ratings help the system recommend better Python code examples.
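All of the examples below assume NLTK is installed and the corpora they touch have been downloaded. A minimal setup sketch (the download identifiers are standard NLTK resource names; grab only what a given example actually uses):

import nltk

# One-time downloads; each call is a no-op if the data is already present.
nltk.download('brown')                        # Brown corpus, used throughout
nltk.download('stopwords')                    # stopword list for the collocations examples
nltk.download('gutenberg')                    # Gutenberg corpus for Example 14
nltk.download('averaged_perceptron_tagger')   # POS tagger model for nltk.pos_tag in Example 5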
Example 1: collocations
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.Text; also needs:
#   from nltk.collocations import BigramCollocationFinder
#   from nltk.metrics import BigramAssocMeasures
#   from nltk.util import tokenwrap
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
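Since this collocations is a method of nltk.text.Text, a minimal usage sketch might look like the following (the 'news' category is just an illustrative choice; output depends on your NLTK version):

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.collocations(num=10, window_size=2)  # prints the 10 highest-scoring collocations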
Example 2: splitter
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def splitter(data, num_words):
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    if cur_words:  # keep the final partial chunk, but skip an empty one
        output.append(' '.join(cur_words))
    return output
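A quick usage sketch for splitter (the input sentence is arbitrary; note that a trailing chunk shorter than num_words is kept):

chunks = splitter('The brown fox jumped over the lazy dog', 3)
print(chunks)  # ['The brown fox', 'jumped over the', 'lazy dog']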
Example 3: collocations
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.Text; also needs:
#   from nltk.collocations import BigramCollocationFinder
#   from nltk.metrics import BigramAssocMeasures
#   from nltk.util import tokenwrap
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
Example 4: splitter
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def splitter(content, num_of_words):
    words = content.split(' ')
    result = []
    current_count = 0
    current_words = []
    for word in words:
        current_words.append(word)
        current_count += 1
        if current_count == num_of_words:
            result.append(' '.join(current_words))
            current_words = []
            current_count = 0
    if current_words:  # keep the final partial chunk, but skip an empty one
        result.append(' '.join(current_words))
    return result
Developer: PacktPublishing, Project: Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition, Lines: 20, Source file: chunking.py
Example 5: brown_data
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Assumes module-level: import nltk; from nltk.corpus import brown;
# and a text_length setting defined elsewhere in the module.
def brown_data():
    """Return the first `text_length` tokens of the Brown corpus, tagged in pyrata format."""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]
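A minimal sketch of calling brown_data; text_length is a module-level setting in the original project, so the value below is purely illustrative:

text_length = 10  # assumption: the original module defines this elsewhere

data = brown_data()
print(data[0])  # a dict along the lines of {'raw': 'The', 'pos': 'DT'}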
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Example 6: common_contexts
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.ContextIndex; also needs:
#   from functools import reduce
#   from nltk.probability import FreqDist
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: list(str)
    :param fail_on_unknown: If true, then raise a ValueError if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]
    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)
    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(
            c for w in words for c in self._word_to_contexts[w] if c in common
        )
        return fd
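This common_contexts is a method of nltk.text.ContextIndex, so a usage sketch could look like this (the word pair is arbitrary; an empty FreqDist simply means no shared contexts):

from nltk.corpus import brown
from nltk.text import ContextIndex

idx = ContextIndex(brown.words(categories='news'))
fd = idx.common_contexts(['man', 'woman'])
print(fd.most_common(5))  # the five most frequent shared contexts, if any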
Example 7: __init__
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Constructor of nltk.text.ConcordanceIndex; also needs:
#   from collections import defaultdict
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s: s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
    was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
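This is the constructor of nltk.text.ConcordanceIndex; a brief usage sketch (the lowercasing key makes lookups case-insensitive):

from nltk.corpus import brown
from nltk.text import ConcordanceIndex

ci = ConcordanceIndex(brown.words(categories='news'), key=lambda s: s.lower())
print(ci.offsets('news')[:5])  # first few token offsets where 'news' occurs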
Example 8: word_similarity_dict
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.ContextIndex; also needs:
#   from nltk.metrics import f_measure
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores',
    indicating how often these two words occur in the same context.
    """
    word = self._key(word)
    word_contexts = set(self._word_to_contexts[word])

    scores = {}
    for w, w_contexts in self._word_to_contexts.items():
        scores[w] = f_measure(word_contexts, set(w_contexts))

    return scores
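A sketch of ranking words by similarity with ContextIndex (scores are F-measures over shared context sets; words with no overlap score None or 0 and are filtered out here):

from nltk.corpus import brown
from nltk.text import ContextIndex

idx = ContextIndex(brown.words(categories='news'))
scores = idx.word_similarity_dict('money')
best = sorted((w for w, s in scores.items() if s), key=scores.get, reverse=True)
print(best[:10])  # the ten words sharing the most contexts with 'money'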
Example 9: common_contexts
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.ContextIndex; also needs:
#   from functools import reduce
#   from nltk.probability import FreqDist
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: list(str)
    :param fail_on_unknown: If true, then raise a ValueError if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]
    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)
    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:",
                         " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(c for w in words
                      for c in self._word_to_contexts[w]
                      if c in common)
        return fd
Example 10: __init__
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Constructor of nltk.text.ConcordanceIndex; also needs:
#   from collections import defaultdict
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s: s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
    was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset
    indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
Example 11: print_concordance
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.ConcordanceIndex
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=75)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print("Displaying %s of %s matches:" % (lines, len(offsets)))
        for i in offsets:
            if lines <= 0:
                break
            left = (' ' * half_width +
                    ' '.join(self._tokens[i - context:i]))
            right = ' '.join(self._tokens[i + 1:i + context])
            left = left[-half_width:]
            right = right[:half_width]
            print(left, self._tokens[i], right)
            lines -= 1
    else:
        print("No matches")
Example 12: dispersion_plot
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    from nltk.draw import dispersion_plot

    dispersion_plot(self, words)
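A usage sketch for the dispersion plot (a matplotlib-capable environment is assumed; the plot window opens interactively):

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.dispersion_plot(['news', 'report', 'said', 'announced'])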
Example 13: demo
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# demo() lives in nltk/text.py, where Text is defined.
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
Example 14: get_nltk_freq_words
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Needs: import nltk; from nltk.corpus import brown, gutenberg
def get_nltk_freq_words():
    """Use Brown corpus frequent words.

    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())
    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))
    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict
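A sketch of consuming the returned frequency list and distribution (building the Gutenberg counts takes a moment on first run):

freq_words, freq_dict = get_nltk_freq_words()
print(len(freq_words))      # how many words occur more than 10 times
print(freq_dict['the'])     # raw count of 'the' across both corpora
print('the' in freq_words)  # True -- 'the' is certainly frequent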
Example 15: print_concordance
# Required import: from nltk.corpus import brown [as alias]
# Or: from nltk.corpus.brown import words [as alias]
# Method of nltk.text.ConcordanceIndex
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=75)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print("Displaying %s of %s matches:" % (lines, len(offsets)))
        for i in offsets:
            if lines <= 0:
                break
            left = (' ' * half_width +
                    ' '.join(self._tokens[i - context:i]))
            right = ' '.join(self._tokens[i + 1:i + context])
            left = left[-half_width:]
            right = right[:half_width]
            print(left, self._tokens[i], right)
            lines -= 1
    else:
        print("No matches")