本文整理汇总了Python中nltk.corpus.gutenberg.words函数的典型用法代码示例。如果您正苦于以下问题:Python words函数的具体用法?Python words怎么用?Python words使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了words函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: exercise_gutenberg
def exercise_gutenberg():
    """Explore the Gutenberg corpus.

    Lists the corpus file ids, inspects Jane Austen's *Emma*, then prints,
    for every file: average word length, average sentence length, and the
    average number of occurrences per vocabulary item, followed by the
    file id.
    """
    # Print the Gutenberg project's file list.
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma".
    emma = gutenberg.words("austen-emma.txt")
    # Length of the book in word tokens.
    print(len(emma))
    # Wrap the tokens in an nltk.Text to get concordance search.
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        num_chars = len(chars_list)   # total characters in the file
        num_words = len(words_list)   # total word tokens
        num_sents = len(sents_list)   # total sentences
        # Distinct vocabulary size, case-folded.
        num_vocab = len({w.lower() for w in words_list})
        # `//` keeps the Python 2 integer-division behaviour of the
        # original `/` on ints.
        print(num_chars // num_words, num_words // num_sents,
              num_words // num_vocab, file_id)
示例2: gutenberg
def gutenberg():
    """Print per-file Gutenberg statistics.

    For each corpus file, prints: average word length, average sentence
    length, average occurrences per vocabulary item, and the file id.
    """
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        # Read the word list once and reuse it; the original re-read the
        # corpus file a second time just to build the vocabulary.
        words = gutenberg.words(t)
        num_words = len(words)
        num_sents = len(gutenberg.sents(t))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)
示例3: page57
def page57():
    """Statistics from the Gutenberg corpora (NLTK book, page 57).

    Prints average word length, average sentence length, and average
    occurrences per vocabulary item for every corpus file.
    """
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        # Reuse one word list instead of reading the file twice.
        words = gutenberg.words(fileid)
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        # One print() call replaces the Python 2 trailing-comma
        # print-statement continuation of the original.
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例4: fun02
def fun02():
    """Print Gutenberg corpus statistics per file.

    Columns: average word length, average sentence length, number of
    times each vocabulary item appears in the text, file id.
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        # Single corpus read; the original called gutenberg.words() twice.
        words = gutenberg.words(fileid)
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        # average word length, average sentence length,
        # occurrences per vocabulary item, file id
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例5: for_print
def for_print():
    """Display three statistics for each Gutenberg text.

    Prints average word length, average sentence length, and the average
    number of occurrences per vocabulary item, followed by the file id.

    :return: None
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        words = gutenberg.words(fileid)   # read once, reuse below
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例6: ex2
def ex2():
    """Count word tokens and case-folded word types in Austen's Persuasion."""
    from nltk.corpus import gutenberg
    ap = gutenberg.words("austen-persuasion.txt")
    word_tokens = len(ap)
    # Case-fold before counting so "The" and "the" are one type.
    word_types = len({w.lower() for w in ap})
    print("#-word tokens=", word_tokens)
    print("#-word types=", word_types)
示例7: ex17
def ex17():
    """Print the 50 most frequent content words of Macbeth.

    Filters out stopwords, words of length <= 3, and non-alphabetic
    tokens before counting.
    """
    from nltk.corpus import gutenberg
    macbeth = gutenberg.words("shakespeare-macbeth.txt")
    stopwords = set(nltk.corpus.stopwords.words())
    fd = nltk.FreqDist(w for w in macbeth
                       if w.lower() not in stopwords
                       and len(w) > 3 and w.isalpha())
    # The original `fd.keys()[0:50]` breaks in Python 3 (dict views are
    # not sliceable) and modern NLTK no longer sorts keys() by frequency;
    # most_common() restores the intended by-frequency top-50.
    print([w for w, _ in fd.most_common(50)])
示例8: main
def main():
    """Plot word-frequency distributions for three corpora on log-log axes.

    For each corpus, plots the distribution over all words and the
    distribution with stopwords removed, in a distinct colour pair.
    """
    corpora = (
        (gutenberg.words(), 'red', 'orange'),
        (inaugural.words(), 'black', 'gray'),
        (reuters.words(categories='yen'), 'blue', 'green'),
    )
    for words, raw_color, filtered_color in corpora:
        without_stops = exclude_stopwords(words)
        pylab.plot(get_frequency_distribution(words), color=raw_color)
        pylab.plot(get_frequency_distribution(without_stops),
                   color=filtered_color)
    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
示例9: generateSentence
def generateSentence():
    """Build a short pseudo-random "tweet" from a randomly chosen corpus.

    Picks one of four corpora, jumps to a random position, scans forward
    to the first full stop (so the tweet starts at a sentence boundary),
    then joins up to 20 following tokens, gluing punctuation to the
    preceding word.

    :return: the assembled tweet string.
    """
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    else:
        text = movie_reviews.words()

    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*",
                   "[", "]", "‘", "“", "#"]
    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    startingWord = random.randint(0, (len(text) - 40))

    # Find the first "." at or after the random start.  range() replaces
    # the Python-2-only xrange(), and the scan is clamped to len(text):
    # the original ran to startingWord + len(text), which indexes past
    # the end of the corpus and raises IndexError.
    startOfWordIndex = 0
    for x in range(startingWord, len(text)):
        if text[x] == ".":
            startOfWordIndex = x
            break

    blank = ' '
    # Clamp the upper bound too; the original could overrun the corpus
    # when the sentence boundary sat near the end of the text.
    stop = min(startOfWordIndex + lengthOfTweet, len(text))
    for x in range(startOfWordIndex + 1, stop):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]
        else:
            tweetString = tweetString + blank + text[x]
    return tweetString
示例10: exercise2
def exercise2():
    """Report token and type counts for austen-persuasion.txt."""
    print()
    print("Exercise 2")
    words = gutenberg.words('austen-persuasion.txt')
    print("Number of word tokens in the text austen-persuasion.txt: %d" % len(words))
    print("Number of word-types in the text austen-persuasion.txt: %d" % len(set(words)))
    # Dumps the whole vocabulary set; large but intentional in the exercise.
    print(set(words))
    print()
示例11: fun01
def fun01():
    """List Gutenberg file ids and show a concordance for "surprize" in Emma."""
    print(gutenberg.fileids())
    # Emma by Jane Austen.
    emma = gutenberg.words('austen-emma.txt')
    # How many word tokens it contains.
    print(len(emma))
    # concordance() prints its matches itself and returns None; the
    # original wrapped it in `print`, which emitted a spurious "None" line.
    Text(emma).concordance("surprize")
示例12: find_word_probability
def find_word_probability(CORPUS):
    """Build a bigram conditional frequency distribution for a corpus file.

    Each word is counted under the condition of the word that precedes it;
    the very first token is conditioned on ``None``.

    :param CORPUS: Gutenberg corpus file id to read words from.
    :return: the populated ``ConditionalFreqDist``.
    """
    cfd = ConditionalFreqDist()
    previous = None
    for current in gutenberg.words(CORPUS):
        # Tally `current` as a successor of the word before it.
        cfd[previous][current] += 1
        previous = current
    return cfd
示例13: main
def main():
    """Load valid-word lists, write a sorted word file, and print anagrams."""
    loader = WordLoader()
    # Accept spellings from both British and American aspell dictionaries.
    for dialect in ("en_GB", "en_US"):
        loader.load_valid_words_from_aspell(dialect)
    all_words = brown.words() + gutenberg.words()
    loader.write_sorted_words(all_words, 'sorted_words.txt')
    print_anagrams(loader.sorted_words, all_words)
示例14: gutenberg
def gutenberg():
    """Tour the Gutenberg corpus.

    Shows Emma's word count, finds Macbeth's longest sentence, and prints
    per-file statistics (average word length, average sentence length,
    occurrences per vocabulary item).
    """
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print(len(emma))
    # NOTE(review): this function shadows a module-level `gutenberg` name;
    # the bare references below assume the corpus reader is still bound to
    # that name in the enclosing scope — verify against the file's imports.
    print(gutenberg.fileids())
    emma = gutenberg.words('austen-emma.txt')
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    # Generator expression avoids materialising a throwaway list.
    longest_len = max(len(s) for s in macbeth_sentences)
    [s for s in macbeth_sentences if len(s) == longest_len]
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        words = gutenberg.words(fileid)   # read once, reuse for vocab
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例15: structure
def structure():
    """Access one Gutenberg text at three granularities.

    Demonstrates the raw-character, word-token, and sentence views of the
    same file; the slices are evaluated and discarded, as in the tutorial.
    """
    book = "burgess-busterbrown.txt"
    raw = gutenberg.raw(book)
    raw[1:20]       # a slice of raw characters
    words = gutenberg.words(book)
    words[1:20]     # a slice of word tokens
    sents = gutenberg.sents(book)
    sents[1:20]     # a slice of sentences