本文整理汇总了Python中nltk.corpus.gutenberg.words函数的典型用法代码示例。如果您正苦于以下问题:Python words函数的具体用法?Python words怎么用?Python words使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了words函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: exercise_gutenberg
def exercise_gutenberg():
    """Explore the Gutenberg corpus.

    Lists the corpus file ids, inspects Jane Austen's *Emma*, then prints,
    for every file: average word length, average sentence length, and the
    average number of occurrences per vocabulary item, followed by the
    file id.
    """
    # Print the Gutenberg project's file list.
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma".
    emma = gutenberg.words("austen-emma.txt")
    # Length of the book in word tokens.
    print(len(emma))
    # Wrap the tokens in an nltk.Text to get concordance search.
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        num_chars = len(chars_list)   # total characters in the file
        num_words = len(words_list)   # total word tokens
        num_sents = len(sents_list)   # total sentences
        # Distinct vocabulary size, case-folded.
        num_vocab = len({w.lower() for w in words_list})
        # `//` keeps the Python 2 integer-division behaviour of the
        # original `/` on ints.
        print(num_chars // num_words, num_words // num_sents,
              num_words // num_vocab, file_id)
示例2: gutenberg
def gutenberg():
    """Print per-file Gutenberg statistics.

    For each corpus file, prints: average word length, average sentence
    length, average occurrences per vocabulary item, and the file id.
    """
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        # Read the word list once and reuse it; the original re-read the
        # corpus file a second time just to build the vocabulary.
        words = gutenberg.words(t)
        num_words = len(words)
        num_sents = len(gutenberg.sents(t))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)
示例3: page57
def page57():
    """Statistics from the Gutenberg corpora (NLTK book, page 57).

    Prints average word length, average sentence length, and average
    occurrences per vocabulary item for every corpus file.
    """
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        # Reuse one word list instead of reading the file twice.
        words = gutenberg.words(fileid)
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        # One print() call replaces the Python 2 trailing-comma
        # print-statement continuation of the original.
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例4: fun02
def fun02():
    """Print Gutenberg corpus statistics per file.

    Columns: average word length, average sentence length, number of
    times each vocabulary item appears in the text, file id.
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        # Single corpus read; the original called gutenberg.words() twice.
        words = gutenberg.words(fileid)
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        # average word length, average sentence length,
        # occurrences per vocabulary item, file id
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例5: for_print
def for_print():
    """Display three statistics for each Gutenberg text.

    Prints average word length, average sentence length, and the average
    number of occurrences per vocabulary item, followed by the file id.

    :return: None
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        words = gutenberg.words(fileid)   # read once, reuse below
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例6: ex2
def ex2():
    """Count word tokens and case-folded word types in Austen's Persuasion."""
    from nltk.corpus import gutenberg
    ap = gutenberg.words("austen-persuasion.txt")
    word_tokens = len(ap)
    # Case-fold before counting so "The" and "the" are one type.
    word_types = len({w.lower() for w in ap})
    print("#-word tokens=", word_tokens)
    print("#-word types=", word_types)
示例7: ex17
def ex17():
    """Print the 50 most frequent content words of Macbeth.

    Filters out stopwords, words of length <= 3, and non-alphabetic
    tokens before counting.
    """
    from nltk.corpus import gutenberg
    macbeth = gutenberg.words("shakespeare-macbeth.txt")
    stopwords = set(nltk.corpus.stopwords.words())
    fd = nltk.FreqDist(w for w in macbeth
                       if w.lower() not in stopwords
                       and len(w) > 3 and w.isalpha())
    # The original `fd.keys()[0:50]` breaks in Python 3 (dict views are
    # not sliceable) and modern NLTK no longer sorts keys() by frequency;
    # most_common() restores the intended by-frequency top-50.
    print([w for w, _ in fd.most_common(50)])
示例8: main
def main():
    """Plot word-frequency distributions for three corpora on log-log axes.

    For each corpus, plots the distribution over all words and the
    distribution with stopwords removed, in a distinct colour pair.
    """
    corpora = (
        (gutenberg.words(), 'red', 'orange'),
        (inaugural.words(), 'black', 'gray'),
        (reuters.words(categories='yen'), 'blue', 'green'),
    )
    for words, raw_color, filtered_color in corpora:
        without_stops = exclude_stopwords(words)
        pylab.plot(get_frequency_distribution(words), color=raw_color)
        pylab.plot(get_frequency_distribution(without_stops),
                   color=filtered_color)
    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
示例9: generateSentence
def generateSentence():
    """Build a short pseudo-random "tweet" from a randomly chosen corpus.

    Picks one of four corpora, jumps to a random position, scans forward
    to the first full stop (so the tweet starts at a sentence boundary),
    then joins up to 20 following tokens, gluing punctuation to the
    preceding word.

    :return: the assembled tweet string.
    """
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    else:
        text = movie_reviews.words()

    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*",
                   "[", "]", "‘", "“", "#"]
    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    startingWord = random.randint(0, (len(text) - 40))

    # Find the first "." at or after the random start.  range() replaces
    # the Python-2-only xrange(), and the scan is clamped to len(text):
    # the original ran to startingWord + len(text), which indexes past
    # the end of the corpus and raises IndexError.
    startOfWordIndex = 0
    for x in range(startingWord, len(text)):
        if text[x] == ".":
            startOfWordIndex = x
            break

    blank = ' '
    # Clamp the upper bound too; the original could overrun the corpus
    # when the sentence boundary sat near the end of the text.
    stop = min(startOfWordIndex + lengthOfTweet, len(text))
    for x in range(startOfWordIndex + 1, stop):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]
        else:
            tweetString = tweetString + blank + text[x]
    return tweetString
示例10: exercise2
def exercise2():
    """Report token and type counts for austen-persuasion.txt."""
    print()
    print("Exercise 2")
    words = gutenberg.words('austen-persuasion.txt')
    print("Number of word tokens in the text austen-persuasion.txt: %d" % len(words))
    print("Number of word-types in the text austen-persuasion.txt: %d" % len(set(words)))
    # Dumps the whole vocabulary set; large but intentional in the exercise.
    print(set(words))
    print()
示例11: fun01
def fun01():
    """List Gutenberg file ids and show a concordance for "surprize" in Emma."""
    print(gutenberg.fileids())
    # Emma by Jane Austen.
    emma = gutenberg.words('austen-emma.txt')
    # How many word tokens it contains.
    print(len(emma))
    # concordance() prints its matches itself and returns None; the
    # original wrapped it in `print`, which emitted a spurious "None" line.
    Text(emma).concordance("surprize")
示例12: find_word_probability
def find_word_probability(CORPUS):
    """Build a bigram conditional frequency distribution for a corpus file.

    Each word is counted under the condition of the word that precedes it;
    the very first token is conditioned on ``None``.

    :param CORPUS: Gutenberg corpus file id to read words from.
    :return: the populated ``ConditionalFreqDist``.
    """
    cfd = ConditionalFreqDist()
    previous = None
    for current in gutenberg.words(CORPUS):
        # Tally `current` as a successor of the word before it.
        cfd[previous][current] += 1
        previous = current
    return cfd
示例13: main
def main():
    """Load valid-word lists, write a sorted word file, and print anagrams."""
    loader = WordLoader()
    # Accept spellings from both British and American aspell dictionaries.
    for dialect in ("en_GB", "en_US"):
        loader.load_valid_words_from_aspell(dialect)
    all_words = brown.words() + gutenberg.words()
    loader.write_sorted_words(all_words, 'sorted_words.txt')
    print_anagrams(loader.sorted_words, all_words)
示例14: gutenberg
def gutenberg():
    """Tour the Gutenberg corpus.

    Shows Emma's word count, finds Macbeth's longest sentence, and prints
    per-file statistics (average word length, average sentence length,
    occurrences per vocabulary item).
    """
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print(len(emma))
    # NOTE(review): this function shadows a module-level `gutenberg` name;
    # the bare references below assume the corpus reader is still bound to
    # that name in the enclosing scope — verify against the file's imports.
    print(gutenberg.fileids())
    emma = gutenberg.words('austen-emma.txt')
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    # Generator expression avoids materialising a throwaway list.
    longest_len = max(len(s) for s in macbeth_sentences)
    [s for s in macbeth_sentences if len(s) == longest_len]
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        words = gutenberg.words(fileid)   # read once, reuse for vocab
        num_words = len(words)
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len({w.lower() for w in words})
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
示例15: structure
def structure():
    """Access one Gutenberg text at three granularities.

    Demonstrates the raw-character, word-token, and sentence views of the
    same file; the slices are evaluated and discarded, as in the tutorial.
    """
    book = "burgess-busterbrown.txt"
    raw = gutenberg.raw(book)
    raw[1:20]       # a slice of raw characters
    words = gutenberg.words(book)
    words[1:20]     # a slice of word tokens
    sents = gutenberg.sents(book)
    sents[1:20]     # a slice of sentences