

Python LazyCorpusLoader.sents Method Code Examples

This article collects typical usage examples of the Python method nltk.corpus.util.LazyCorpusLoader.sents. If you are wondering what LazyCorpusLoader.sents does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore the broader usage of the class it belongs to, nltk.corpus.util.LazyCorpusLoader.


Five code examples of the LazyCorpusLoader.sents method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code examples.
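As a quick orientation, here is a minimal, self-contained sketch of the pattern all five examples share: build a LazyCorpusLoader for a corpus directory, then call .sents() to get sentences as lists of token strings. It assumes the KNB corpus has already been installed into nltk_data (for example via nltk.download('knbc')):

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.knbc import KNBCorpusReader

# The loader defers reading the corpus from disk until first access.
knbc = LazyCorpusLoader(
    'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')

# .sents() returns the corpus as a list of tokenized sentences.
print(knbc.sents()[0])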

Example 1: test

# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import sents [as alias]
def test():

    from six import string_types  # str on Python 3, basestring on Python 2
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader.knbc import KNBCorpusReader

    # Lazily load the Japanese KNB corpus (EUC-JP encoded) and sanity-check
    # the types returned by the reader's accessor methods.
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Author: DrDub | Project: nltk | Lines: 11 | Source: knbc.py

Example 2: test

# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import sents [as alias]
def test():

    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader.knbc import KNBCorpusReader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
    # `basestring` in the original is Python 2 only; `str` is the
    # Python 3 equivalent.
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Author: ongxuanhong | Project: jazzparser-master-thesis | Lines: 11 | Source: knbc.py

Example 3: read_knbc

# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import sents [as alias]
import re

import nltk.data
from nltk.data import FileSystemPathPointer
from nltk.corpus.reader.knbc import KNBCorpusReader
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.util import LazyCorpusLoader


def read_knbc(train_file, test_file, reference_file):

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    # _knbc_fileids_sort and the write_* helpers are defined elsewhere
    # in the source project.
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    sentences = knbc.sents()

    # Note: [4000:-1] drops the corpus's final sentence; [4000:] would keep it.
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Author: LeopoldC | Project: cross-language_IR | Lines: 16 | Source: knbc_to_xml.py
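The write_train, write_test, and write_reference helpers are defined elsewhere in the project and are not shown above. A minimal sketch of what such a helper might look like, assuming one space-joined sentence per line (the format is an assumption, not the project's actual code):

def write_train(sentences, path):
    # Hypothetical helper: write each tokenized sentence as one
    # space-joined line; the real project may use another format
    # (XML, judging by the source file's name).
    with open(path, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(' '.join(sentence) + '\n')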

Example 4: main

# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import sents [as alias]
def main():
    # This excerpt omits the module-level imports (nltk, random, copy,
    # string, itertools.chain); ngrams_sentences() and viterbi() are
    # helper functions defined elsewhere in the source file (ex3.py).
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the English Europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    # Corrupt each test sentence: swap one word for a random vocabulary word,
    # then mutate one letter of a (possibly different) randomly chosen word.
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = (sentence[word][:letter] + random.choice(lower_case_letters)
                          + sentence[word][letter + 1:])

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Author: BabisK | Project: M36209P | Lines: 52 | Source: ex3.py
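Both this example and the next rely on an ngrams_sentences helper that is not shown. A minimal sketch under the assumption that it simply maps each sentence to its list of n-gram tuples (this is a guess at the helper's behavior, not the author's code):

from nltk.util import ngrams

def ngrams_sentences(sentences, n):
    # Hypothetical helper: for each sentence, return its n-grams as tuples,
    # e.g. n=2 turns ['a', 'b', 'c'] into [('a', 'b'), ('b', 'c')].
    return [list(ngrams(sentence, n)) for sentence in sentences]

This matches how the examples consume the result: chain.from_iterable(...) flattens the per-sentence lists, and each bigram then unpacks as (f, s).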

Example 5: main

# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import sents [as alias]
def main():
    # This excerpt omits the module-level imports (nltk, matplotlib, random,
    # itertools.chain, collections.defaultdict); ngrams_sentences() is a
    # helper defined elsewhere in the source file (ex4.py).
    matplotlib.use('Qt5Agg')  # select the backend before importing pyplot
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the English Europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    # Track the longest sentence in the bigram test set.
    x = 0
    s = None
    for sentence in bigrams_test:
        if len(sentence) > x:
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in bigram_length_probabilities.items()}
    average_trigram_length_probabilities = {
        length: sum(probs) / float(len(probs))
        for length, probs in trigram_length_probabilities.items()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        # Score the random sentences under the bigram model.
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)  # the ymin= keyword was removed in newer matplotlib
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
#......... part of the code omitted here .........
Author: BabisK | Project: M36209P | Lines: 103 | Source: ex4.py
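The omitted tail of this example appears to generate text from the seed word with the bigram model. One plausible way to do that, sketched under that assumption (not the author's original code; ProbDistI.max() returns the most probable sample):

# Hypothetical continuation: greedily extend the seed with the most
# probable next word under the bigram model.
seed = 'this'
generated = [seed]
for i in range(30):
    generated.append(cpd_bigram[(generated[-1],)].max())
print(' '.join(generated))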


Note: the nltk.corpus.util.LazyCorpusLoader.sents method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by many programmers, and copyright in the source code remains with the original authors. Please consult the corresponding project's License before distributing or reusing the code; do not republish without permission.