

Python LazyCorpusLoader.words Method Code Examples

This article collects typical usage examples of the Python method nltk.corpus.util.LazyCorpusLoader.words. If you are wondering what LazyCorpusLoader.words does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the class that provides this method, nltk.corpus.util.LazyCorpusLoader.


The following presents 14 code examples of the LazyCorpusLoader.words method, sorted by popularity by default.
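Before turning to the examples, here is a minimal sketch of how LazyCorpusLoader.words is typically used. It is not taken from any of the projects below; it assumes the standard 'gutenberg' plain-text corpus has been downloaded via nltk.download, and relies only on the loader's documented behaviour of building the underlying corpus reader on first access.

# Minimal sketch (assumption: the 'gutenberg' corpus is available under nltk_data)
import nltk
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.corpus.util import LazyCorpusLoader

nltk.download('gutenberg', quiet=True)

# Nothing is read from disk yet; the real reader is created on first attribute access.
gutenberg = LazyCorpusLoader('gutenberg', PlaintextCorpusReader,
                             r'(?!\.).*\.txt', encoding='latin1')

print(gutenberg.fileids()[:3])                    # e.g. ['austen-emma.txt', ...]
print(gutenberg.words()[:10])                     # first ten tokens of the whole corpus
print(len(gutenberg.words('austen-emma.txt')))    # token count for a single file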

Example 1: demo

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join( knbc.words()[:100] ))

    print('\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] ))

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print('\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] ))

    print('\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] ))
Developer ID: Arttii, Project: TextBlob, Lines of code: 31, Source file: knbc.py

Example 2: demo

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp")

    print knbc.fileids()[:10]
    print "".join(knbc.words()[:100])

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    print "\n".join(" ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent) for sent in knbc.tagged_sents()[0:2])
Developer ID: ongxuanhong, Project: jazzparser-master-thesis, Lines of code: 30, Source file: knbc.py

Example 3: test

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def test():

    from nltk.corpus.util import LazyCorpusLoader
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Developer ID: DrDub, Project: nltk, Lines of code: 11, Source file: knbc.py

Example 4: test

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
    assert isinstance(knbc.words()[0], basestring)
    assert isinstance(knbc.sents()[0][0], basestring)
    assert type(knbc.tagged_words()[0]) == tuple
    assert type(knbc.tagged_sents()[0][0]) == tuple
Developer ID: ongxuanhong, Project: jazzparser-master-thesis, Lines of code: 11, Source file: knbc.py

Example 5: demo

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print "/".join(jeita.words()[22100:22140])

    print "\nEOS\n".join(
        ["\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173]]
    )
Developer ID: Kuew, Project: hashtagify, Lines of code: 13, Source file: chasen.py

Example 6: demo

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():
    
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print '/'.join( jeita.words()[22100:22140] ) 


    print '\nEOS\n'.join(['\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]])
Developer ID: Akira55, Project: nltk, Lines of code: 14, Source file: chasen.py

Example 7: dictionary_backoff

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def dictionary_backoff(option_tone, backoff):
    '''Creates a dictionary according to the option: tonal/nontonal'''
    if option_tone == "tonal":
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba.txt"])
        entries = reader.entries("bamadaba.txt")  # tonal
        words = reader.words("bamadaba.txt")  # tonal
        pos = reader.words("bamadaba.txt", key="ps")  # tonal
    else:
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba_non_tonal.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba_non_tonal.txt"])
        entries = reader.entries("bamadaba_non_tonal.txt")  # non-tonal
        words = reader.words("bamadaba_non_tonal.txt")  # non-tonal
        pos = reader.words("bamadaba_non_tonal.txt", key="ps")  # non-tonal

    own_model = get_alt_pos(entries, pos, reader, option_tone)
    print("Dictionary created")
    dic = UnigramTagger(model=own_model, backoff=backoff)
    return dic
Developer ID: Batene, Project: Bamanankan, Lines of code: 23, Source file: create_reader.py

Example 8: main

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # replace one randomly chosen word with a random vocabulary word
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # corrupt a single letter of another randomly chosen word
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter+1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Developer ID: BabisK, Project: M36209P, Lines of code: 52, Source file: ex3.py

Example 9: main

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
        bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
        in
        trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_trigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
#......... part of the code omitted here .........
Developer ID: BabisK, Project: M36209P, Lines of code: 103, Source file: ex4.py

Example 10: loadClassifier

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle") 
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
#        classifier = pickleLoad("originalnaivebayes.pickle")
#        MNB_classifier = pickleLoad("MNB_classifier.pickle")
#        BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
#        LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
#        SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
#        LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
#        
#        voted_classifier = VoteClassifier(classifier,
##                                  NuSVC_classifier,
#                                  LinearSVC_classifier,
#                                  SGDClassifier_classifier,
#                                  MNB_classifier,
#                                  BernoulliNB_classifier,
#                                  LogisticRegression_classifier)
        voted_classifier= pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
                'criticas_cine', CategorizedPlaintextCorpusReader,
                r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
                encoding='utf-8')
#        criticas_cine = LazyCorpusLoader(
#                'criticas_cine_neu', CategorizedPlaintextCorpusReader,
#                r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
#                encoding='utf-8')
            
        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
#            
#        document_pos = [(list(criticas_cine.words(fileid)), "pos")
#                        for fileid in criticas_cine.fileids("pos")]
#        document_neg = [(list(criticas_cine.words(fileid)), "neg")
#                        for fileid in criticas_cine.fileids("neg")]
#        document_neu = [(list(criticas_cine.words(fileid)), "neu")
#                        for fileid in criticas_cine.fileids("neu")]
        
        random.shuffle(documents)
        
#        random.shuffle(document_pos)
#        random.shuffle(document_neg)
#        random.shuffle(document_neu)
        
        all_words = []
        
        for w in criticas_cine.words():
            all_words.append(w.lower())
        
#        for w in criticas_cine.words():
#            if not is_filtered(w.lower()):
#                all_words.append(w.lower())
#        
        all_words = nltk.FreqDist(all_words)
        
        #print (all_words.most_common(50))
        
        # Filtering by type of word
        
#        for sample in all_words:
                    
        
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")
        
        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        
#        featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
#        featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
#        featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]
        
#        training_set = featuresetpos[:1000]
#        training_set.extend(featuresetneg[:1000])
#        training_set.extend(featuresetneu[:1000])
#        testing_set = featuresetpos[1000:1273]
#        testing_set.extend(featuresetneg[1000:])
#        testing_set.extend(featuresetneu[1000:])

#        pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
#        neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
#        neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]
                
        training_set = featuresets[:2000]
        testing_set =  featuresets[2000:]
        classifier = nltk.NaiveBayesClassifier.train(training_set)
#        pickleDump(classifier, "originalnaivebayes.pickle")
    
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)
        
        accuracy = Accuracy(classifier,testing_set)
        print(accuracy)
        # order: neu, neg, pos
#        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
#        print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
#......... part of the code omitted here .........
Developer ID: amador2001, Project: ObservatorioHF, Lines of code: 103, Source file: analisys.py

Example 11: LazyCorpusLoader

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
    # build features dictionary
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in doc_words_set)
    return features


interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]


print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)

print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d,c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
Developer ID: cwi17857, Project: opp-text-classifier, Lines of code: 32, Source file: build_classifier_a.py

Example 12: pickleObject

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]

def pickleObject():
	obj = classifier
	savefile = open('classifier.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

def pickleFeats():
	obj = words_in_sentence
	savefile = open('feats.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

files_in_neg = movie_reviews.fileids('neg')
files_in_pos = movie_reviews.fileids('pos')

neg_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'neg') for f in files_in_neg]
pos_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'pos') for f in files_in_pos]

negative_first_test_pos = int(len(neg_data)*train_test_ratio)
positive_first_test_pos = int(len(pos_data)*train_test_ratio)

train_data = neg_data[:negative_first_test_pos] + pos_data[:positive_first_test_pos]
test_data = neg_data[negative_first_test_pos:] + pos_data[positive_first_test_pos:]
print 'training on %d paragraphs and testing on %d paragraphs' % (len(train_data), len(test_data))

classifier = NaiveBayesClassifier.train(train_data)
print 'accuracy:', nltk.classify.util.accuracy(classifier, test_data)
classifier.show_most_informative_features(20)


pickleFeats()
Developer ID: asketak, Project: IB030-sentiment, Lines of code: 32, Source file: classifier.py

Example 13: _knbc_fileids_sort

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
#!/usr/bin/env python
# encoding: utf-8
# Sample of loading the KNBC corpus with NLTK

from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader

def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

# Load the corpus
root = nltk.data.find('corpora/knbc/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')
#print "fileids :", knbc.fileids()
print "words :", pp(knbc.words()[:10])
print "parsed_sents :", str(knbc.parsed_sents()[0])
print "tagged_words :", pp(knbc.tagged_words()[:5])

Developer ID: amumu, Project: nokuno, Lines of code: 22, Source file: knbc.py

Example 14: ChasenCorpusReader

# Required module: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or alternatively: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
#!/usr/bin/env python
# encoding: utf-8
# Sample of loading the JEITA corpus with NLTK

from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader

# Load the corpus
#jeita = ChasenCorpusReader('home/ubuntu/nltk_data/corpora/jeita', r'.*chasen', encoding='utf-8')
jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
print pp(jeita.words()[:10])
print pp(jeita.tagged_sents()[1])
Developer ID: zaakya666, Project: test, Lines of code: 15, Source file: nltk_sample.py


Note: The nltk.corpus.util.LazyCorpusLoader.words examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.