This article collects typical usage examples of the Python method nltk.corpus.util.LazyCorpusLoader.words. If you are wondering what LazyCorpusLoader.words does or how to use it, the curated examples below should help; you can also read further about its containing class, nltk.corpus.util.LazyCorpusLoader.
The following section shows 14 code examples of LazyCorpusLoader.words, sorted by popularity by default.
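As a quick orientation before the examples, the sketch below shows the pattern they all share: construct a LazyCorpusLoader for a corpus installed under nltk_data, then call .words() (or another reader method) on it; the corpus is only read from disk at that point. The Gutenberg corpus is used here purely as an illustration.

# A minimal sketch of LazyCorpusLoader.words, assuming the 'gutenberg' corpus
# has been downloaded (e.g. via nltk.download('gutenberg')).
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import PlaintextCorpusReader

gutenberg = LazyCorpusLoader(
    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')

print(gutenberg.fileids()[:3])   # nothing is read from disk until this first access
print(gutenberg.words()[:20])    # the first 20 tokens of the corpus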
Example 1: demo
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import re

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    # Added so the snippet runs on its own; in the original excerpt these
    # names come from the enclosing module.
    from nltk.corpus.reader import KNBCorpusReader
    from nltk.corpus.reader.util import find_corpus_fileids
    from nltk.data import FileSystemPathPointer

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print('\n'.join(' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
                    for sent in knbc.tagged_sents()[0:2]))
Example 2: demo
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp")

    print knbc.fileids()[:10]
    print "".join(knbc.words()[:100])

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    print "\n".join(" ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent) for sent in knbc.tagged_sents()[0:2])
Example 3: test
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')

    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example 4: test
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")

    assert isinstance(knbc.words()[0], basestring)
    assert isinstance(knbc.sents()[0][0], basestring)
    assert type(knbc.tagged_words()[0]) == tuple
    assert type(knbc.tagged_sents()[0][0]) == tuple
Example 5: demo
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")

    print "/".join(jeita.words()[22100:22140])

    print "\nEOS\n".join(
        ["\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173]]
    )
Example 6: demo
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    print '/'.join(jeita.words()[22100:22140])

    print '\nEOS\n'.join(['\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]])
Example 7: dictionary_backoff
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def dictionary_backoff(option_tone, backoff):
    '''Creates a dictionary according to the option: tonal/non-tonal'''
    if option_tone == "tonal":
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba.txt"])
        entries = reader.entries("bamadaba.txt")                  # tonal
        words = reader.words("bamadaba.txt")                      # tonal
        pos = reader.words("bamadaba.txt", key="ps")              # tonal
    else:
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba_non_tonal.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba_non_tonal.txt"])
        entries = reader.entries("bamadaba_non_tonal.txt")        # non-tonal
        words = reader.words("bamadaba_non_tonal.txt")            # non-tonal
        pos = reader.words("bamadaba_non_tonal.txt", key="ps")    # non-tonal
    own_model = get_alt_pos(entries, pos, reader, option_tone)
    print("Dictionary created")
    dic = UnigramTagger(model=own_model, backoff=backoff)
    return dic
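The function above returns a UnigramTagger whose model comes from the Bambara dictionary and which defers words missing from that model to the supplied backoff tagger. A hypothetical usage sketch follows, assuming the Bambara resources that BambaraTagging expects are installed; the sample tokens and the 'n' default tag are purely illustrative.

from nltk.tag import DefaultTagger

# Final fallback in the backoff chain; any tag could be used here.
fallback = DefaultTagger('n')
dict_tagger = dictionary_backoff("tonal", backoff=fallback)

# Tag a (hypothetical) tokenized Bambara sentence; tokens not found in the
# dictionary model are handed to the fallback tagger.
print(dict_tagger.tag(["ne", "bɛ", "taa"]))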
Example 8: main
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                      for word in sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
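Examples 8 and 9 call an ngrams_sentences helper that is not included in the excerpt. Judging from how it is used (each sentence becomes a list of n-gram tuples, which is then chained or iterated per sentence), one plausible sketch is the following; this is an assumption, not the original project's implementation.

from nltk.util import ngrams

def ngrams_sentences(sentences, n):
    # Hypothetical helper: one list of n-gram tuples per input sentence.
    return [list(ngrams(sentence, n)) for sentence in sentences]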
Example 9: main
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt
    download('punkt')

    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
                  for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    # Track the longest test sentence
    x = 0
    s = None
    for sentence in bigrams_test:
        if len(sentence) > x:
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)]
                        for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        # ......... part of the code omitted .........
Example 10: loadClassifier
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle")
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
        # classifier = pickleLoad("originalnaivebayes.pickle")
        # MNB_classifier = pickleLoad("MNB_classifier.pickle")
        # BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
        # LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
        # SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
        # LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
        #
        # voted_classifier = VoteClassifier(classifier,
        #                                   ## NuSVC_classifier,
        #                                   LinearSVC_classifier,
        #                                   SGDClassifier_classifier,
        #                                   MNB_classifier,
        #                                   BernoulliNB_classifier,
        #                                   LogisticRegression_classifier)
        voted_classifier = pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
            'criticas_cine', CategorizedPlaintextCorpusReader,
            r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
            encoding='utf-8')
        # criticas_cine = LazyCorpusLoader(
        #     'criticas_cine_neu', CategorizedPlaintextCorpusReader,
        #     r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
        #     encoding='utf-8')

        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
        # document_pos = [(list(criticas_cine.words(fileid)), "pos")
        #                 for fileid in criticas_cine.fileids("pos")]
        # document_neg = [(list(criticas_cine.words(fileid)), "neg")
        #                 for fileid in criticas_cine.fileids("neg")]
        # document_neu = [(list(criticas_cine.words(fileid)), "neu")
        #                 for fileid in criticas_cine.fileids("neu")]

        random.shuffle(documents)
        # random.shuffle(document_pos)
        # random.shuffle(document_neg)
        # random.shuffle(document_neu)

        all_words = []
        for w in criticas_cine.words():
            all_words.append(w.lower())
        # for w in criticas_cine.words():
        #     if not is_filtered(w.lower()):
        #         all_words.append(w.lower())

        all_words = nltk.FreqDist(all_words)
        # print(all_words.most_common(50))

        # Filtering by type of word
        # for sample in all_words:
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")

        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        # featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
        # featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
        # featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]

        # training_set = featuresetpos[:1000]
        # training_set.extend(featuresetneg[:1000])
        # training_set.extend(featuresetneu[:1000])
        # testing_set = featuresetpos[1000:1273]
        # testing_set.extend(featuresetneg[1000:])
        # testing_set.extend(featuresetneu[1000:])

        # pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
        # neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
        # neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]

        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]

        classifier = nltk.NaiveBayesClassifier.train(training_set)
        # pickleDump(classifier, "originalnaivebayes.pickle")

        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy) * 100)

        accuracy = Accuracy(classifier, testing_set)
        print(accuracy)

        # order: neu, neg, pos
        # print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
        # print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
        # ......... part of the code omitted .........
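loadClassifier relies on pickleLoad and pickleDump helpers that are not part of this excerpt. A minimal sketch of what they might look like, assuming they simply wrap the standard pickle module around files in the pickled_algos directory (the original project's helpers may differ):

import os
import pickle

def pickleDump(obj, filename):
    # Hypothetical helper: serialize obj into pickled_algos/<filename>.
    with open(os.path.join("pickled_algos", filename), "wb") as f:
        pickle.dump(obj, f)

def pickleLoad(filename):
    # Hypothetical helper: load an object back from pickled_algos/<filename>.
    with open(os.path.join("pickled_algos", filename), "rb") as f:
        return pickle.load(f)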
Example 11: LazyCorpusLoader
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
    # build features dictionary
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in doc_words_set)
    return features


interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]

print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)

print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d, c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
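The top of the helper whose tail opens Example 11 was cut off in the excerpt. Based on how it is called later, document_features(d, top_words), one hypothetical reconstruction is shown below; the actual signature and any extra preprocessing in the original project may differ.

def document_features(doc_words, top_words):
    # Hypothetical reconstruction of the truncated helper above.
    doc_words_set = set(doc_words)
    # build features dictionary
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in doc_words_set)
    return features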
Example 12: pickleObject
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
def pickleObject():
    obj = classifier
    savefile = open('classifier.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)


def pickleFeats():
    obj = words_in_sentence
    savefile = open('feats.pickle', 'w')
    cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)


files_in_neg = movie_reviews.fileids('neg')
files_in_pos = movie_reviews.fileids('pos')

neg_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'neg') for f in files_in_neg]
pos_data = [(words_in_sentence(movie_reviews.words(fileids=[f])), 'pos') for f in files_in_pos]

negative_first_test_pos = int(len(neg_data) * train_test_ratio)
positive_first_test_pos = int(len(pos_data) * train_test_ratio)

train_data = neg_data[:negative_first_test_pos] + pos_data[:positive_first_test_pos]
test_data = neg_data[negative_first_test_pos:] + pos_data[positive_first_test_pos:]

print 'training on %d paragraphs and testing on %d paragraphs' % (len(train_data), len(test_data))

classifier = NaiveBayesClassifier.train(train_data)

print 'accuracy:', nltk.classify.util.accuracy(classifier, test_data)
classifier.show_most_informative_features(20)

pickleFeats()
Example 13: _knbc_fileids_sort
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
#!/usr/bin/env python
# encoding: utf-8
# Sample script that reads the KNBC corpus with NLTK
from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader


def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))


# Load the corpus
root = nltk.data.find('corpora/knbc/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

# print "fileids :", knbc.fileids()
print "words :", pp(knbc.words()[:10])
print "parsed_sents :", str(knbc.parsed_sents()[0])
print "tagged_words :", pp(knbc.tagged_words()[:5])
Example 14: ChasenCorpusReader
# Required import: from nltk.corpus.util import LazyCorpusLoader [as alias]
# Or: from nltk.corpus.util.LazyCorpusLoader import words [as alias]
#!/usr/bin/env python
# encoding: utf-8
# Sample script that reads the JEITA corpus with NLTK
from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader

# Load the corpus
# jeita = ChasenCorpusReader('home/ubuntu/nltk_data/corpora/jeita', r'.*chasen', encoding='utf-8')
jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

print pp(jeita.words()[:10])
print pp(jeita.tagged_sents()[1])