本文整理汇总了Python中nltk.corpus.util.LazyCorpusLoader.sents方法的典型用法代码示例。如果您正苦于以下问题:Python LazyCorpusLoader.sents方法的具体用法?Python LazyCorpusLoader.sents怎么用?Python LazyCorpusLoader.sents使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.util.LazyCorpusLoader的用法示例。
在下文中一共展示了LazyCorpusLoader.sents方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import sents [as 别名]
def test():
    """Smoke-test the KNB corpus reader through LazyCorpusLoader.

    Checks that word/sentence accessors yield strings and that the
    tagged accessors yield (token, tag)-style tuples.
    """
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp'
    )

    # Plain accessors produce text tokens.
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    # Tagged accessors produce tuples.
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
示例2: test
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import sents [as 别名]
def test():
    """Smoke-test the KNB corpus reader through LazyCorpusLoader.

    Checks that word/sentence accessors yield strings and that the
    tagged accessors yield (token, tag)-style tuples.
    """
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )

    # `basestring` existed only in Python 2; `str` is the Python 3 text type.
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    # isinstance is the idiomatic type check (and accepts subclasses),
    # unlike the original `type(x) == tuple` comparison.
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
示例3: read_knbc
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import sents [as 别名]
def read_knbc(train_file, test_file, reference_file):
    """Split the KNB corpus sentences into train/test/reference files.

    The first 4000 sentences go to *train_file*; the remainder — minus
    the very last sentence, per the original ``[4000:-1]`` slice
    (NOTE(review): possibly an off-by-one; confirm intent) — is written
    to both *test_file* and *reference_file*.
    """
    root = nltk.data.find('corpora/knbc/corpus1')
    pointer = FileSystemPathPointer(root)
    # Keep only fileids that look like numbered corpus documents
    # (e.g. "1-1-01-01"), then order them with the corpus-specific key.
    doc_ids = sorted(
        (fid for fid in find_corpus_fileids(pointer, ".*")
         if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid)),
        key=_knbc_fileids_sort,
    )
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, doc_ids,
                            encoding='euc-jp')
    sentences = knbc.sents()
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
示例4: main
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import sents [as 别名]
def main():
# matplotlib.use('Qt5Agg')
# import matplotlib.pyplot as plt
download('punkt')
# Download and load the english europarl corpus
downloader.download('europarl_raw')
english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')
words = english.words()
# Calculate the frequency distribution of the words in the corpus
word_frequency_distribution = FreqDist([word.lower() for word in words])
# Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
sentences = [
['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
sentence] + ['end0']
for sentence in english.sents()]
# create train and test dataset
train = sentences[0:int(len(sentences) * 0.8)]
test = sentences[int(len(sentences) * 0.8):]
vocabulary = list(word_frequency_distribution)
vocabulary_length = word_frequency_distribution.B()
# Calculate bigrams
bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
# Calculate the conditional frequency distribution for bigrams
bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
# Calculate the conditional probability distribution for bigrams
cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
lower_case_letters = string.ascii_lowercase
error_test = copy.deepcopy(test)
for sentence in error_test:
word = random.randrange(1, len(sentence)-1)
sentence[word] = random.choice(vocabulary)
word = random.choice(sentence[1:-2])
word = random.randrange(1, len(sentence) - 1)
letter = random.randrange(0, len(sentence[word]))
sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter+1:]
corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)
print('Corrected:{}'.format(corrected))
print('Original:{}'.format(test[25]))
示例5: main
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import sents [as 别名]
def main():
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
download('punkt')
# Download and load the english europarl corpus
downloader.download('europarl_raw')
english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')
words = english.words()
# Calculate the frequency distribution of the words in the corpus
word_frequency_distribution = FreqDist([word.lower() for word in words])
# Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
for sentence in english.sents()]
# create train and test dataset
train = sentences[0:int(len(sentences) * 0.8)]
test = sentences[int(len(sentences) * 0.8):]
vocabulary_length = word_frequency_distribution.B()
# Calculate bigrams and trigrams
bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))
# Calculate the conditional frequency distributions for bigrams and trigrams
bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])
# Calculate the conditional probability distributions for bigrams and trigrams
cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)
bigrams_test = ngrams_sentences(test, 2)
bigram_length_probabilities = defaultdict(list)
for sentence in bigrams_test:
logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
logprob = sum(logprob)
bigram_length_probabilities[len(sentence)].append(logprob)
x = 0
s = None
for sentence in bigrams_test:
if (len(sentence) > x):
x = len(sentence)
s = sentence
trigrams_test = ngrams_sentences(test, 3)
trigram_length_probabilities = defaultdict(list)
for sentence in trigrams_test:
logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
logprob = sum(logprob)
trigram_length_probabilities[len(sentence)].append(logprob)
average_bigram_length_probabilities = {
length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
bigram_length_probabilities.keys()}
average_trigram_length_probabilities = {
length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
in
trigram_length_probabilities.keys()}
random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
bigram_length_probabilities.keys()]
bigrams_random = ngrams_sentences(random_sentences, 2)
random_bigram_length_probabilities = defaultdict(list)
for sentence in bigrams_random:
logprob = [cpd_trigram[(w1,)].logprob(w2) for w1, w2 in sentence]
logprob = sum(logprob)
random_bigram_length_probabilities[len(sentence)].append(logprob)
trigrams_random = ngrams_sentences(random_sentences, 3)
random_trigram_length_probabilities = defaultdict(list)
for sentence in trigrams_random:
logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
logprob = sum(logprob)
random_trigram_length_probabilities[len(sentence)].append(logprob)
bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
list(average_bigram_length_probabilities.keys()), color='red')
trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
list(average_trigram_length_probabilities.keys()), color='blue')
random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
list(random_bigram_length_probabilities.keys()), color='green')
random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
list(random_trigram_length_probabilities.keys()), color='black')
plt.xlabel('$log_2(P(W_1^k))$')
plt.ylabel('$k$')
plt.legend((bigram, trigram, random_bigram, random_trigram),
('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
plt.ylim(ymin=0)
# plt.show()
plt.savefig('logprob')
seed = 'this'
for i in range(30):
#.........这里部分代码省略.........