This article collects typical usage examples of the Python function nltk.corpus.gutenberg.raw. If you have been wondering what exactly raw does, how to call it, and what real-world usage looks like, the curated code examples here should help.
The sections below show 15 code examples of the raw function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
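Before the repository examples, here is a minimal, self-contained sketch of the function itself: gutenberg.raw(fileid) returns the complete text of one corpus file as a single string. This assumes the corpus data has already been fetched once with nltk.download('gutenberg').

import nltk
from nltk.corpus import gutenberg

# nltk.download('gutenberg')  # fetch the corpus data on first use
emma = gutenberg.raw('austen-emma.txt')  # the whole book as one string
print(len(emma))   # total character count
print(emma[:75])   # the opening of the file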
Example 1: test_austen
def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]
    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print('cross entropy, additive smoothing:')
    print('emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1))
    print('emma to persuasion: %0.8f' % cross_entropy(model1, test2))
    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print('cross entropy, Kneser-Ney smoothing:')
    print('emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1))
    print('emma to persuasion: %0.8f' % cross_entropy(model2, test2))
    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print('cross entropy, simple Good-Turing smoothing:')
    print('emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1))
    print('emma to persuasion: %0.8f' % cross_entropy(model3, test2))
    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print('cross entropy, Katz smoothing:')
    print('emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1))
    print('emma to persuasion: %0.8f' % cross_entropy(model4, test2))
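Two notes on this example: loading 'tokenizers/punkt/english.pickle' requires the punkt resource (nltk.download('punkt')), and tokenize, preprocess, cross_entropy, and the four smoothing classes come from the example's own repository, not from NLTK. A minimal sketch of just the sentence-splitting step, using NLTK's equivalent nltk.sent_tokenize convenience wrapper:

import nltk
from nltk.corpus import gutenberg

# nltk.download('punkt')  # Punkt sentence-tokenizer models, first use only
sents = nltk.sent_tokenize(gutenberg.raw('austen-emma.txt'))
print(len(sents))  # number of detected sentences
print(sents[0])    # the first sentence of the raw text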
Example 2: test
def test():
    from nltk.corpus import gutenberg
    emma = gutenberg.raw('austen-emma.txt')
    print(len(emma))
    ex = createexercise(emma, pos='v', last_index=False, fast=True)
    print(len(ex))
Example 3: load_moby_dick_analysis
def load_moby_dick_analysis():
    from nltk.corpus import gutenberg
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')
    try:
        moby_dick_doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851'
        )
        odm_session.flush()
    except DuplicateKeyError:
        moby_dick_doc = Document.query.get(name='moby dick')
    for sum_threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % sum_threshold)
        analysis = get_optimal_window_size(tokens, window_sizes, 20, sum_threshold=sum_threshold)[1]
        anal_dict = analysis.encode()
        window_size = anal_dict['window_size']
        log.debug("Best result = %s" % window_size)
        InformationValueResult(
            window_size=window_size,
            threshold=sum_threshold,
            document=moby_dick_doc,
            iv_words=anal_dict['top_words'],
            max_iv=anal_dict['max_iv'],
            sum_iv=anal_dict['sum_iv']
        )
        odm_session.flush()
Example 4: exercise_gutenberg
def exercise_gutenberg():
    import nltk
    from nltk.corpus import gutenberg
    # print the list of files in the Gutenberg corpus
    print(gutenberg.fileids())
    # pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # check the length of the book
    print(len(emma))
    # wrap it as an nltk.Text for corpus methods such as concordance
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")
    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # total number of characters in the file
        num_chars = len(chars_list)
        # total number of words in the file
        num_words = len(words_list)
        # total number of sentences in the file
        num_sents = len(sents_list)
        # number of distinct words in the file
        num_vocab = len(set([w.lower() for w in words_list]))
        # print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), file_id)
Example 5: gutenberg
def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), t)
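Examples 4 and 5 (and 9-11 below) compute the same three ratios: average word length, average sentence length, and the average number of uses per vocabulary item. A compact Python 3 rendering of the same loop, with a set comprehension and floor division in place of the int() wrappers:

from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    n_chars = len(gutenberg.raw(fileid))
    n_words = len(gutenberg.words(fileid))
    n_sents = len(gutenberg.sents(fileid))
    n_vocab = len({w.lower() for w in gutenberg.words(fileid)})
    print(n_chars // n_words, n_words // n_sents, n_words // n_vocab, fileid)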
Example 6: handle
def handle(self, *args, **options):
    import os
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        # write the raw text of each book into its own directory
        with open(out_dir + os.sep + "sentences.txt", 'w') as f:
            f.write(gutenberg.raw(fileid))
Example 7: similarity_gutenberg
def similarity_gutenberg():
    from nltk.corpus import gutenberg
    for x in range(2, 6):
        a = []
        b = 0
        c = 0
        d = 1
        # pairwise similarity between every two books in the corpus
        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x), n_window(gutenberg.raw(ffid), x)))
            b += 1
        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Mean: " + str(c))
        print("Minimum: " + str(d))
Example 8: structure
def structure():
    from nltk.corpus import gutenberg
    raw = gutenberg.raw("burgess-busterbrown.txt")
    print(raw[1:20])    # a slice of characters from the raw string
    words = gutenberg.words("burgess-busterbrown.txt")
    print(words[1:20])  # a slice of words
    sents = gutenberg.sents("burgess-busterbrown.txt")
    print(sents[1:20])  # a slice of sentences, each a list of words
Example 9: for_print
def for_print():
    '''Print three statistics for each text.'''
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents), int(num_words / num_vocab), fileid)
Example 10: fun02
def fun02():
    """fun02"""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length, average sentence length,
        print(int(num_chars / num_words), int(num_words / num_sents), end=' ')
        # number of times each vocabulary item appears in the text
        print(int(num_words / num_vocab), fileid)
Example 11: page57
def page57():
    """Statistics from the Gutenberg corpus"""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print(int(num_chars / num_words), int(num_words / num_sents), end=' ')
        print(int(num_words / num_vocab), fileid)
Example 12: solve_p2_greedy
def solve_p2_greedy(file):
    from nltk.corpus import gutenberg
    lines = [l.lower().split("|")[1:-1] for l in open(file)]
    slices = slice(lines)
    n = 3
    # train a letter n-gram model on the first three Gutenberg books
    corpus = NgramLetterCorpus(n)
    for fileid in gutenberg.fileids()[:3]:
        corpus.update(gutenberg.raw(fileid))
    slices = unshred3(slices, corpus)
    print("FINAL: ")
    for l in linearize(slices):
        print("".join(l))
Example 13: test_moby_dick_window
def test_moby_dick_window(self):
    # make sure the windows together cover every token exactly once
    import math
    from nltk.corpus import gutenberg
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        number_of_windows = int(math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        self.assertEqual(count, total_number_of_tokens)
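Window is a class from the project under test, not an NLTK type; the assertion only holds if iterating the current_window-th window yields that fixed-size slice of the token list. A hypothetical stand-in with that behavior:

def window(tokens, size, i):
    # tokens of the i-th window; an empty slice past the end,
    # so the loop's extra iteration above is harmless
    return tokens[i * size:(i + 1) * size]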
Example 14: benchmark_sbd
def benchmark_sbd():
    import re
    import numpy as np
    from nltk.corpus import gutenberg
    ps = []
    rs = []
    f1s = []
    c = 0
    for fileid in gutenberg.fileids():
        c += 1
        copy_sents_gold = gutenberg.sents(fileid)
        sents_gold = [s for s in copy_sents_gold]
        for sent_i in range(len(sents_gold)):
            new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
            sents_gold[sent_i] = new_sent
        text = gutenberg.raw(fileid)
        sents_obtained = split_text(text)
        copy_sents_obtained = sents_obtained.copy()
        for sent_i in range(len(sents_obtained)):
            new_sent = [w.group()
                        for w in re.finditer(r'\w+', sents_obtained[sent_i])
                        if w.group().isalpha()]
            sents_obtained[sent_i] = new_sent
        c_common = 0
        for sent in sents_obtained:
            if sent in sents_gold:
                c_common += 1
        p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
        print('\n\n', fileid)
        print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
    print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps), np.std(ps)))
    print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs), np.std(rs)))
    print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s), np.std(f1s)))
    print(len(f1s))
    good_ps = [p for p in ps if p >= 0.8]
    good_rs = [r for r in rs if r >= 0.8]
    good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
    print('\nGood precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps), np.std(good_ps)))
    print('Good recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs), np.std(good_rs)))
    print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s), np.std(good_f1s)))
    print(len(good_f1s))
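get_prf is the benchmark's own helper; given the counts it receives, it presumably computes standard precision, recall, and F1. A sketch under that assumption:

def get_prf(n_common, n_obtained, n_gold):
    # precision: correct sentences among those obtained
    p = n_common / n_obtained if n_obtained else 0.0
    # recall: correct sentences among the gold standard
    r = n_common / n_gold if n_gold else 0.0
    # F1: harmonic mean of precision and recall
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f1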
Example 15: access
def access():
    import nltk
    from nltk.corpus import gutenberg
    # monty is a string defined earlier in the source this was taken from
    print(monty[0])
    print(monty[3])
    print(monty[5])
    print(monty[-1])
    sent = 'colorless green ideas sleep furiously'
    for char in sent:
        print(char, end=' ')
    raw = gutenberg.raw('melville-moby_dick.txt')
    # frequency distribution over the letters of Moby Dick
    fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
    print(fdist.keys())
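A caveat on the last line: in NLTK 3, FreqDist.keys() no longer returns samples sorted by frequency; fdist.most_common() provides the frequency-ranked view this snippet presumably wanted:

import nltk
from nltk.corpus import gutenberg

raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(fdist.most_common(5))  # the five most frequent letters with counts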