This article collects typical usage examples of the nltk.corpus.gutenberg.fileids function in Python. If you have been wondering what exactly fileids does and how to use it, the curated examples below may help.
Fifteen code examples of the fileids function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
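Before the examples, here is a minimal sketch of what the function itself returns, assuming the corpus data has been fetched with nltk.download('gutenberg'):

import nltk
from nltk.corpus import gutenberg

# fileids() lists the plain-text files bundled with the Gutenberg corpus
print(gutenberg.fileids())
# ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', ...]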
Example 1: exercise_gutenberg

def exercise_gutenberg():
    # Print the file list of the Project Gutenberg corpus
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Check the length of the book
    print(len(emma))
    # Load the tokens into an nltk.Text object
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set(w.lower() for w in words_list))
        # Print average word length, average sentence length,
        # average number of occurrences per word, and the file name
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), file_id)
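The three ratios printed are the average word length, the average sentence length, and a lexical diversity score (how often each vocabulary item appears on average). Note that num_chars counts whitespace as well, so the first figure overstates word length by roughly one character.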
Example 2: fun01

def fun01():
    """fun01"""
    print(gutenberg.fileids())
    # Emma by Jane Austen
    emma = gutenberg.words('austen-emma.txt')
    # How many words it contains
    print(len(emma))
    # concordance() prints its results itself and returns None,
    # so it should not be wrapped in print()
    Text(emma).concordance("surprize")
Example 3: handle

def handle(self, *args, **options):
    for fileid in gutenberg.fileids():
        out_dir = CORPUS_DIR + os.sep + fileid.replace(".txt", "")
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with open(out_dir + os.sep + "sentences.txt", 'w') as f:
            f.write(gutenberg.raw(fileid))
Example 4: gutenberg

def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set(w.lower() for w in gutenberg.words(t)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)
Example 5: gutenberg

def gutenberg():
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print(len(emma))
    print(gutenberg.fileids())
    emma = gutenberg.words('austen-emma.txt')
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    print(macbeth_sentences[1037])
    # Find and print the longest sentence(s) in Macbeth
    longest_len = max(len(s) for s in macbeth_sentences)
    print([s for s in macbeth_sentences if len(s) == longest_len])
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example 6: similarity_gutenberg

def similarity_gutenberg():
    # Pairwise Jaccard similarity between all Gutenberg texts,
    # over character n-grams of width 2 through 5
    for x in range(2, 6):
        a = []
        b = 0
        c = 0
        d = 1
        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x),
                                    n_window(gutenberg.raw(ffid), x)))
            b += 1
        # Mean and minimum similarity over the whole matrix
        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Mean: " + str(c))
        print("Minimum: " + str(d))
Example 7: page57

def page57():
    """Statistics from the Gutenberg corpus"""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example 8: for_print

def for_print():
    '''
    Display three statistics for each text
    '''
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example 9: create_model_from_NLTK

def create_model_from_NLTK():
    from os.path import isfile
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        # create_model is assumed to be defined elsewhere in the project
        return create_model(filepath=filepath, save=False)

    from nltk.corpus import reuters, brown, gutenberg
    sents = reuters.sents() + brown.sents()
    for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
        sents += gsents
    return create_model(sentences=sents, savename=filepath)
Example 10: fun02

def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        # Average word length, average sentence length, and number of
        # times each vocabulary item appears in the text
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
Example 11: solve_p2_greedy

def solve_p2_greedy(file):
    # slice, NgramLetterCorpus, unshred3 and linearize are assumed to be
    # project-local helpers (note that slice shadows the built-in name)
    lines = [l.lower().split("|")[1:-1] for l in open(file)]
    slices = slice(lines)
    n = 3
    corpus = NgramLetterCorpus(n)
    # Train the letter n-gram model on the first three Gutenberg texts
    for fileid in gutenberg.fileids()[:3]:
        corpus.update(gutenberg.raw(fileid))
    slices = unshred3(slices, corpus)
    print("FINAL: ")
    for l in linearize(slices):
        print("".join(l))
Example 12: train

def train(self):
    # FreqDist, ConditionalFreqDist and bigrams come from nltk; wordRE is
    # assumed to be a module-level compiled regex that keeps word-like tokens
    self.vocabulary = set()
    this_bigrams = []
    self.unigrams = FreqDist([])
    for fileid in gutenberg.fileids():
        for sentence in gutenberg.sents(fileid):
            # Pad each sentence with start/end markers
            words = ["<s>"] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>"]
            this_bigrams += bigrams(words)
            self.vocabulary.update(words)
            self.unigrams.update(words)
    self.bigrams = ConditionalFreqDist(this_bigrams)
    self.V = len(self.vocabulary)
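Storing self.V alongside the counts suggests add-one (Laplace) smoothing. A hypothetical lookup built on the trained counts, not shown in the original class:

def prob(self, w1, w2):
    # P(w2 | w1) with add-one smoothing over the vocabulary
    return (self.bigrams[w1][w2] + 1) / (self.unigrams[w1] + self.V)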
Example 13: benchmark_sbd

def benchmark_sbd():
    # Assumes numpy (as np), re, and the project-local split_text / get_prf
    # helpers are available at module level
    ps = []
    rs = []
    f1s = []
    c = 0
    for fileid in gutenberg.fileids():
        c += 1
        copy_sents_gold = gutenberg.sents(fileid)
        sents_gold = [s for s in copy_sents_gold]
        # Keep only alphabetic tokens in the gold-standard sentences
        for sent_i in range(len(sents_gold)):
            new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
            sents_gold[sent_i] = new_sent
        text = gutenberg.raw(fileid)
        sents_obtained = split_text(text)
        copy_sents_obtained = sents_obtained.copy()
        # Tokenize the obtained sentences the same way
        for sent_i in range(len(sents_obtained)):
            new_sent = [w.group()
                        for w in re.finditer(r'\w+', sents_obtained[sent_i])
                        if w.group().isalpha()]
            sents_obtained[sent_i] = new_sent
        # Count the sentences that were recovered exactly
        c_common = 0
        for sent in sents_obtained:
            if sent in sents_gold:
                c_common += 1
        p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
        print('\n\n', fileid)
        print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
    print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps), np.std(ps)))
    print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs), np.std(rs)))
    print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s), np.std(f1s)))
    print(len(f1s))
    good_ps = [p for p in ps if p >= 0.8]
    good_rs = [r for r in rs if r >= 0.8]
    good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
    print('\nGood precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps), np.std(good_ps)))
    print('Good recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs), np.std(good_rs)))
    print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s), np.std(good_f1s)))
    print(len(good_f1s))
Example 14: __init__

def __init__(self):
    self.num_passages = 10
    self.passagesize = 1000
    self.maxpeople = 10
    self.maxnouns = 5
    self.total_passages = 10 * len(gutenberg.fileids())
    self.skeletons = []
    self.index_dicts = []
    # Load all of the skeleton and index files into memory
    for fileid in gutenberg.fileids():
        for k in range(self.num_passages):
            filename = fileid + '_' + str(k) + '_skeleton.txt'
            with open(filename, 'r') as f:
                self.skeletons.append(f.read().split(" "))
            filename = fileid + '_' + str(k) + '_indices.txt'
            with open(filename, 'r') as f:
                self.index_dicts.append({})
                for line in f:
                    splitted = line.split()
                    self.index_dicts[-1][splitted[0]] = splitted[1:]
Example 15: find_phrases

def find_phrases(regexp):
    # wrong_vbs is assumed to be a module-level collection of verbs to rewrite
    fids = gutenberg.fileids()
    rs = []
    for fid in fids:
        txt = nltk.Text(gutenberg.words(fid))
        ts = nltk.text.TokenSearcher(txt)
        r = ts.findall(regexp)
        # Each hit is a list of tokens; patch unwanted leading/trailing verbs
        for x in r:
            if x[0].lower() in wrong_vbs:
                x[0] = 'looking at'
            if x[-1].lower() in wrong_vbs:
                x[-1] = 'me'
        rs.extend(r)
    return rs
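TokenSearcher.findall takes a token-level regular expression in which each token pattern is wrapped in angle brackets, so a call might look like this (the pattern itself is only illustrative):

# Find three-token phrases such as "staring at him" across all Gutenberg texts
phrases = find_phrases(r"<\w+ing> <at> <\w+>")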