This article collects typical usage examples of the Python method nltk.corpus.PlaintextCorpusReader.words. If you are wondering how to use PlaintextCorpusReader.words in Python, what it does, or where to find examples of it, the curated code samples below may help. You can also explore the containing class, nltk.corpus.PlaintextCorpusReader, for further usage examples.
The following shows 15 code examples of PlaintextCorpusReader.words, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
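Before the individual examples, here is a minimal sketch (not taken from any of the examples below) of the pattern they all build on: point a PlaintextCorpusReader at a directory, select files with a regular expression, and call words() to get the token list. The corpus path and file pattern are placeholders.

from nltk.corpus import PlaintextCorpusReader

corpus_root = '/path/to/your/corpus'            # placeholder directory
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')

print(wordlists.fileids())                      # files matched by the pattern
print(len(wordlists.words()))                   # tokens across the whole corpus
print(wordlists.words(wordlists.fileids()[0])[:10])  # first tokens of one file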
Example 1: save_my_count
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def save_my_count(self, corpus, patt, n, filename):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    res = []
    for id in fileids:
        leng = len(wordlists.words(id))
        wordc = len(set(wordlists.words(id)))
        wor = "=> corpus tokens: " + str(leng) + "\n"
        dis = "=> corpus token types: " + str(wordc) + "\n"
        # float division so the richness ratio is not truncated
        ric = "=> ind lex richness: " + str(leng / float(wordc)) + "\n"
        res.append(dis)
        res.append(ric)
        res.append(wor)
        for word in sorted(set(wordlists.words(id))):
            freq = wordlists.words(id).count(word)
            f = "(" + word.lower() + "," + str(round(100.0 * freq / leng, 1)) + ")\n"
            t = "(" + word.lower() + "," + str(freq) + "/" + str(leng) + ")"
            res.append(f)
            res.append(t)
    out = open("../data/" + filename, "w")
    try:
        for t in res[:n]:
            out.write(t + "\n")
    finally:
        out.close()
Example 2: compare
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def compare(request):
    errors = []
    statistics = []
    stats = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        w.words()
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words- " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': statistics})
Example 3: stats
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            w.words()
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words- " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Example 4: hybrid_cfdist
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = list(nltk.bigrams(sherlock_corpus.words()))
    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = list(nltk.bigrams(pokemon_corpus.words()))
    # list() so the two bigram streams can be concatenated; newer NLTK versions
    # return a generator from nltk.bigrams().
    return nltk.ConditionalFreqDist(sherlock_bigrams + pokemon_bigrams)
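The ConditionalFreqDist returned above maps each word to a frequency distribution over the words that follow it. As a rough sketch of how such a distribution might be used (this helper is not part of the original example), one can walk the most frequent successor of each word to generate text:

def generate_text(cfd, seed_word, length=20):
    # Follow the most frequent successor of each word, starting from seed_word.
    word = seed_word
    output = []
    for _ in range(length):
        output.append(word)
        if word not in cfd or len(cfd[word]) == 0:
            break
        word = cfd[word].max()
    return ' '.join(output)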
Example 5: corpus_metrics
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def corpus_metrics(self, corpus_path):
    corpus_news = PlaintextCorpusReader(corpus_path, '.*\.txt')
    print('Corpus documents', len(corpus_news.fileids()))
    print('Train documents', len([c for c in corpus_news.fileids() if c.startswith('train')]))
    print('Dev documents', len([c for c in corpus_news.fileids() if c.startswith('dev')]))
    print('Test documents', len([c for c in corpus_news.fileids() if c.startswith('test')]))
    words = set(corpus_news.words())
    words = sorted(words)
    print('Corpus different words', len(words))
    longwords = [w for w in corpus_news.words() if len(w) > 2]
    fdist = nltk.FreqDist(longwords)
    bigramController = BigramController()
    bigrams = bigramController.BuildBrigramFeatures(longwords)
    bigramController.BigramStatistics(bigrams)
    trigramdist = nltk.FreqDist(nltk.trigrams(longwords))
    #fdist.plot(50, cumulative=False)
    print(fdist.most_common(20))
    print("Trigram distribution")
    print(trigramdist.most_common(20))
    words_attack = []
    files_attack = [f for f in corpus_news.fileids()
                    if os.path.basename(os.path.normpath(f)).startswith('attack--')]
    for file in files_attack:
        for w in corpus_news.words(file):
            words_attack.append(w)
    words_nonattack = []
    files_nonattack = [f for f in corpus_news.fileids()
                       if os.path.basename(os.path.normpath(f)).startswith('nonattack--')]
    for file in files_nonattack:
        for w in corpus_news.words(file):
            words_nonattack.append(w)
    words_bag = {}
    words_bag['attack'] = words_attack
    words_bag['nonattack'] = words_nonattack
    #print(words_bag['attack'])
    cfd = nltk.ConditionalFreqDist((category, word)
                                   for category in ['attack', 'nonattack']
                                   for word in words_bag[category])
Example 6: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def main():
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = ["Long Island", "Weather Service", "flooding", "August",
        "heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
        "record"]

    # ClassEvent statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    # float division so the words-per-sentence ratio is not truncated under Python 2
    ClassEventWordsPerSent = len(wordlists.words()) / float(len(wordlists.sents()))
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
        ClassEventWordsPerSent) - 21.43

    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI
    print

    wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn",
        "Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
        "Weather", "Debris", "Death", "Smoke", "State", "Ash"]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)

    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    YourSmallWordsPerSent = len(wordlists_event.words()) / \
        float(len(wordlists_event.sents()))
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
        YourSmallWordsPerSent) - 21.43

    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
Example 7: get_coarse_level_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def get_coarse_level_features(dataset, output_file):
    # Accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/'
    dataset_path = corpus_root + dataset
    # Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()
    # Opening a file that has to be written to
    out = open(output_file, 'w')
    for i in range(0, len(ids) - 1):
        # Initializing certain variables
        tokens_count = 0
        types = 0
        non_stops_count = 0
        sents_count = 0
        avg_sent_len = 0
        cap_count = 0
        tokens = files.words(ids[i])
        # Computing number of tokens
        tokens_count = len(tokens)
        # Computing number of types
        types = len(set(tokens))
        non_stops = []
        # Computing number of content words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)
        # Finding average sentence length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len = 0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len / float(sents_count)
        # Computing number of capitalized words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count + 1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file + ' ' + e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
            str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count) + '\n')
        out.flush()
Example 8: loadCorpora
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def loadCorpora():
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    # also requires: from nltk.corpus import BracketParseCorpusReader
    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
Example 9: get_lm_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def get_lm_features(dataset, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/'
    bigram_root = corpus_root + 'Language_model_set/'
    fin_files = PlaintextCorpusReader(bigram_root + 'Finance/', '.*')
    fin_words = list(fin_files.words())
    fin_model = NGramModel(fin_words, 2)
    health_files = PlaintextCorpusReader(bigram_root + 'Health/', '.*')
    health_words = list(health_files.words())
    health_model = NGramModel(health_words, 2)
    res_files = PlaintextCorpusReader(bigram_root + 'Research/', '.*')
    res_words = list(res_files.words())
    res_model = NGramModel(res_words, 2)
    com_files = PlaintextCorpusReader(bigram_root + 'Computers_and_the_Internet/', '.*')
    com_words = list(com_files.words())
    com_model = NGramModel(com_words, 2)
    test_files = PlaintextCorpusReader(corpus_root + dataset, '.*')
    ids = test_files.fileids()
    out_file = open(output_file, 'w')
    for j in range(0, len(ids)):
        file_words = test_files.words(ids[j])
        out_str = ''
        current_file = dataset + '/' + ids[j]
        e = current_file.split('/')
        out_str = out_str + current_file + ' ' + e[-2]
        sum_fin = 0
        sum_health = 0
        sum_res = 0
        sum_com = 0
        text_len = len(file_words)
        for i in range(1, len(file_words)):
            sum_fin = sum_fin + math.log(fin_model.prob((file_words[i-1],), file_words[i]))
            comp_fin = float((-sum_fin)*(1/float(text_len)))
            sum_health = sum_health + math.log(health_model.prob((file_words[i-1],), file_words[i]))
            comp_health = (float(-sum_health))*(1/float(text_len))
            sum_res = sum_res + math.log(res_model.prob((file_words[i-1],), file_words[i]))
            comp_res = (float(-sum_res))*(1/float(text_len))
            sum_com = sum_com + math.log(com_model.prob((file_words[i-1],), file_words[i]))
            comp_com = (float(-sum_com))*(1/float(text_len))
        out_str = out_str + ' finprob:' + str(round(sum_fin, 2)) + ' hlprob:' + str(round(sum_health, 2)) + ' resprob:' \
            + str(round(sum_res, 2)) + ' coprob:' + str(round(sum_com, 2)) + ' finper:' + str(round(comp_fin, 2)) + ' hlper:' + \
            str(round(comp_health, 2)) + ' resper:' + str(round(comp_res, 2)) + ' coper:' + str(round(comp_com, 2))
        out_file.write(out_str + '\n')
        out_file.flush()
Example 10: plot_cfreq
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def plot_cfreq(self, corpus, patt, n):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    for id in fileids:
        words = wordlists.words(id)
        fre = FreqDist(word.lower() for word in words if word.isalpha())
    return fre.plot(n, cumulative=True)
Example 11: __init__
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
class BigramModel:
    category_root = []
    files_dataset_category = []
    word_list = []
    bigram = []
    fd = []
    cfd = []

    def __init__(self, category, corpus_root):
        self.category_root = []
        self.files_dataset_category = []
        self.word_list = []
        self.bigram = []
        self.fd = []
        self.cfd = []
        self.category_root = corpus_root + '/' + category
        self.files_dataset_category = PlaintextCorpusReader(self.category_root, '.*')
        self.word_list = self.files_dataset_category.words()
        self.bigram = nltk.bigrams(self.word_list)
        self.fd = FreqDist(self.word_list)
        self.cfd = nltk.ConditionalFreqDist(self.bigram)

    def get_prob_and_per(self, word_list):
        # The function takes a word_list and returns both the log probability
        # and log perplexity under the language model.
        n_types = len(set(word_list))
        n_tokens = len(word_list)
        # Calculate log probability with Laplace smoothing.
        log_prob = math.log(self.fd[word_list[0]] + 1) - math.log(n_tokens + n_types)  # initializing prob for the first word
        for (w1, w2) in nltk.bigrams(word_list):
            log_prob = log_prob + math.log(self.cfd[w1][w2] + 1) - math.log(len(self.cfd[w1].keys()) + n_types)
        # Calculate log perplexity
        log_per = float(1) / float(-n_tokens) * log_prob
        return log_prob, log_per
Example 12: get_lm_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def get_lm_features(dataset, output_file):
    # Import the corpus reader
    corpus_root = '/home1/c/cis530/data-hw2/' + dataset
    # Define the folder where the files are situated
    files_dataset = PlaintextCorpusReader(corpus_root, '.*')
    fin_model = BigramModel('Finance', corpus_root)
    hel_model = BigramModel('Health', corpus_root)
    res_model = BigramModel('Computers_and_the_Internet', corpus_root)
    co_model = BigramModel('Research', corpus_root)
    output = open('/home1/c/cis530/data-hw2/' + output_file, 'w')
    for fileid in files_dataset.fileids():
        # Output the docid
        output.write(dataset + '/' + fileid + ' ')
        # Output the topic_name
        topic_name = fileid.split('/')[0]
        output.write(topic_name + ' ')
        word_list = files_dataset.words(fileid)
        finprob, finper = fin_model.get_prob_and_per(word_list)
        hlprob, hlper = hel_model.get_prob_and_per(word_list)
        resprob, resper = res_model.get_prob_and_per(word_list)
        coprob, coper = co_model.get_prob_and_per(word_list)
        output.write('finprob:' + str(round(finprob, 1)) + ' ')
        output.write('hlprob:' + str(round(hlprob, 1)) + ' ')
        output.write('resprob:' + str(round(resprob, 1)) + ' ')
        output.write('coprob:' + str(round(coprob, 1)) + ' ')
        output.write('finper:' + str(round(finper, 1)) + ' ')
        output.write('hlper:' + str(round(hlper, 1)) + ' ')
        output.write('resper:' + str(round(resper, 1)) + ' ')
        output.write('coper:' + str(round(coper, 1)) + ' ')
        output.write('\n')
    output.close()
Example 13: corpus_from_directory
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def corpus_from_directory(path, filetype='.*'):
    '''
    Make a corpus of all files in a given directory. The file type can be limited
    by passing the desired extension; the proper format is, e.g., '.*\.txt'.
    '''
    corpus_reader = PlaintextCorpusReader(path, filetype)
    return nltk.Text(corpus_reader.words())
Example 14: prepare_pos_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = nouns_dist.keys()[:200]
    verbs_dist = FreqDist(lists[1])
    top_verbs = verbs_dist.keys()[:200]
    advs_dist = FreqDist(lists[2])
    top_advs = advs_dist.keys()[:100]
    prep_dist = FreqDist(lists[3])
    top_preps = prep_dist.keys()[:100]
    adjs_dist = FreqDist(lists[4])
    top_adjs = adjs_dist.keys()[:200]
    out = open(output_file, 'w')
    for n in top_nouns:
        out.write('NN' + n + '\n')
    for v in top_verbs:
        out.write('VV' + v + '\n')
    for av in top_advs:
        out.write('ADV' + av + '\n')
    for p in top_preps:
        out.write('PREP' + p + '\n')
    for aj in top_adjs:
        out.write('ADJ' + aj + '\n')
Example 15: similar
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import words [as alias]
def similar(text, word):
    if re.match("^[a-zA-Z0-9_\(\),\.]+$", text) and re.match("^[a-zA-Z0-9_]+$", word):
        text = '%s.txt' % text
        f = open(os.path.join(CORPUS_ROOT, text), 'r')
        source = f.read()
        f.close()
        corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
        n_text = nltk.text.Text(corpus.words(text))
        context_index = nltk.text.ContextIndex(n_text.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower())
        word = word.lower()
        wci = context_index._word_to_contexts
        result = []
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = nltk.probability.FreqDist(w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word)
            words = nltk.util.tokenwrap(fd.keys()[:20])
            for middle_word in words.split(' '):
                for context in contexts:
                    # re.search returns a match object or None, so test against None;
                    # the original slash-delimited "/.../i" pattern is not Python regex syntax,
                    # so the case-insensitive flag is passed explicitly instead.
                    if re.search(context[0] + "(\W|\s)+" + middle_word + "(\W|\s)+" + context[1], source, re.IGNORECASE) is not None:
                        print(context[0], middle_word, context[1])
                        result.append({'word': word, 'context_left': context[0], 'context_right': context[1]})
        return dumps({'name': text, 'word': word, 'result': result})