This article collects typical usage examples of the Python method nltk.corpus.PlaintextCorpusReader.sents. If you have been wondering what exactly PlaintextCorpusReader.sents does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the class that provides the method, nltk.corpus.PlaintextCorpusReader.
A total of 15 code examples of PlaintextCorpusReader.sents are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
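Before the examples, here is a minimal usage sketch (the corpus directory and file pattern are placeholders, not taken from any example below): sents() returns the corpus as a list of sentences, each sentence being a list of word tokens.

from nltk.corpus import PlaintextCorpusReader

corpus_root = '/path/to/corpus'          # hypothetical directory of plain-text files
reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')
sentences = reader.sents()               # all sentences, each a list of word tokens
print(len(sentences))                    # total sentence count
print(sentences[0])                      # first sentence as a token list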
Example 1: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():
current_directory = os.path.dirname(__file__)
corpus_root = os.path.abspath(current_directory)
wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
wordlists.fileids()
ClassEvent = nltk.Text(wordlists.words())
CEWords = ["Long Island", "Weather Service", "flooding", "August",
"heavy rains", "Wednesday", "Suffolk County", "New York", "rainfall",
"record"]
# ClassEvent Statistics
print "--------- CLASS EVENT STATISTICS -------------"
print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
print_word_length_distributions(ClassEvent)
print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
print_percentage_of_word_in_collection(ClassEvent, CEWords)
ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    ClassEventWordsPerSent = len(wordlists.words()) / float(len(wordlists.sents()))
ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
ClassEventWordsPerSent) - 21.43
print "Average number of letters per word", ClassEventLettersPerWord
print "Average number of words per sentence:", ClassEventWordsPerSent
print "Automated Readability Index:", ClassEventARI
print
wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
wordlists_event.fileids()
YourSmall = nltk.Text(wordlists_event.words())
SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn",
"Town", "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
"Weather", "Debris", "Death", "Smoke", "State", "Ash"]
# YourSmall statistics
print "--------- YOUR SMALL STATISTICS --------------"
print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
print "YourSmall WORD LENGTH DISTRIBUTIONS:"
print_word_length_distributions(YourSmall)
print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    YourSmallWordsPerSent = len(wordlists_event.words()) / \
        float(len(wordlists_event.sents()))
YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
YourSmallWordsPerSent) - 21.43
print "Average number of letters per word", YourSmallLettersPerWord
print "Average number of words per sentence:", YourSmallWordsPerSent
print "Automated Readability Index", YourSmallARI
Example 2: textinfo
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def textinfo(path):
"""
Takes a file path and returns figures about the text file contained therein.
"""
from nltk.corpus import PlaintextCorpusReader
from nltk import FreqDist
    corpusReader = PlaintextCorpusReader(path, '.*')  # 'path' is used as the corpus root directory
print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence])
print "Unique words:", len(set(corpusReader.words()))
print "Sentences:", len(corpusReader.sents())
print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
Example 3: get_coarse_level_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_coarse_level_features(dataset, output_file):
# Import the corpus reader
corpus_root = '/home1/c/cis530/data-hw2/'+dataset
# Define the folder where the files are situated
files_dataset = PlaintextCorpusReader(corpus_root, '.*')
# Open the output_file
output = open('/home1/c/cis530/data-hw2/'+output_file,'w')
# Read the stopwlist
stop_list = open('/home1/c/cis530/data-hw2/'+'stopwlist.txt').read()
types_stop_list=stop_list.split()
for fileid in files_dataset.fileids():
# Output the docid
output.write(dataset+'/'+fileid+' ')
# Output the topic_name
topic_name=fileid.split('/')[0]
output.write(topic_name+' ')
# Output the num_tokens
tokens=files_dataset.words(fileid)
output.write('tok:'+str(len(tokens))+' ')
# Output the num_types
types=set(tokens)
output.write('typ:'+str(len(types))+' ')
# Output the num_contents
output.write('con:'+str(len([w for w in tokens if w not in types_stop_list]))+' ')
# Output the num_sents
sents = files_dataset.sents(fileid)
output.write('sen:'+str(len(sents))+' ')
# Output the avg_slen
avg_slen=round(float(len(tokens))/float(len(sents)),2)
output.write('len:'+str(avg_slen)+' ')
# Output the num_caps
output.write('cap:'+str(len([w for w in tokens if w[0]>='A' and w[0]<='Z'])))
output.write('\n')
output.close()
Example 4: compare
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def compare(request):
errors = []
statistics=[]
stats=[]
for x in range(1,3):
cantoname = "canto"+str(x)+".txt"
        w = PlaintextCorpusReader("./", cantoname)
        w.words()
        t = nltk.text.Text(w.words())
l_lines=len(line_tokenize(w.raw()))
l_uwords=len(set(w.words()))
l_words=len(w.words())
l_sents=len(w.sents())
l_paras=len(w.paras())
l_linperpara=l_lines/l_paras
statistics.append(x)
statistics.append("Number of Words - "+ str(l_words))
statistics.append("Number of Unique Words - "+ str(l_uwords))
statistics.append("Number of Setences - "+ str(l_sents))
statistics.append("Number of Lines - "+ str(l_lines))
statistics.append("Number of Paras - "+ str(l_paras))
statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
lexical_density=l_words/l_uwords
l_wordpersent = l_words/l_sents
statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
statistics.append("Words per sentence - "+ str(l_wordpersent))
stats.append(statistics)
return render_to_response('compare.html', {'stats':statistics})
Example 5: stats
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def stats(request):
errors = []
statistics=[]
if 'q' in request.GET:
q = request.GET['q']
if not q:
errors.append('Enter a Canto Number')
else:
cantoname = "canto"+q+".txt"
            w = PlaintextCorpusReader("./", cantoname)
            w.words()
            t = nltk.text.Text(w.words())
l_lines=len(line_tokenize(w.raw()))
l_uwords=len(set(w.words()))
l_words=len(w.words())
l_sents=len(w.sents())
l_paras=len(w.paras())
l_linperpara=l_lines/l_paras
statistics.append("Number of Words - "+ str(l_words))
statistics.append("Number of Unique Words - "+ str(l_uwords))
statistics.append("Number of Setences - "+ str(l_sents))
statistics.append("Number of Lines - "+ str(l_lines))
statistics.append("Number of Paras - "+ str(l_paras))
statistics.append("Number of Lines/Paras - "+ str(l_linperpara))
lexical_density=l_words/l_uwords
l_wordpersent = l_words/l_sents
statistics.append("Lexical Density (Total/Uniq) words- "+ str(lexical_density))
statistics.append("Words per sentence - "+ str(l_wordpersent))
return render_to_response('stats.html', {'statistics':statistics})
return render_to_response('stats.html', {'errors': errors})
Example 6: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():
corpus_root = '../posts/'
newcorpus = PlaintextCorpusReader(corpus_root, '.*',
para_block_reader=read_block_no_metadata)
corpus_words = [w.lower() for w in newcorpus.words() if w.isalpha()]
corpus_sentences = newcorpus.sents()
analyst = TextAnalyst(corpus_words, corpus_sentences, 'french')
analyst.print_analyze()
Example 7: extractPossibleTerms
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def extractPossibleTerms(root, fileids):
# get corpus
#root, filename = os.path.split(path)
reader = PlaintextCorpusReader(root, fileids)
# get chunker
grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
chunker = RegexpParser(grammar)
# get terms
terms = set()
print len(reader.sents())
i = 0
for sent in reader.sents():
i += 1
if i%100==0:
print i
tree = chunker.parse(pos_tag(sent))
for t in tree.subtrees(lambda t: t.node!='S'): # exclude Sentence node
terms.add(' '.join([el[0] for el in t]))
return terms
Example 8: get_coarse_level_features
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_coarse_level_features(dataset, output_file):
# accessing the corpus
corpus_root = '/home1/c/cis530/data-hw2/'
dataset_path = corpus_root + dataset
# Reading the files from the directories
files = PlaintextCorpusReader(dataset_path, '.*')
ids = files.fileids()
stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
stops = stopFile.words()
#Opening a file that has to be written to
out = open(output_file, 'w')
    for i in range(0, len(ids)):  # process every file in the dataset
#Initializing certain variables
tokens_count=0
types = 0
non_stops_count=0
sents_count = 0
avg_sent_len=0
cap_count = 0
tokens=files.words(ids[i])
#Computing Number of Tokens
tokens_count = len(tokens)
#Computing Number of types
types = len(set(tokens))
non_stops=[]
#Computing Number of Content Words
for t in tokens:
if t not in stops:
non_stops.append(t)
non_stops_count = len(non_stops)
#Finding Average Sentence Length
sent = []
sent = files.sents(ids[i])
sents_count = len(sent)
sent_len=0
for s in sent:
sent_len = sent_len + len(s)
avg_sent_len = sent_len/float(sents_count)
#Computing Number of Captilized Words
for c in non_stops:
if c.istitle():
cap_count = cap_count+1
current_file = dataset + '/' + ids[i]
e = current_file.split('/')
out.write(current_file +' '+ e[-2] + ' tok:' + str(tokens_count) + ' typ:' + \
str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count)+ '\n')
out.flush()
Example 9: train
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def train():
wordlists = PlaintextCorpusReader('', file_path)
st = stemmer()
# Get blocks of text using NLTK
words = wordlists.words(file_path)
sents = wordlists.sents(file_path)
paras = wordlists.paras(file_path)
# LOGIC
# If a sentence contains a known [posi/nega]tive word, count the instances of words in that sentence as
# [posi/nega]tive
# Count words
word_features = []
# Go through paragraphs
for p in paras:
# Classify S
score_positive_negative = 0
for s in p:
for word in s:
word = st.stem(word)
if word in words_positive:
score_positive_negative += 1
elif word in words_negative:
score_positive_negative -= 1
# Record class of paragraph for any words present
for s in p:
for word in s:
word = st.stem(word)
if score_positive_negative > 0:
word_features.append( ({"word": word}, "+") )
elif score_positive_negative < 0:
word_features.append( ({"word": word}, "-") )
else:
word_features.append( ({"word": word}, " ") )
# Create and return classifier
classifier = nltk.NaiveBayesClassifier.train(word_features)
return classifier
Example 10: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def main():
st = stemmer()
# Get data
wordlists = PlaintextCorpusReader('', file_path)
words = wordlists.words(file_path)
sents = wordlists.sents(file_path)
paras = wordlists.paras(file_path)
# Train
classifier = train()
# Get class probabilities (for MAP estimation)
counts = {"P":0, "-":0, "N":0}
for i in range(0,len(paras)):
for s in paras[i]:
score_pos = 0
score_neg = 0
# Classify paragraph
for word in s:
word = st.stem(word)
feature = {"word":word}
classified = classifier.classify(feature)
if classified == "+":
score_pos += 1
elif classified == "-":
score_neg += 1
# Record result
if score_pos > score_neg:
counts["P"] += 1
elif score_pos < score_neg:
counts["N"] += 1
else:
counts["-"] += 1
# Done!
print counts
Example 11: classifyByYear
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def classifyByYear(self) :
corpusReader = PlaintextCorpusReader(self.txtDirectory, ".*.txt", encoding = self.codec)
for journal in corpusReader.fileids() :
print ("Start " + journal)
sentList = corpusReader.sents(journal)
for sent in sentList :
getMonth = False
getDOI = False
line = ''.join(sent)
if self.doiURLTypes[0] in line :
getDOI = True
self._extractYearByDOI(self.doiURLTypes[0], journal, line)
break
elif self.doiURLTypes[1] in line :
getDOI = True
self._extractYearByDOI(self.doiURLTypes[1], journal, line)
break
for word in sent :
if getMonth :
self._extractYearByMonth(journal, word)
break
if word.lower() in self.dictMonth :
getMonth = True
if getMonth :
getMonth = False
break
elif getDOI :
getDOI = False
break
print ("End " + journal)
print (str(self.yearDirectoryList))
Example 12: get_sentences_for_text
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def get_sentences_for_text(corpus_root, filename, lang="english"):
"""Segments the given text into sentences.
Args:
corpus_root: Directory in which the text file is residing.
filename: Name of the text file.
lang: Tokenizer language. For possible values, look at:
${NLTK_DATA}/tokenizers/punkt
Returns:
Sentences in the given text.
"""
tokenizer_path = "tokenizers/punkt/" + lang + ".pickle"
text = PlaintextCorpusReader(
corpus_root,
[filename],
word_tokenizer=WhitespaceTokenizer(),
sent_tokenizer=nltk.data.LazyLoader(tokenizer_path),
)
return text.sents()
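A hedged usage sketch for the helper above; the directory and filename are placeholders, and lang must name a Punkt model available under ${NLTK_DATA}/tokenizers/punkt.

sentences = get_sentences_for_text('/path/to/articles', 'story.txt', lang='english')
for sent in sentences[:3]:
    print(' '.join(sent))   # show the first three segmented sentences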
Example 13: network
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def network(chapter):
if(chapter == 0):
NEs = open("finalNEs/finalNEs.txt").read().split('\n')
text_raw = open("ofk.txt").read()
else:
NEs = open("finalNEs/finalNEs_ch" + str(chapter) + ".txt").read().split('\n')
text_raw = open("ofk_ch" + str(chapter) + ".txt").read()
result = [dict(name="", relations=[""])]
for NE in NEs:
result.append(dict(name=NE, relations=[""]))
    # Drop the extra blank entries: the empty dict added at the start above and the trailing entry from newlines in finalNEs.txt
result = result[1:len(result)-1]
corpus = PlaintextCorpusReader('.', 'ofk\.txt')
sentences = corpus.sents()
for x in range(len(sentences)):
for NEdict in result:
if NEdict["name"] in sentences[x]:
# # We are in a sentence with a named entity
for n in result:
if n["name"] in sentences[x] and n["name"] != NEdict["name"]:
NEdict["relations"].append(n["name"])
for NEdict in result:
NEdict["relations"] = Set(NEdict["relations"][1:])
final = [dict(name=r["name"], imports=list(r["relations"]), url=r["name"]+".html") for r in result]
for finals in final:
with open("../webpage/" + finals["name"] + ".html", "w") as f1:
with open("part1.html") as f:
for line in f:
f1.write(line)
f1.write(finals["name"])
with open("part2.html") as f:
for line in f:
f1.write(line)
f1.write("\tmain(\"data/" + finals["name"] + ".json" + "\");\n</script>")
with open("../webpage/data/edgeBundle.json",'w') as outfile:
json.dump(final,outfile, sort_keys = True, indent = 4, ensure_ascii=False)
Example 14: build_graph
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
def build_graph(folder, file_pattern):
corpus_root = os.getcwd() + "/" + folder
print "Membuka korpus " + folder + " ..."
word_lists = PlaintextCorpusReader(corpus_root, file_pattern)
naskah = word_lists.sents()
filelists = word_lists.fileids()
teks = tokenize.sent_tokenize(word_lists.raw(fileids=filelists))
print folder + " memiliki " + str(len(teks)) + ", " + str(len(naskah)) + " kalimat."
G_result = nx.Graph()
print "Membangun graf " + folder + " ..."
for kalimat in naskah:
kata = kalimat[0]
prevToken = kata.lower()
for idx in range(1, len(kalimat)):
kata = kalimat[idx]
token = kata.lower()
if containsLetter(token) and containsLetter(prevToken):
G_result.add_edge(prevToken, token)
prevToken = token
return G_result
Example 15: PlaintextCorpusReader
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import sents [as alias]
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/home/vivkul/Downloads/project'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# wordlists.fileids()
# wordlists.words('questions.txt')
amrit=wordlists.words('allquestion.txt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.book import *
fo=open("selectedquestion.txt","wb")
a=wordlists.sents('allquestion.txt')
while(len(amrit)!=0):
content=[w for w in amrit if w.lower() not in stopwords]
voc=FreqDist(content)
    # sorted([w for w in set(content) if len(w) > 2 and voc[w] > 3])
# set_voc_0=FreqDist(a[0])
# set_voc_1=FreqDist(a[1])
b=voc.keys()
i=0
while(i<len(b)):
if(len(b[i])>2):
j=i
max=b[i]
break
i=i+1
q_no=[]
k=0
while(k<len(a)):
set_voc=FreqDist(a[k])
if(set_voc[max]>0):
q_no.append(len([w for w in a[k] if w.lower() not in stopwords]))