This article collects typical usage examples of the Python method nltk.corpus.PlaintextCorpusReader.paras. If you have been wondering what PlaintextCorpusReader.paras does in Python and how to use it, the curated code examples below may help. You can also explore further usage examples of its containing class, nltk.corpus.PlaintextCorpusReader.
The following presents 11 code examples of the PlaintextCorpusReader.paras method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
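Before the examples, here is a minimal sketch of what paras() returns, assuming a hypothetical plain-text file sample.txt in the current directory: a list of paragraphs, where each paragraph is a list of sentences and each sentence is a list of word strings.

from nltk.corpus import PlaintextCorpusReader

# Minimal sketch: sample.txt is a hypothetical file used for illustration.
reader = PlaintextCorpusReader('.', ['sample.txt'])
paragraphs = reader.paras('sample.txt')
# paras() returns list(list(list(str))): paragraphs -> sentences -> words.
first_para = paragraphs[0]   # a paragraph: list of sentences
first_sent = first_para[0]   # a sentence: list of word strings
print(first_sent[:5])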
Example 1: compare
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def compare(request):
    # Also requires: import nltk; from nltk.tokenize import line_tokenize;
    # from django.shortcuts import render_to_response
    errors = []
    statistics = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras  # integer division under Python 2
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
    return render_to_response('compare.html', {'stats': statistics})
Example 2: stats
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Example 3: extractParasInList
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def extractParasInList(name):
    corpuslocation = '/Users/anis/seniorProject/aligned Paragraphs/algebra'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # paras() gives the list of paragraphs: every paragraph is a list of
    # sentences, so it is a list of lists. The sentence lists of each
    # paragraph are joined together to make one flat word list.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        paraAsAList = []
        # Loop through all the sentence lists and merge them into one list
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # paraAsAList is now the whole paragraph as one word list;
        # join the words into a single space-separated string
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        pList.append(paraAsAString)
    return pList
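For reference, the same flattening can be written more compactly with a generator expression and str.join. This is a sketch of an alternative (the function name extract_paras_as_strings and the reader parameter are illustrative, not part of the original example):

def extract_paras_as_strings(reader, name):
    # Hypothetical compact variant of extractParasInList: flatten each
    # paragraph's sentences and join the words with single spaces.
    return [" ".join(word for sent in para for word in sent)
            for para in reader.paras(name)]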
Example 4: train
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def train():
    # stemmer(), words_positive, words_negative and file_path are
    # defined elsewhere in the source module
    wordlists = PlaintextCorpusReader('', file_path)
    st = stemmer()
    # Get blocks of text using NLTK
    words = wordlists.words(file_path)
    sents = wordlists.sents(file_path)
    paras = wordlists.paras(file_path)
    # LOGIC:
    # If a sentence contains a known positive/negative word, count the
    # instances of words in that sentence as positive/negative.
    word_features = []
    # Go through the paragraphs
    for p in paras:
        # Score the paragraph by counting known positive and negative words
        score_positive_negative = 0
        for s in p:
            for word in s:
                word = st.stem(word)
                if word in words_positive:
                    score_positive_negative += 1
                elif word in words_negative:
                    score_positive_negative -= 1
        # Record the paragraph's class for every word present in it
        for s in p:
            for word in s:
                word = st.stem(word)
                if score_positive_negative > 0:
                    word_features.append(({"word": word}, "+"))
                elif score_positive_negative < 0:
                    word_features.append(({"word": word}, "-"))
                else:
                    word_features.append(({"word": word}, " "))
    # Create and return the classifier
    classifier = nltk.NaiveBayesClassifier.train(word_features)
    return classifier
Example 5: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def main():
    st = stemmer()
    # Get the data
    wordlists = PlaintextCorpusReader('', file_path)
    words = wordlists.words(file_path)
    sents = wordlists.sents(file_path)
    paras = wordlists.paras(file_path)
    # Train
    classifier = train()
    # Get class probabilities (for MAP estimation)
    counts = {"P": 0, "-": 0, "N": 0}
    for i in range(0, len(paras)):
        for s in paras[i]:
            score_pos = 0
            score_neg = 0
            # Classify the sentence word by word
            for word in s:
                word = st.stem(word)
                feature = {"word": word}
                classified = classifier.classify(feature)
                if classified == "+":
                    score_pos += 1
                elif classified == "-":
                    score_neg += 1
            # Record the result
            if score_pos > score_neg:
                counts["P"] += 1
            elif score_pos < score_neg:
                counts["N"] += 1
            else:
                counts["-"] += 1
    # Done!
    print counts
Example 6: extractParasInList
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def extractParasInList(name):
    corpuslocation = '/home/aniszaman/seniorProject/combined/carnivore'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # paras() gives the list of paragraphs: every paragraph is a list of
    # sentences, so it is a list of lists.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        paraAsAList = []
        # Merge all the sentence lists into one flat word list
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        pList.append(paraAsAString)
    return pList
Example 7: main
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def main(print_out, motifs, chapter):
    wordlists = PlaintextCorpusReader('', 'Punctuated/pot_ch[12345]\.txt')
    #rep_words = nltk.FreqDist(brown.words())  # Get representative word counts
    st = LancasterStemmer()
    #st = RegexpStemmer('ing$|s$|e$', min=4)
    for i in range(1, 6):
        if i != chapter:
            continue
        g = nx.Graph()
        words = wordlists.words('Punctuated/pot_ch{!s}.txt'.format(str(i)))
        paras = wordlists.paras('Punctuated/pot_ch{!s}.txt'.format(str(i)))
        # Generate HTML
        #with open("test" + str(i) + ".txt", "w+") as fi:
        #    output = generate_html_level2(wordlists, st, words, paras, i)
        #    fi.write(output)
        json_dict = {}
        json_dict["nodes"] = []
        json_dict["edges"] = []
        # Get the correlation coefficients
        corr_data = get_corr_coefs(wordlists, st, words, paras, print_out, motifs)
        corr_coefs = corr_data[0]
        corr_freqs = corr_data[1]
        # ---------------------------------- NetworkX ----------------------------------
        # Add the NetworkX nodes
        nx_added_nodes = []
        for m1 in corr_coefs:
            g.add_node(m1)
        # Add the NetworkX edges
        for m1 in corr_coefs:
            for m2 in corr_coefs[m1]:
                # Avoid repeats
                if m1 <= m2:
                    continue
                g.add_edge(m1, m2)
        # -------------------------------- End NetworkX --------------------------------
        # -------------------------------------- d3.js --------------------------------------
        # Build the d3.js nodes
        json_node_numbers = dict()
        square_size = 0
        for m1 in corr_coefs:
            sz = int(min(corr_freqs[m1] / 3.0, 50)) * 3
            json_node = {
                "name": m1,
                "size": str(sz),
                "color": "#aaaaaa"
            }
            json_dict["nodes"].append(json_node)
            json_node_numbers[m1] = len(json_node_numbers)
        # Build the d3.js edges
        m1m2 = 0
        for m1 in corr_coefs:
            for m2 in corr_coefs[m1]:
                # Avoid repeats (corr_coefs itself won't contain them)
                if m1 <= m2:
                    continue
                edge_size = corr_coefs[m1][m2]
                json_edge = {
                    "name": m1 + "-" + m2,
                    "source": json_node_numbers[m1],
                    "target": json_node_numbers[m2],
                    "size": str(edge_size)
                }
                json_dict["edges"].append(json_edge)
        # Add the boundary d3.js node
        json_dict["nodes"].append({"id": "the-end",
                                   "x": square_size,
                                   "y": square_size,
                                   "size": "1",
                                   "color": "#000000"})
        # Write the JSON to file
        if not print_out:
            with open("OFFICIAL/data" + str(i) + ".json", "w+") as fi:
                fi.write("var json_str_" + str(i) + "=" + json.dumps(json_dict, indent=2))
        else:
#......... part of the code is omitted here .........
Example 8: extract_data
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT,
                 labels_per_sent=None, labels_per_window=None):
    """Extract features, reduce dimensions with a PCA and return the data.

    Exports the raw and the PCA-reduced data in both arff and numpy format.
    """
    start = time.clock()
    self.dictVectorizer = DictVectorizer(sparse=False)
    filename = os.path.split(filepath)[1]
    directory = os.path.split(filepath)[0]
    plain_reader = PlaintextCorpusReader(
        directory,
        [filename],
        word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" + string.punctuation + "]"),
        sent_tokenizer=LineTokenizer(blanklines="discard"),
        encoding='utf8')
    # Create a new subdirectory for the extracted data
    if _NEW_SUBDIR is not None:
        path = os.path.join(directory, _NEW_SUBDIR)
        if not os.path.exists(path):
            os.makedirs(path)
        path = os.path.join(path, os.path.splitext(filename)[0])
    else:
        path = os.path.splitext(filepath)[0]
    # File paths for the weka (arff) and numpy files
    arff_filepath = path + ".arff"
    arff_filepath_pca = path + "_pca95.arff"
    numpy_filepath = path + ".npy"
    numpy_filepath_pca = path + "_pca95.npy"
    paras = plain_reader.paras()
    sents = plain_reader.sents()
    # Get the paragraph boundaries for the sliding window
    self.boundaries = util.get_boundaries(paras)
    boundaries_backup = self.boundaries
    # If all necessary files already exist, load them and return the data
    if util.files_already_exist([numpy_filepath_pca, ]):
        print "Features already extracted. Calculating clusters...\n"
        matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
        return filepath, self.boundaries, matrix_sklearn_pca, len(sents)
    # Save the correct target labels and additional info about the current data
    targets_path = open(path + ".tbs", "wb")
    pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                 len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)
    self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                      ind_features, dep_features)
    self.all_features = self.unified_features(self.data)
    matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
    matrix_sklearn = util.normalize(matrix_sklearn)
    print "Exporting raw data..."
    util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(),
                     arff_filepath, filename + "_RAW", labels_per_window, file_info=None)
    numpy.save(numpy_filepath, matrix_sklearn)
    feature_names, feature_names_part = None, None
    # NOTE: the final return assumes _DO_PCA is True; otherwise
    # matrix_sklearn_pca is never assigned
    if _DO_PCA:
        print "PCA calculation..."
        matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn,
                                                     self.dictVectorizer.get_feature_names())
        util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca,
                         filename + "_PCA95", labels_per_window, file_info=None)
        numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
    del matrix_sklearn
    gc.collect()
    return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
Example 9: PlaintextCorpusReader
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
NUM_WORDS = 20
wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

def clean_words(words):
    # Convert everything to lower case
    words = [w.lower() for w in words]
    # Remove periods from the ends of sentences
    words = [re.sub('\.', '', w) for w in words]
    # Only keep alphabetic strings
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w not in stopwords.words('english')]
    # Do stemming: "goes" => "go"
    words = [nltk.PorterStemmer().stem(w) for w in words]
    return words

# Flatten each paragraph (a list of sentences) into a single word list
paras = [sum(para, []) for para in wordlist.paras('ofk.txt')]
words = clean_words(wordlist.words('ofk.txt'))

# Group paragraphs into chunks of GROUP_LENGTH paragraphs
# (GROUP_LENGTH is defined elsewhere in the source file)
groups = []
for i in range(0, len(paras), GROUP_LENGTH):
    group = sum(paras[i: min(i + GROUP_LENGTH, len(paras))], [])
    groups.append(group)

# Per-group word frequency tables
freqs = []
for group in groups:
    freq = FreqDist(clean_words(group))
    table = {w: freq[w] for w in freq}
    freqs.append(table)

top_words = [w for w in FreqDist(words)][:NUM_WORDS]

def get_word_freqs(word):
    return {'word': word,
            'values': [{'x': i, 'y': freqs[i].get(word, 0)} for i in range(len(freqs))]}
Example 10: len
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import wordnet as wn

# Grab the stopwords.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)  #127

# Read the plain text.
corpus_root = 'corpora'
aow = PlaintextCorpusReader(corpus_root, 'Art-of-War.txt')
aow.fileids()  #['Art-of-War.txt']
aow.words()  #['THE', 'ART', 'OF', 'WAR', 'BY', 'SUN', 'TZU', ...]
len(aow.words())  #13038
len(aow.sents())  #943
len(aow.paras())  #399

len([s for s in aow.sents() if 'enemy' in s])  #111
len([s for s in aow.sents() if 'enemies' in s])  #1
len([s for s in aow.sents() if 'ally' in s])  #2
len([s for s in aow.sents() if 'allies' in s])  #3
len([s for s in aow.sents() if 'spy' in s])  #8
len([s for s in aow.sents() if 'spies' in s])  #11

# Extract the list of sentences containing words matching /^enem(?:y|ies)/i.
enemy_sents = []
for s in aow.sents():
    for w in s:
        if w.lower().startswith('enem'):
            enemy_sents.append(s)  # TODO: skip if already seen.
len(enemy_sents)  #126  XXX Can contain duplicates; fix the TODO above.
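One way to resolve the TODO above is to test each sentence once with any(), so a sentence is appended at most once no matter how many 'enem...' words it contains. This is a sketch building on the example above, not part of the original:

enemy_sents = []
for s in aow.sents():
    # any() short-circuits, and each matching sentence is appended exactly once
    if any(w.lower().startswith('enem') for w in s):
        enemy_sents.append(s)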
Example 11: file
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import paras [as alias]
'''def beatles_corpus.sents(self, fileids=None):
    """
    :return: the given file(s) as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    :rtype: list(list(str))
    """
    if self._sent_tokenizer is None:
        raise ValueError('No sentence tokenizer for this corpus')
    return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
                   for (path, enc, fileid)
                   in self.abspaths(fileids, True, True)])
'''

k = 0
# beatles_corpus is a PlaintextCorpusReader defined elsewhere in the source
custom_corpus = nltk.Text(beatles_corpus.words())
for i in beatles_corpus.paras():
    # Iterate through songs (one paragraph per song), printing out contents
    # print "Song # " + str(k)
    k += 1
    l = 0
    while l < len(i):
        # print "Line " + str(l)
        # print i[l]
        l += 1
    # for j in i:
    #     print j[0]
# new_song = custom_corpus.generate(100)