This article collects typical usage examples of the Python function nltk.corpus.brown.sents. If you have been wondering exactly what sents does, how to call it, or want to see it used in real code, the hand-picked examples below should help.
Fifteen code examples of the sents function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
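Before the examples, a minimal sketch of the basic API, assuming the corpus has already been fetched with nltk.download('brown'): brown.sents() returns a lazy sequence of tokenized sentences that can be restricted by category or file id.

from nltk.corpus import brown

sents = brown.sents()                         # lazy sequence of tokenized sentences
print(len(sents))                             # roughly 57,000 sentences in the full corpus
print(sents[0])                               # ['The', 'Fulton', 'County', ...]
print(brown.sents(categories='news')[0])      # restrict to one category
print(brown.sents(fileids=['cg22'])[0])       # or to a single file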
Example 1: load_sentences_brown
def load_sentences_brown(nb_sentences=None):
    """
    :param nb_sentences: Use if all brown sentences are too many
    :return: index2word (list of string)
    """
    from nltk.corpus import brown
    import gensim

    print('building vocab ...')
    if nb_sentences is None:
        sents = brown.sents()
    else:
        sents = brown.sents()[:nb_sentences]

    # I use gensim model only for building vocab
    model = gensim.models.Word2Vec()
    model.build_vocab(sents)
    vocab = model.vocab

    # ids: list of (list of word-id)
    ids = [[vocab[w].index for w in sent
            if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32]
           for sent in sents]
    return ids, model.index2word
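A possible call site for the helper above (a sketch, not from the original project; model.vocab, sample_int and model.index2word assume a pre-4.0 gensim API, as does the snippet itself):

# hypothetical usage: cap the corpus at 1,000 sentences to keep vocab building fast
ids, index2word = load_sentences_brown(nb_sentences=1000)
print(len(ids))           # number of encoded sentences
print(index2word[:10])    # a few entries of the learned vocabulary
print(ids[0])             # word ids of the first (subsampled) sentence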
Example 2: clean
def clean():
    '''
    1. Removes any individual special character.
    2. Lowercases all the words.
    :return: list of clean sentences
    '''
    sents = list(brown.sents())
    sents_copy = list(brown.sents())
    n = len(sents)
    print('Removing special chars...')
    for i in range(0, n):
        for word in sents[i]:
            if not bool(re.search('[A-Za-z0-9]', word)):
                sents_copy[i].remove(word)
    print('Removed special chars.')
    sents = None
    print('Lowercasing all the words...')
    for i in range(0, n):
        m = len(sents_copy[i])
        for j in range(0, m):
            sents_copy[i][j] = sents_copy[i][j].lower()
    print('Lowered all the words.')
    return sents_copy
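A quick way to see both cleaning steps at work (a sketch, assuming the module-level imports of re and brown that the snippet relies on):

from nltk.corpus import brown

print(brown.sents()[0])   # raw sentence, still contains tokens such as '``' and '.'
print(clean()[0])         # same sentence with punctuation-only tokens removed and words lowercased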
Example 3: print_brown
def print_brown():
    from nltk.corpus import brown
    print(brown.categories())
    print(brown.words(categories='news'))
    print(brown.words(fileids=['cg22']))
    print(brown.sents(categories=['news', 'reviews']))
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print(m + ':', fdist[m])
Example 4: load_movie_corpus_each_sentence
def load_movie_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
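A hypothetical call, assuming import re and a downloaded movie_reviews corpus (nltk.download('movie_reviews')); the '0:5' range selects the first five review files:

docs = load_movie_corpus_each_sentence('0:5')
print(len(docs))          # 5 documents
print(docs[0][0])         # first sentence of the first selected review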
Example 5: find_ngrams
def find_ngrams(self, n):
    """ Input: the 'n' of 'n-grams'
    Find all the n-grams in the Brown corpus and store them in a frequency dictionary.
    Optionally, more corpora could be used in order to have more data.
    Note: these are n-grams obtained by going through each sentence from left to right.
    If we want to give the correction back based on the dependency tree, we need to
    parse the Brown corpus (or any other data set) with the dependency parser, so that
    we can use this data.
    """
    total_ngram_count = 0
    ngram_freq_dict = {}
    sents = brown.sents()
    for sent in sents:
        sent = ['-START-'] * (n - 1) + sent
        ngrams_brown = ngrams(sent, n)
        for i in ngrams_brown:
            total_ngram_count += 1
            old = ngram_freq_dict.get(i, 0)
            old += 1
            ngram_freq_dict[i] = old
            # print(i, old)
    return ngram_freq_dict, total_ngram_count
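The same padding-and-counting idea can be sketched standalone with collections.Counter; this is not part of the original class, just an illustration of the technique described in the docstring:

from collections import Counter
from nltk import ngrams
from nltk.corpus import brown

n = 2
counts = Counter()
for sent in brown.sents():
    # pad with '-START-' so the first words of each sentence appear in full n-grams
    counts.update(ngrams(['-START-'] * (n - 1) + list(sent), n))
total_ngram_count = sum(counts.values())
print(counts.most_common(5))   # the most frequent bigrams and their counts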
Example 6: data_api
def data_api(spilt_rate):
    raw_sent = brown.sents()
    partial_data = raw_sent[:int(0.1 * len(raw_sent))]
    data_x, data_y = prepare_0(partial_data, word2intdict)
    print('len data_x', len(data_x), len(data_y))
    train_inds = npr.choice(range(len(data_x)), size=int((1 - spilt_rate) * len(data_x)), replace=False)
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    print('len train_inds', len(train_inds), len(data_x))
    for i in range(len(data_x)):
        if i in train_inds:
            # print('trn', i)
            X_train.append(data_x[i])
            Y_train.append(data_y[i])
        else:
            # print('tst', i)
            X_test.append(data_x[i])
            Y_test.append(data_y[i])
    print('len X_train', len(X_train), len(X_test))
    return (X_train, Y_train), (X_test, Y_test)
Example 7: lookupTagger
def lookupTagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = [word for (word, _) in fd.most_common(100)]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    brown_tagged_sents = brown.tagged_sents(categories='news')
    baseline_tagger.evaluate(brown_tagged_sents)
    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    words_by_freq = [word for (word, _) in nltk.FreqDist(brown.words(categories='news')).most_common()]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
Example 8: read_datas
def read_datas(self):
    brown_tagged_sentence = brown.tagged_sents()
    brown_sent = brown.sents()
    size = int(len(brown_tagged_sentence) * 0.9)
    train_set = brown_tagged_sentence[:size]
    test_set = brown_tagged_sentence[size:]
    return (train_set, test_set)
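One way to use such a 90/10 split, sketched standalone since read_datas is written as a method: train a unigram tagger on the first 90% and score it on the held-out 10% (the accuracy printed is illustrative, not a claim about the original project).

import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents()
size = int(len(tagged) * 0.9)
train_set, test_set = tagged[:size], tagged[size:]
tagger = nltk.UnigramTagger(train_set)
print(tagger.evaluate(test_set))   # accuracy on the held-out sentences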
Example 9: build_index
def build_index(out_filename, in_filename=None):
    '''Builds data files for word lookup. Can take an optional input file
    to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''
    sents_data = []
    try:
        with open(in_filename) as in_file:
            sents_data += sent_tokenize(in_file.read())
    except:
        print("Warning: Failed to load external file for building.")
    sents_data += brown.sents() + treebank.sents()
    # get sentences, chop off their ambiguous heads, and look at their words!
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words to a list of words
    mywords = [word for sent in mysents for word in sent]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up the most frequent form of a lowercase word by doing cfd['word'].max()
    # but need to check for existence of the word in cfd first
    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])
    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()
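A hypothetical follow-up showing how the pickled ConditionalFreqDist can be used for the lookup described in the comments (the file name is whatever was passed as out_filename):

import pickle

with open('index.pickle', 'rb') as f:      # hypothetical out_filename
    cfd = pickle.load(f)
word = 'monday'
if word in cfd:                            # check existence before calling max()
    print(cfd[word].max())                 # most frequent surface form, e.g. 'Monday'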
Example 10: cal_idf
def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print(f)
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print('start calculating tfidf')
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print(dict(zip(vectorizer.get_feature_names(), idf)))
    pickle.dump(vectorizer, open('idf_vectorizer', 'wb'))
    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'wb'))
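A hypothetical follow-up for the scikit-learn side: reload the pickled vectorizer and look up the idf weight of a single term (the term chosen here is illustrative):

import pickle

vec = pickle.load(open('idf_vectorizer', 'rb'))
col = vec.vocabulary_.get('government')    # column index of the term, if present
if col is not None:
    print(vec.idf_[col])                   # idf weight learned from the Brown documents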
Example 11: auto_tag
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged words
    """
    brown_tagged_sents = brown.tagged_sents(categories='news', tagset='universal')
    brown_sents = brown.sents(categories='news')
    # open the review of a company, and print error message if company review doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/' + company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')
    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]
    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
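The tagging step on its own, as a minimal sketch independent of the review files (output shown as a rough expectation, not an exact transcript):

import nltk
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news', tagset='universal')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(['the', 'economy', 'is', 'growing']))
# something like [('the', 'DET'), ('economy', 'NOUN'), ('is', 'VERB'), ('growing', 'VERB')]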
Example 12: update_category_by_pos
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)': sentence[i][-1:],
                    'suffix(2)': sentence[i][-2:],
                    'suffix(3)': sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i == 0 else sentence[i-1]
        return features

    print(pos_features(brown.sents()[0], 8))
    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []
    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
Example 13: import_brown_pos
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
            else:
                norm = sent
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total),
                          file=log)
Example 14: createModel
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print(line)
        tagSent = st.tag(line)
        print(tagSent)
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)
    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    # encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    # encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    # encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    # encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    # encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()
Example 15: get_valid_brown_corpus
def get_valid_brown_corpus():
    global DIR
    DIR = BROWN_DIR
    genre = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
    sentences = brown.sents(categories=genre)
    sents = remove_bad_sents(sentences)
    sents = [[w.lower() for w in s] for s in sents]
    return sents