This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_stop_words. If you are wondering what TfidfVectorizer.get_stop_words does, or how to use it, the curated code examples below may help. You can also explore further usage examples of the containing class, sklearn.feature_extraction.text.TfidfVectorizer.
The following shows 9 code examples of TfidfVectorizer.get_stop_words, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
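Before the examples, a minimal sketch of what the method itself returns (the values shown in the comments are what I would expect, not captured output):

from sklearn.feature_extraction.text import TfidfVectorizer

# get_stop_words() returns the effective stop word collection:
# a frozenset for the built-in 'english' list, or None when the
# vectorizer was configured without stop words.
vectorizer = TfidfVectorizer(stop_words='english')
stop_words = vectorizer.get_stop_words()
print(type(stop_words))     # <class 'frozenset'>
print('the' in stop_words)  # True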
Example 1: vectorizaCorpus
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
import sys

def vectorizaCorpus(corpus, minDf):
    '''Vectorizes the given corpus, filtering out words that appear in
    fewer than minDf documents'''
    try:
        vectorizer = TfidfVectorizer(min_df=minDf, lowercase=True, stop_words='english')
        # Define our own list of stop words
        myStopwords = ['did', 'didn', 'does', 'doesn', 'don', 'just', 'isn',
                       'reddit', 'wasn', 'www', 'yeah', 'yes', 'like', 'able', 'thanks',
                       'know', 'think', 've', 'want', 'com', 'https', 'http',
                       'good', 'really', 'make', 'say', 'going', 'said', 'people', 'way',
                       'use']
        # Add the stop words we want to the already existing set
        vectorizer.stop_words = vectorizer.get_stop_words().union(myStopwords)
        # Compute the document-term matrix
        docTerms = vectorizer.fit_transform(corpus)
        # Invert the vocabulary, building a dictionary of index -> term
        invVoc = {v: k for k, v in vectorizer.vocabulary_.items()}
        # The central terms are those whose tf-idf sum accumulated over all
        # documents is largest: sum the document-term matrix by columns
        sumaTfidf = docTerms.sum(axis=0).tolist()[0]
        return vectorizer, invVoc, sumaTfidf
    except Exception as e:
        print('\nA problem occurred: {0}'.format(e))
        sys.exit()
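A hypothetical usage sketch of vectorizaCorpus (the toy corpus and the minDf value are invented for illustration):

corpus = ['the cat sat on the mat',
          'the dog sat on the log',
          'cats and dogs play together']
vectorizer, invVoc, sumaTfidf = vectorizaCorpus(corpus, minDf=1)
# rank terms by accumulated tf-idf to find the most central ones
top = sorted(range(len(sumaTfidf)), key=lambda i: sumaTfidf[i], reverse=True)[:3]
print([invVoc[i] for i in top])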
Example 2: create_stopword_list
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def create_stopword_list(extra_words):
    """
    Creates a stop word list (adds extra words to the original English set)
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    # call get_stop_words() as a bound method on a throwaway vectorizer
    original = list(TfidfVectorizer(stop_words='english').get_stop_words())
    if extra_words:
        return frozenset(original + extra_words)
    else:
        return frozenset(original)
Example 3: make_stop_words
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def make_stop_words():
    '''
    Take a list of user-created stop words and join it with the Tfidf
    'english' stop words.
    INPUT:
    - None
    OUTPUT:
    - New master list of stop words including user and model inputs
    '''
    new_stop_words = ['ha', "\'s", 'tt', 'ireach', "n\'t", 'wo', 'pv', 'tm', 'anite', 'rabichev', 'russell', '603', 'hana', 'atmel', 'radwin', 'se', 'doxee', 'lantto', 'publ', 'fpc1025', '855', 'il', '0344']
    # create a temporary TfidfVectorizer object
    tfidf_temp = TfidfVectorizer(stop_words='english')
    # get the Tfidf 'english' stop words from the model
    stop_words = tfidf_temp.get_stop_words()
    # combine the two lists of stop words
    result = list(stop_words) + new_stop_words
    return result
Example 4: math_stop
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def math_stop():
    '''Add math-specific words to the standard stop list'''
    tfidf = TfidfVectorizer(stop_words='english')
    Stop = set()
    Stop.update(tfidf.get_stop_words())
    Stop.update(['theorem', 'denote', 'like', 'thank', 'lemma', 'proof',
                 'sum', 'difference', 'corollary', 'hand',
                 'product', 'multiple', 'let', 'group',
                 'prime', 'log', 'limit', 'cid', 'result',
                 'main', 'conjecture', 'case', 'suppose',
                 'function', 'assume', 'follows',
                 'given', 'define', 'note', 'defined', 'class',
                 'proposition', 'set',
                 'primes', 'numbers', 'form', 'integers', 'curves',
                 'real', 'using', 'following', 'obtain', 'prove',
                 'definition', 'large', 'small', 'action',
                 'bound', 'sufficiently', 'subject', 'non',
                 'mathematics'])
    return list(Stop)
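Because math_stop() returns a plain list, it can be fed straight back into a new vectorizer through the stop_words parameter. A minimal sketch (the sample sentences are invented; get_feature_names_out needs scikit-learn >= 1.0, older versions use get_feature_names):

vectorizer = TfidfVectorizer(stop_words=math_stop())
X = vectorizer.fit_transform(['We prove the main theorem here',
                              'Dogs bark loudly at night'])
# both the English stop words and the math jargon are filtered out
print(vectorizer.get_feature_names_out())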
Example 5: make_stop_words
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def make_stop_words(new_words_list):
    # merge the built-in 'english' stop words with the caller's extra words
    tfidf_temp = TfidfVectorizer(stop_words='english')
    stop_words = tfidf_temp.get_stop_words()
    result = list(stop_words) + new_words_list
    return result
Example 6: range
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
# (Fragment: pos_trainer, neg_trainer, pos_train, neg_train, tfidf, statement
# and the redditor module are defined earlier in the script; it also needs
# numpy as np and sklearn.naive_bayes.MultinomialNB.)
for l in range(len(pos_trainer)):
    pos_train = pos_train + ['positive']
for l in range(len(neg_trainer)):
    neg_train = neg_train + ['negative']
y_train = pos_train + neg_train
X_train = tfidf.fit_transform(pos_trainer + neg_trainer)
#n_tf = tfidf.fit_transform(neg_trainer)
clf = MultinomialNB().fit(X_train, y_train)
#clf = MultinomialNB().partial_fit(n_tf, neg_train)
#clf.partial_fit(n_tf, neg_train)
fn = tfidf.get_feature_names()
stop = tfidf.get_stop_words()
if statement == 'red':
    comment_list = redditor.redditor()
    for comment in range(len(comment_list)):
        statement = comment_list[comment]
        X_test = tfidf.transform(np.array([statement]))
        predicted = clf.predict(X_test)
        prob = clf.predict_proba(X_test)
        print("Tweet: " + str(comment))
        print(statement)
        print(predicted, prob)
Example 7: open
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
pickle.dump(from_data, open("your_email_authors.pkl", "wb"))  # pickle needs a binary-mode file
### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
transformer = TfidfVectorizer(stop_words="english")
theFit = transformer.fit_transform(word_data)
print('theFit', theFit)
afterTransform = theFit.toarray()
print(afterTransform)
stopWords = transformer.get_stop_words()
# print('stopWords', len(stopWords))
featureNames = transformer.get_feature_names()
# print(featureNames[34597])
print('featureNames', len(featureNames))
# shove the matrix gained from TfidfVectorizer into
# k-means clustering
# just wanted to experiment
from sklearn.cluster import MiniBatchKMeans
clf = MiniBatchKMeans(n_clusters=4, n_init=10, max_iter=300)
clf.fit(afterTransform)
pred = clf.predict(afterTransform)
print('PRED', pred)
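A side note on the clustering step above: MiniBatchKMeans is used instead of plain KMeans because it fits on small random batches, which scales much better to large TF-IDF matrices. The toarray() call is not actually required, and densifying a large TF-IDF matrix can exhaust memory; both KMeans and MiniBatchKMeans in scikit-learn accept the sparse matrix that fit_transform returns.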
Example 8: open
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
### append a 0 to from_data if the email is from Sara, and 1 if it is from Chris
# (Fragment: this if/else runs inside a loop over the two email lists.)
if name == 'sara':
    from_data.append(0)
else:
    from_data.append(1)
email.close()
print("emails processed")
from_sara.close()
from_chris.close()
pickle.dump(word_data, open("your_word_data.pkl", "wb"))  # binary mode for pickle
pickle.dump(from_data, open("your_email_authors.pkl", "wb"))
# print(word_data[152])
### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfv = TfidfVectorizer(stop_words="english", lowercase=True)
v_word_data = tfidfv.fit_transform(word_data)
print(len(tfidfv.get_feature_names()))
print(tfidfv.get_feature_names())
print(tfidfv.get_feature_names()[34597])
print(tfidfv.get_stop_words())
Example 9: len
# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
# (Fragment: all_docs, ff, read_merge_data and get_highest_scoring_feature
# are defined earlier in the script.)
all_docs.append(read_merge_data(ff))
print(all_docs)
print(len(all_docs))
tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
try:
    results = tfidf_vec.fit_transform(all_docs)
    print(results.get_shape())
    result_as_array = results.toarray()
    feature_names = tfidf_vec.get_feature_names()
    total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10)
    print(total_highest_scores)
    print(tfidf_vec.get_stop_words())
except ValueError:
    pass
# documents = read_data(directory)
# first_five = documents[0:100]
# tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
# try:
#     results = tfidf_vec.fit_transform(first_five)
#     print(results.get_shape())
#     result_as_array = results.toarray()
#     feature_names = tfidf_vec.get_feature_names()
#     total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10)
#     print(total_highest_scores)