

Python TfidfVectorizer.get_stop_words Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_stop_words. If you are wondering what exactly TfidfVectorizer.get_stop_words does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of sklearn.feature_extraction.text.TfidfVectorizer, the class this method belongs to.


Nine code examples of TfidfVectorizer.get_stop_words are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
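Before the examples, here is a minimal sketch of what get_stop_words itself does: configured with stop_words='english', it resolves the shortcut to scikit-learn's built-in English stop word frozenset (the exact contents depend on the scikit-learn version):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

# get_stop_words() returns the effective stop word list as a frozenset,
# or None when no stop words are configured
stop_words = vectorizer.get_stop_words()
print(type(stop_words))      # <class 'frozenset'>
print('the' in stop_words)   # True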

Example 1: vectorizaCorpus

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def vectorizaCorpus(corpus, minDf):
    '''Vectorize the given corpus, filtering out words that appear in
    fewer than minDf documents'''
    try:
        vectorizer = TfidfVectorizer(min_df=minDf, lowercase=True, stop_words='english')

        # define our own list of stop words
        myStopwords = ['did','didn','does','doesn','don','just','isn', \
        'reddit', 'wasn','www','yeah','yes','like','able','thanks', \
        'know', 'think','ve', 'want','com','https','http',\
        'good', 'really', 'make', 'say', 'going', 'said', 'people','way', \
        'use']

        # add the extra stop words to the existing set
        vectorizer.stop_words = vectorizer.get_stop_words().union(myStopwords)

        # compute the document-term matrix
        docTerms = vectorizer.fit_transform(corpus)

        # invert the vocabulary, creating an index-to-term dictionary
        invVoc = {v: k for k, v in vectorizer.vocabulary_.items()}

        # find the central terms: those with the highest cumulative tf-idf
        # across all documents (column sums of the document-term matrix)
        sumaTfidf = docTerms.sum(axis=0).tolist()[0]

        return vectorizer, invVoc, sumaTfidf
    except Exception as e:
        print('\nA problem occurred: {0}'.format(e))
        sys.exit()  # also requires: import sys
Developer ID: anuf, Project: txine, Lines of code: 31, Source file: Practica3.py
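The key line here is vectorizer.stop_words = vectorizer.get_stop_words().union(myStopwords): get_stop_words() resolves the 'english' shortcut to the actual frozenset, and because the vectorizer reads its stop_words attribute at fit time, the enlarged set takes effect on the subsequent fit_transform. A brief usage sketch (the corpus is hypothetical, purely for illustration; note that vectorizaCorpus exits on an empty vocabulary, so the documents must contain some non-stop words):

corpus = ['astronomy students like telescopes',
          'students study astronomy and physics']
vectorizer, invVoc, sumaTfidf = vectorizaCorpus(corpus, minDf=1)
# 'like' (custom list) and 'and' (English list) are filtered out
print(sorted(invVoc.values()))  # ['astronomy', 'physics', 'students', 'study', 'telescopes']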

Example 2: create_stopword_list

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def create_stopword_list(extra_words):
	"""
	Creates stopword list (adds extra words to original English set)
	"""
	from sklearn.feature_extraction.text import TfidfVectorizer
	original = list(TfidfVectorizer(stop_words='english').get_stop_words())
	if extra_words:
		return frozenset(original + extra_words)
	else:
		return frozenset(original)
Developer ID: davidabelman, Project: GuardianVizFlask, Lines of code: 12, Source file: general_functions.py
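A quick usage sketch (the extra words are arbitrary illustrations):

stops = create_stopword_list(['guardian', 'www'])
print('guardian' in stops)  # True -- added word
print('the' in stops)       # True -- from the built-in English list
print(type(stops))          # <class 'frozenset'>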

Example 3: make_stop_words

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def make_stop_words():
    '''
        Join a hard-coded list of user-created stop words with the Tfidf 'english' stop words.

        INPUT:
        - None

        OUTPUT:
        - New master list of stop words combining the user list and the model's built-in list
    '''
    new_stop_words = ['ha', "\'s", 'tt', 'ireach', "n\'t", 'wo', 'pv', 'tm', 'anite', 'rabichev', 'russell', '603', 'hana', 'atmel', 'radwin', 'se', 'doxee', 'lantto', 'publ', 'fpc1025', '855', 'il', '0344']
    # create temporary TfidfVectorizer object
    tfidf_temp = TfidfVectorizer(stop_words='english')
    # get Tfidf 'english' stop words from model
    stop_words = tfidf_temp.get_stop_words()
    # combine two lists of stop words
    result = list(stop_words) + new_stop_words
    return result
Developer ID: NashC, Project: startup_press_release_analyzer, Lines of code: 20, Source file: model.py
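Because the function returns a plain list, it can be passed straight back into a new vectorizer's stop_words parameter, which accepts any list of strings. A minimal sketch with a made-up document, using the older get_feature_names() API that these examples already rely on:

stop_list = make_stop_words()
vectorizer = TfidfVectorizer(stop_words=stop_list)
X = vectorizer.fit_transform(['the hana press release announced new funding'])
# 'the' (English list) and 'hana' (user list) are both filtered
print(vectorizer.get_feature_names())  # ['announced', 'funding', 'new', 'press', 'release']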

Example 4: math_stop

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def math_stop():
    '''Add math-specific words to the standard stop list'''
    tfidf = TfidfVectorizer(stop_words='english')
    Stop = set(tfidf.get_stop_words())
    Stop.update(['theorem', 'denote', 'like', 'thank', 'lemma', 'proof',
                 'sum', 'difference', 'corollary', 'hand',
                 'product', 'multiple', 'let', 'group',
                 'prime', 'log', 'limit', 'cid', 'result',
                 'main', 'conjecture', 'case', 'suppose',
                 'function', 'assume', 'follows',
                 'given', 'define', 'note', 'defined', 'class',
                 'proposition', 'set',
                 'primes', 'numbers', 'form', 'integers', 'curves',
                 'real', 'using', 'following', 'obtain', 'prove',
                 'definition', 'large', 'small', 'action',
                 'bound', 'sufficiently', 'subject', 'non',
                 'mathematics'])
    return list(Stop)
Developer ID: ColinFerguson, Project: Project, Lines of code: 21, Source file: math_scraping_and_recommending_functions.py
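As with the previous example, the returned list plugs directly into a vectorizer; going through a set first also means any duplicated entries are collapsed (a minimal sketch):

stop = math_stop()
vec = TfidfVectorizer(stop_words=stop)
X = vec.fit_transform(['we prove the theorem using induction on primes'])
print(vec.get_feature_names())  # ['induction'] -- every other token is a stop word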

Example 5: make_stop_words

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
def make_stop_words(new_words_list):
	# combine Tfidf's built-in 'english' stop words with the caller's extra words
	tfidf_temp = TfidfVectorizer(stop_words='english')
	stop_words = tfidf_temp.get_stop_words()
	result = list(stop_words) + new_words_list
	return result
Developer ID: NashC, Project: prnewswire_api, Lines of code: 7, Source file: prnews_mini.py

Example 6: range

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
for l in range(len(pos_trainer)):   # loop header restored; the excerpt starts mid-loop
    pos_train = pos_train+['positive']
for l in range(len(neg_trainer)):
    neg_train = neg_train+['negative']
y_train = pos_train+neg_train
X_train = tfidf.fit_transform(pos_trainer+neg_trainer)
#n_tf = tfidf.fit_transform(neg_trainer)

clf = MultinomialNB().fit(X_train, y_train)
#clf = MultinomialNB().partial_fit(n_tf, neg_train)

#clf.partial_fit(n_tf, neg_train)



fn = tfidf.get_feature_names()
stop = tfidf.get_stop_words()

if statement == 'red':
    comment_list = redditor.redditor()


    for comment in range(len(comment_list)):
        statement = comment_list[comment]
        X_test = tfidf.transform(np.array([statement]))
        predicted = clf.predict(X_test)
        prob = clf.predict_proba(X_test)
        print "Tweet: " + str(l)
        print statement
        print predicted, prob

X_test = tfidf.transform(np.array([statement]))
Developer ID: scsilver, Project: moodjournal, Lines of code: 33, Source file: twitter_city_text_processing.py

Example 7: open

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
pickle.dump( from_data, open("your_email_authors.pkl", "w") )





### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer

transformer = TfidfVectorizer(stop_words="english")
theFit = transformer.fit_transform(word_data)
print('theFit', theFit)
afterTransform = theFit.toarray()
print(afterTransform)

stopWords = transformer.get_stop_words()
# print('stopWords', len(stopWords))

featureNames = transformer.get_feature_names()
# print(featureNames[34597])
print('featureNames', len(featureNames))


# shove matrix gained from TfidfVectorizer into
# k-means clustering
# just wanted to experiment
from sklearn.cluster import MiniBatchKMeans
clf = MiniBatchKMeans(n_clusters=4, n_init=10, max_iter=300)
clf.fit(afterTransform)
pred = clf.predict(afterTransform)
print('PRED', pred)
Developer ID: jeremysandor, Project: ud120-projects, Lines of code: 33, Source file: vectorize_text.py

Example 8: open

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if name=='sara':
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )


# print word_data[152]



### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfv = TfidfVectorizer(stop_words="english", lowercase=True)
v_word_data = tfidfv.fit_transform(word_data)
print len(tfidfv.get_feature_names())
print tfidfv.get_feature_names()
print tfidfv.get_feature_names()[34597]
print tfidfv.get_stop_words()


Developer ID: hushenglang, Project: ud120-projects, Lines of code: 30, Source file: vectorize_text.py

Example 9: len

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_stop_words [as alias]
	all_docs.append(read_merge_data(ff))

print all_docs
print len(all_docs)

tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english')

try:
	results = tfidf_vec.fit_transform(all_docs)
	print results.get_shape()	
	result_as_array = results.toarray()
	feature_names = tfidf_vec.get_feature_names()

	total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10)
	print total_highest_scores	
	print tfidf_vec.get_stop_words()
except ValueError:
	pass

# documents = read_data(directory)
# first_five = documents[0:100]
# tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=1, stop_words='english')

# try:
# 	results = tfidf_vec.fit_transform(first_five)
# 	print results.get_shape()	
# 	result_as_array = results.toarray()
# 	feature_names = tfidf_vec.get_feature_names()

# 	total_highest_scores = get_highest_scoring_feature(result_as_array, feature_names, -10)
# 	print total_highest_scores	
Developer ID: tomlai19852004, Project: DSPersonal, Lines of code: 33, Source file: tf-idf-corpus.py


Note: The sklearn.feature_extraction.text.TfidfVectorizer.get_stop_words method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by many programmers; copyright of the source code belongs to its original authors. Please refer to each project's license before distributing or using the code; do not reproduce without permission.