This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_feature_names. If you have been wondering what exactly TfidfVectorizer.get_feature_names does and how to use it, the curated code examples below may help. You can also read more about the class the method belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
Below are 15 code examples of TfidfVectorizer.get_feature_names, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
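All 15 examples share the same basic pattern: fit a TfidfVectorizer on an iterable of documents, then call get_feature_names() to map column indices of the resulting sparse matrix back to terms. As a minimal, self-contained sketch (the two toy documents are invented for illustration; note that recent scikit-learn releases deprecate get_feature_names() in favour of get_feature_names_out()):

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus; every example below follows the same fit -> inspect pattern.
docs = [
    "the cat sat on the mat",
    "the dog ate my homework",
]

vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform(docs)  # sparse (n_docs, n_terms) matrix

# Older scikit-learn exposes the vocabulary as get_feature_names();
# newer releases rename it to get_feature_names_out().
try:
    terms = vectorizer.get_feature_names_out()
except AttributeError:
    terms = vectorizer.get_feature_names()

print(list(terms))   # ['ate', 'cat', 'dog', 'homework', 'mat', 'sat']
print(tfidf.shape)   # (2, 6)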
Example 1: test_text_vectorization
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test_text_vectorization():
    mongo_dataset = MongoHC("hc", "re0")
    data = [d for d in mongo_dataset.get_all(order_by="id_doc")]
    text = [d["text"] for d in data[1:2]]
    tfidf_vectorizer = TfidfVectorizer(
        max_df=1,
        max_features=200000,
        min_df=1,
        stop_words="english",
        strip_accents="unicode",
        use_idf=True,
        ngram_range=(1, 1),
        norm="l2",
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data
    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]
    print len(features)
    print tfidf_matrix.shape
    print top_features
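The idf-ranking trick in Example 1 (argsort over idf_ to surface the rarest terms) is reusable on its own; a small hedged sketch with an invented corpus:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def top_terms_by_idf(corpus, top_n=5):
    # Highest idf = terms that occur in the fewest documents.
    vec = TfidfVectorizer(stop_words="english")
    vec.fit(corpus)
    order = np.argsort(vec.idf_)[::-1]
    features = vec.get_feature_names()
    return [features[i] for i in order[:top_n]]

corpus = ["grey cat", "grey dog", "grey parrot", "grey cat again"]
print(top_terms_by_idf(corpus, top_n=3))  # rare terms such as 'parrot', 'dog', 'again' come first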
Example 2: printLSA
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def printLSA(self):
    corpus = []
    for message in self.message_list:
        corpus += message.text
    # for message in self.message_list:
    #     for text in message.text:
    #         corpus.append(text)
    #tfidf stuff
    vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    #lsa stuff
    lsa = TruncatedSVD(n_components=27, n_iter=100)
    lsa.fit(X)
    print dict(zip(vectorizer.get_feature_names(), idf))
    print ""
    #print related concepts
    terms = vectorizer.get_feature_names()
    for i, comp in enumerate(lsa.components_):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
        print "Concept %d:" % i
        for term in sortedTerms:
            print term[0]
        print " "
    #print sorted terms to inspect
    v = sorted(zip(vectorizer.get_feature_names(), idf), key=lambda x: x[1])
    print v
    print "\n\n"
Example 3: getFeatures
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def getFeatures(tweets, vocabularyWords):
    """
    Gets the features (word count, represented as a sparse matrix),
    where we can recover the particular feature labels.
    We then weight features via Tf-idf terms. (http://en.wikipedia.org/wiki/Tf%E2%80%93idf)
    See: http://scikit-learn.org/dev/modules/feature_extraction.html#text-feature-extraction
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(vocabulary=vocabularyWords, ngram_range=(1, 3))
    features = vectorizer.fit_transform(tweets)
    # print "features are: "
    # print features.toarray()
    print "features length is: "
    print len(features.toarray()[0])
    # print "feature names are: "
    # print vectorizer.get_feature_names()
    print "feature name lengths are: "
    print len(vectorizer.get_feature_names())
    return (features.toarray(), vectorizer.get_feature_names())
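A possible way to call Example 3 (both the tweets and the vocabulary list are invented here; in the original project vocabularyWords presumably comes from an earlier preprocessing step):

tweets = ["i love this game", "i hate this game", "love it"]
vocabularyWords = ["love", "hate", "game", "love game"]

matrix, names = getFeatures(tweets, vocabularyWords)
print(names)         # exactly the supplied vocabulary, in index order
print(matrix.shape)  # (3, 4): one row per tweet, one column per vocabulary entry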
Example 4: text_to_vectors
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def text_to_vectors(dirname_or_textdata, test_dirname_or_textdata=None, ngram_range=(1, 1), verbose=False):
    if isinstance(dirname_or_textdata, str):
        textdata = load_files(dirname_or_textdata, verbose)
    else:
        textdata = dirname_or_textdata
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    vectors = vectorizer.fit_transform(textdata.data)
    data = Struct()
    data.vectorizer = vectorizer
    data.vectors = vectors
    data.targets = textdata.targets
    data.target_names = textdata.target_names
    data.feature_names = vectorizer.get_feature_names()
    if test_dirname_or_textdata is not None:
        if isinstance(test_dirname_or_textdata, str):
            textdata = load_files(test_dirname_or_textdata, verbose)
        else:
            textdata = test_dirname_or_textdata
        test_vectors = vectorizer.transform(textdata.data)
        test_data = Struct()
        test_data.vectorizer = vectorizer
        test_data.vectors = test_vectors
        test_data.targets = textdata.targets
        test_data.target_names = textdata.target_names
        test_data.feature_names = vectorizer.get_feature_names()
        return data, test_data
    else:
        return data
Example 5: test2
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test2():
    with codecs.open('/home/zhangwj/Applications/Scrapy/baike/files/data_fenci.txt', 'rb', encoding='utf-8') as f:
        data_samples = f.readlines()  # one document per line; fit_transform expects an iterable of documents
    n_features = 1000
    # CountVectorizer turns the words in each document into a term-frequency matrix via fit_transform;
    # TfidfTransformer then computes the tf-idf weight of every term.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=[u"應該"])  # stop_words must be 'english' or a list of terms
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  # sparse matrix [n_samples, n_features]: tf-idf-weighted document-term matrix
    tfidf_vectorizer.get_feature_names()  # the matrix above is samples x features; this call returns the feature names (one sample = one document)
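The translated comments above describe TfidfVectorizer as CountVectorizer followed by TfidfTransformer; a short sketch of that equivalence on an invented corpus:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

corpus = ["one fish two fish", "red fish blue fish"]

# Two steps: raw term counts, then tf-idf weighting.
counts = CountVectorizer().fit_transform(corpus)
tfidf_two_step = TfidfTransformer().fit_transform(counts)

# One step: TfidfVectorizer produces the same weighted matrix.
tfidf_one_step = TfidfVectorizer().fit_transform(corpus)

print(np.allclose(tfidf_two_step.toarray(), tfidf_one_step.toarray()))  # True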
Example 6: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    stop_words = set(stopwords.words('spanish')).union(set(['http', 'www', 'san', '099', '098', '096', '097']))
    #stop_words = [word.decode('utf-8') for word in stopwords.words('spanish')]#stopwords.words("spanish")
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                 min_df=2, stop_words=set(stop_words),
                                 use_idf=True)
    text = []
    with open(sample_file, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            text.append(row[1])
    t0 = time()
    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    idf = vectorizer.idf_
    words = dict(zip(vectorizer.get_feature_names(), idf))
    terms = sorted(words, key=words.__getitem__)[0:10]
    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word
    t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=id2words)
    print("done in %fs" % (time() - t0))
    # write JSON version
    json_data = {"terms": terms, "topics": None}
    json_topics = []
    for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
        topic = {}
        topic['name'] = "topic" + str(i)
        topic['children'] = []
        for weight, term in item:
            child = {}
            child['name'] = term
            child['weight'] = weight
            topic['children'].append(child)
            #output_text.append( term + " : " + str(weight) )
        json_topics.append(topic)
    json_data['topics'] = json_topics
    with open(outputfile + ".json", 'w') as outfile:
        json.dump(json_data, outfile)
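The scikit-learn-to-gensim bridge in Example 6 (Sparse2Corpus plus an id-to-word dict built from get_feature_names) also works in isolation; a trimmed, hedged sketch assuming gensim is installed and using a toy corpus:

from gensim import matutils, models
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["cats chase mice", "dogs chase cats", "mice eat cheese", "dogs eat bones"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# scikit-learn stores documents as rows, hence documents_columns=False.
corpus = matutils.Sparse2Corpus(X, documents_columns=False)
id2word = {i: w for i, w in enumerate(vectorizer.get_feature_names())}

lda = models.LdaModel(corpus, num_topics=2, id2word=id2word)
print(lda.print_topics(num_topics=2, num_words=3))  # top words per topic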
Example 7: LoadDocuments
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def LoadDocuments(fname, collect_links):
    crawl_data, urls, titles, relationships = pages_to_mem(fname, collect_links)
    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 2), sublinear_tf=True)
    term_tfidf = tfidfVect.fit_transform(crawl_data)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    term_b = dict(izip(i, xrange(len(dict_values))))  # dictionary of words and indices
    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 2))
    title_tfidf = tfidfVect.fit_transform(titles)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    title_b = dict(izip(i, xrange(len(dict_values))))  # dictionary of words and indices
    return title_tfidf, title_b, term_tfidf, term_b, urls, relationships
Example 8: tfidf_vectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def tfidf_vectorizer(codex,
                     max_df=1,
                     min_df=0,
                     stop_words='english',
                     train_split=False
                     ):
    """
    Calculate term frequency for words in all comments
    Input: text string (nouns only from noun_tokenizer)
    Output: transformed input, term list from tfidf, model
    """
    # Select english stopwords
    cachedStopWords = set(stopwords.words("english"))
    # Add words to stopwords list
    cachedStopWords.update(('and', 'I', 'A', 'And', 'So', 'arnt', 'This', 'When', 'It',
                            'many', 'Many', 'so', 'cant', 'Yes', 'yes', 'No', 'no',
                            'These', 'these', '', ' ', 'ok', 'na', 'edit', 'idk',
                            'gon', 'wasnt', 'yt', 'sure', 'watch', 'whats', 'youre',
                            'theyll', 'anyone'
                            ))
    if train_split:
        # Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,
                                     min_df=min_df,
                                     stop_words=cachedStopWords
                                     )
        x_train, x_test = train_test_split(codex)
        # Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(x_train)
        # Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
    else:
        # Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,
                                     min_df=min_df,
                                     stop_words=cachedStopWords
                                     )
        # Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(codex)
        # Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
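A possible call into Example 8 (the comment strings are invented; the NLTK English stop-word corpus is assumed to be downloaded). Note that the defaults max_df=1 and min_df=0 prune any term appearing in more than one document, so gentler values are usually needed:

comments = [
    "this video was great and I loved it",
    "terrible video, waste of time",
    "great content, great editing",
]

X, terms, model = tfidf_vectorizer(comments, max_df=0.9, min_df=1)
print(terms)    # word tokens that survived the stop-word and document-frequency filters
print(X.shape)  # (3, len(terms))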
Example 9: test1
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test1():
    n_samples = 2000
    n_features = 1000
    print("Loading dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  # sparse matrix [n_samples, n_features]: tf-idf-weighted document-term matrix
    tfidf_vectorizer.get_feature_names()  # the matrix above is samples x features; this call returns the feature names (one sample = one document)
Example 10: rocchio
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def rocchio(request):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.utils.extmath import randomized_svd
    from sklearn import feature_selection
    import pandas as pd
    document_index = []
    s = SessionStore()
    sessionData = db.sessionHistory.find_one({"session_key": s.session_key})
    urls_visited = sessionData['url_visited']
    urls = []
    for url in urls_visited:
        urls.append(url[0])
    bodyContentList = db.crawledCollection.find({'url': {"$in": urls}}, {'body': 1})
    body = []
    terms = []
    for x in bodyContentList:
        body.append(re.sub('[!@#$%^&*()[]./<>?\|`~-=_+]0-9', '', x['body']))
    # Turning the body content into a bag of words
    top_features = []
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(body)
    indices = np.argsort(vectorizer.idf_)[::-1]
    features = vectorizer.get_feature_names()
    top_n = 10
    top_features.append([features[i] for i in indices[:top_n]])
    print top_features
    vectorizer = CountVectorizer(min_df=1, stop_words='english')
    dtm = vectorizer.fit_transform(body)
    index = pd.DataFrame(dtm.toarray(), index=body, columns=vectorizer.get_feature_names())
    indexterms = vectorizer.get_feature_names()
    transform = TfidfTransformer()
    tfidf = transform.fit_transform(dtm)
    U, Sigma, V = randomized_svd(tfidf, n_components=5,
                                 n_iter=5, transpose=True,
                                 random_state=None)
    # getting the highest-count words and adding them into the query
    return HttpResponse(top_features)
Example 11: cluster
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def cluster(data, k):
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=['nfl', 'game', 'team'])
    td_matrix = vectorizer.fit_transform(data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_jobs=-1)
    km.fit(td_matrix)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    def count(acc, value):
        acc[value] += 1
        return acc
    cluster_counts = reduce(count, km.labels_, [0]*k)
    #_max = (0,0)
    #for i in range(0,len(cluster_counts)):
    #    if _max[1] < cluster_counts[i]:
    #        _max = (i,cluster_counts[i])
    #print _max[0], _max[1], float(_max[1]) / len(data)
    # print counts
    result = []
    for i in reversed(numpy.array(cluster_counts).argsort()):
        x = [float(cluster_counts[i])/len(data)]
        for ind in order_centroids[i, :10]:
            x.append(terms[ind])
        result.append(x)
    return result
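A hedged usage sketch for Example 11 (the articles are invented; with min_df=2 the corpus needs words that repeat across documents, and it assumes the imports and the older scikit-learn API the snippet was written against, e.g. KMeans still accepting n_jobs):

articles = [
    "quarterback throws touchdown pass",
    "quarterback scrambles for touchdown",
    "kicker makes field goal",
    "stocks rally on earnings",
    "stocks fall as earnings disappoint",
    "tech stocks lead market rally",
]

for row in cluster(articles, k=2):
    share, top_terms = row[0], row[1:]
    print(round(share, 2), top_terms)  # fraction of documents in the cluster and its top terms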
Example 12: get_tfidf_model
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def get_tfidf_model(self, dirname):
    data = Sentences(dirname)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    mat_array = tfidf_matrix.toarray()
    fn = tfidf_vectorizer.get_feature_names()
    return tfidf_vectorizer
Example 13: tfidf_word_match_share
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def tfidf_word_match_share(question1, question2):
    qs = question1 + question2
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(qs)
    feature_names = tfidf_vectorizer.get_feature_names()
    # dense = tfidf_matrix.todense()
    # word_index_dict = dict((j, i) for i, j in enumerate(feature_names))
    tf_idf = []
    for q1, q2 in zip(question1, question2):
        q1words = {}
        q2words = {}
        for word in str(q1).lower().split():
            if word not in stops:
                q1words[word] = 1
        for word in str(q2).lower().split():
            if word not in stops:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            tf_idf.append([0])
        else:
            q1_tfidf = tfidf_vectorizer.transform([" ".join(q1words.keys())])
            q2_tfidf = tfidf_vectorizer.transform([" ".join(q2words.keys())])
            inter = np.intersect1d(q1_tfidf.indices, q2_tfidf.indices)
            shared_weights = 0
            for word_index in inter:
                shared_weights += (q1_tfidf[0, word_index] + q2_tfidf[0, word_index])
            total_weights = q1_tfidf.sum() + q2_tfidf.sum()
            if np.sum(total_weights) == 0:
                tf_idf.append([0])
            else:
                score = np.sum(shared_weights) / np.sum(total_weights)
                tf_idf.append([round(score, 2)])
    print("Created tf-idf features")
    return np.array(tf_idf)
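Example 13 references a module-level stops set that the snippet never defines; a hedged usage sketch assuming it is the NLTK English stop-word set (the question pairs are invented, and min_df=3 means the corpus must contain at least one word shared by three or more questions):

import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

stops = set(stopwords.words("english"))  # assumed definition of the global used inside the function

question1 = ["How do I learn python fast?", "Is python good for machine learning?"]
question2 = ["What is the best way to learn python?", "Which python library is best for machine learning?"]

features = tfidf_word_match_share(question1, question2)
print(features.shape)  # (2, 1): one shared-tf-idf-weight ratio per question pair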
Example 14: get_salience_matrix
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def get_salience_matrix(keys, salient_set):
    """ run test set on salient terms """
    salient_feats = []
    tfidf = TfidfVectorizer(stop_words="english")
    top_n = 100
    for key in keys:
        salience_test = []
        top_terms = []
        history = clean(tweets[str(key)]["audience"]["user"]["history"])[1:]
        # print len(history)
        try:
            teeeff = tfidf.fit_transform(history)
            indices = np.argsort(tfidf.idf_)[::-1]
            features = tfidf.get_feature_names()
            top_terms = [features[i] for i in indices[:top_n]]
        except:
            top_terms = []
        for term in salient_set:
            if term in top_terms:
                salience_test.append(1)
            else:
                salience_test.append(0)
        salient_feats.append(salience_test)
    return np.array(salient_feats)
Example 15: __init__
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
class Train:
    """Using non-negative matrix factorization to learn the vector of a document"""
    def __init__(self, filename_in):
        self.text = []
        for line in open(filename_in, 'rb'):
            self.text.append(line.strip().decode('utf-8'))
    def train(self, n_topics=10):
        self.vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.6)
        tfidf = self.vectorizer.fit_transform(self.text)
        n_samples = len(self.text)
        print("Fitting the NMF model with n_samples=%d and n_features=%d..."
              % (n_samples, n_topics))
        self.nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
    def show_result(self, n_top_words=10):
        feature_names = self.vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()
    def __str__(self):
        # components_ has shape (n_topics, n_features); return a string rather than printing
        return str(np.shape(self.nmf.components_)[0]) + ' topics'
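A hedged usage sketch for Example 15 (the file path is hypothetical: one UTF-8 document per line; NMF, TfidfVectorizer and numpy are assumed to be imported as indicated in the snippet's header):

trainer = Train("documents.txt")      # hypothetical corpus file, one document per line
trainer.train(n_topics=5)             # fit tf-idf + NMF
trainer.show_result(n_top_words=8)    # print the top words of each NMF topic
print(trainer)                        # e.g. "5 topics" via __str__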