

Python TfidfVectorizer.get_feature_names Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.get_feature_names. If you are wondering what exactly TfidfVectorizer.get_feature_names does, or how to use it, the curated code examples below should help. You can also explore further usage examples of the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.


The sections below present 15 code examples of the TfidfVectorizer.get_feature_names method, sorted by popularity by default.
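Before diving into the collected examples, here is a minimal, self-contained sketch of the typical call pattern; the three toy documents are invented for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer

# three toy documents, invented for this demo
docs = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "the cat chased the dog",
]

vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(docs)  # sparse matrix of shape (n_docs, n_terms)
print(vectorizer.get_feature_names())  # one vocabulary term per matrix column, in column order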

Example 1: test_text_vectorization

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test_text_vectorization():
    mongo_dataset = MongoHC("hc", "re0")
    data = [d for d in mongo_dataset.get_all(order_by="id_doc")]
    text = [d["text"] for d in data[1:2]]
    tfidf_vectorizer = TfidfVectorizer(
        max_df=1,
        max_features=200000,
        min_df=1,
        stop_words="english",
        strip_accents="unicode",
        use_idf=True,
        ngram_range=(1, 1),
        norm="l2",
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print(tfidf_vectorizer.get_feature_names())
    print(tfidf_matrix.data)

    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print(indices)
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]

    print(len(features))
    print(tfidf_matrix.shape)
    print(top_features)
Developer: Neuro17, Project: LOD-doc-clustering, Lines: 29, Source: test.py

Example 2: printLSA

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
    def printLSA(self):
        corpus = []
        for message in self.message_list:
            corpus += message.text
#         for message in self.message_list:
#             for text in message.text:
#                 corpus.append(text)
        #tfidf stuff
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        X = vectorizer.fit_transform(corpus)
        idf = vectorizer.idf_
        #lsa stuff
        lsa = TruncatedSVD(n_components=27, n_iter=100)
        lsa.fit(X)
    
        print(dict(zip(vectorizer.get_feature_names(), idf)))
        print("")
        
        #print related concepts
        terms = vectorizer.get_feature_names()
        for i, comp in enumerate(lsa.components_): 
            termsInComp = zip (terms,comp)
            sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
            print("Concept %d:" % i)
            for term in sortedTerms:
                print(term[0])
            print(" ")
        
        #print sorted stuff to see    
        v = sorted(zip(vectorizer.get_feature_names(), idf), key=lambda x:x[1])
        print(v)
        print("\n\n")
Developer: KendraCB, Project: sunshine, Lines: 34, Source: Candidate.py

Example 3: getFeatures

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def getFeatures(tweets, vocabularyWords):
	"""
		Gets the features (word count, represented as a sparse matrix), 
		where we can recover the particular feature labels.

		We then weight features via Tf-idf terms. (http://en.wikipedia.org/wiki/Tf%E2%80%93idf)

		See: http://scikit-learn.org/dev/modules/feature_extraction.html#text-feature-extraction
	"""
	from sklearn.feature_extraction.text import TfidfVectorizer

	vectorizer = TfidfVectorizer(vocabulary = vocabularyWords, ngram_range = (1, 3))
	features = vectorizer.fit_transform(tweets)

	# print "features are: "
	# print features.toarray()
	print("features length is: ")
	print(len(features.toarray()[0]))

	# print "feature names are: "
	# print vectorizer.get_feature_names()
	print("feature names length is: ")
	print(len(vectorizer.get_feature_names()))

	return (features.toarray(), vectorizer.get_feature_names())
Developer: mbartoli, Project: NLP-portfolio, Lines: 27, Source: svm-tweets.py
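For context, a hypothetical call to the function above might look like this (the tweet list and vocabulary are invented):

tweets = ["love this game", "great game tonight", "love the team"]
vocabulary = ["love", "game", "team", "great"]
matrix, feature_names = getFeatures(tweets, vocabulary)  # one row per tweet, one column per vocabulary n-gram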

Example 4: text_to_vectors

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def text_to_vectors(dirname_or_textdata,test_dirname_or_textdata=None,ngram_range=(1, 1),verbose=False):
    if isinstance(dirname_or_textdata,str):
        textdata=load_files(dirname_or_textdata,verbose)
    else:
        textdata=dirname_or_textdata

    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    vectors = vectorizer.fit_transform(textdata.data)
    
    data=Struct()
    data.vectorizer=vectorizer
    data.vectors=vectors
    data.targets=textdata.targets
    data.target_names=textdata.target_names
    data.feature_names=vectorizer.get_feature_names()
    
    if test_dirname_or_textdata is not None:
        if isinstance(test_dirname_or_textdata,str):
            textdata=load_files(test_dirname_or_textdata,verbose)
        else:
            textdata=test_dirname_or_textdata

        test_vectors = vectorizer.transform(textdata.data)
        test_data=Struct()
        test_data.vectorizer=vectorizer
        test_data.vectors=test_vectors
        test_data.targets=textdata.targets
        test_data.target_names=textdata.target_names
        test_data.feature_names=vectorizer.get_feature_names()
        
        return data,test_data
    else:
        return data
Developer: bblais, Project: Classy, Lines: 36, Source: text.py

Example 5: test2

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test2():
    with codecs.open('/home/zhangwj/Applications/Scrapy/baike/files/data_fenci.txt', 'rb', encoding='utf-8') as f:
        data_samples = f.read().splitlines()  # one pre-segmented document per line
    n_features = 1000
    # CountVectorizer turns the words of the text into a term-frequency matrix via fit_transform;
    # TfidfTransformer then computes the tf-idf weight of every word in the vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=[u"應該"])  # stop_words must be 'english' or a list of terms
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  # sparse matrix, [n_samples, n_features], tf-idf-weighted document-term matrix
    tfidf_vectorizer.get_feature_names()  # the matrix above is samples x features; this call returns the feature names; each sample is one document
Developer: zhangweijiqn, Project: testPython, Lines: 11, Source: testTFIDF.py

Example 6: main

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    stop_words = set(stopwords.words('spanish')).union(set(['http','www','san', '099','098','096','097']))
    #stop_words = [word.decode('utf-8') for word in stopwords.words('spanish')]#stopwords.words("spanish")
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                     min_df=2, stop_words=set(stop_words),
                                     use_idf=True)

    text = []

    with open(sample_file, 'rb') as csvfile:
         reader = csv.reader(csvfile)
         for row in reader:
             text.append(row[1])

    t0 = time()
    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)

    idf = vectorizer.idf_
    words = dict(zip(vectorizer.get_feature_names(), idf))

    terms = sorted(words, key=words.__getitem__)[0:10]

    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X,  documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=id2words)
    print("done in %fs" % (time() - t0))

    #write json version
    json_data = {"terms":terms,"topics":None}
    json_topics = []
    for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
        topic = {}
        topic['name']= "topic" + str(i)
        topic['children']= []
        for weight,term in item:
            child = {}
            child['name'] = term
            child['weight'] = weight
            topic['children'].append(child)
            #output_text.append( term + " : " + str(weight) )
        json_topics.append(topic)
    json_data['topics'] = json_topics

    with open(outputfile + ".json", 'w') as outfile:
        json.dump(json_data, outfile)
Developer: obernal, Project: mining-emergency-reports, Lines: 58, Source: tfidf_topics_data_extraction.py

Example 7: LoadDocuments

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def LoadDocuments(fname, collect_links):
    crawl_data, urls, titles, relationships = pages_to_mem(fname, collect_links)
    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,2), sublinear_tf=True)
    term_tfidf = tfidfVect.fit_transform(crawl_data)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    term_b = dict(izip(i, xrange(len(dict_values))))    # dictionary of words and indices (Python 2 izip/xrange)

    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,2))
    title_tfidf = tfidfVect.fit_transform(titles)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    title_b = dict(izip(i, xrange(len(dict_values))))    # dictionary of words and indices
    return title_tfidf, title_b, term_tfidf, term_b, urls, relationships
Developer: jayluan, Project: IndexMapReduce, Lines: 16, Source: SklearnTfidf.py

Example 8: tfidf_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def tfidf_vectorizer(codex,
                     max_df=1,      # int max_df/min_df count documents; use floats (e.g. 1.0) for proportions
                     min_df=0,
                     stop_words='english',
                     train_split=False
                     ):
    """
        Calculate term frequency for words in all comments 

        Input:  text string (nouns only from noun_tokenizer)
        Output: transformed input, term list from tfidf, model
    """

    #Select english stopwords
    cachedStopWords = set(stopwords.words("english"))

    #Add words to stopwords list
    cachedStopWords.update(('and','I','A','And','So','arnt','This','When','It',\
                            'many','Many','so','cant','Yes','yes','No','no',\
                            'These','these','',' ','ok','na', 'edit','idk',\
                            'gon','wasnt','yt','sure','watch','whats','youre',\
                            'theyll','anyone'
                            ))
    if train_split:
        #Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,\
                                     min_df=min_df,\
                                     stop_words=cachedStopWords\
                                     )
        x_train, x_test = train_test_split(codex)

        #Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(x_train)

        #Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
    else:
        #Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,\
                                     min_df=min_df,\
                                     stop_words=cachedStopWords
                                     )
        
        #Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(codex)

        #Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
Developer: mastraut, Project: reddit_social_popularity_graph, Lines: 52, Source: nlp_extractors.py
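A hypothetical call on a small list of comment strings (the comments are invented; NLTK's stopword corpus must already be downloaded, and max_df is overridden with a float because the integer default of 1 would keep only terms that appear in a single comment):

comments = ["this thread is great", "great point, thanks", "thanks for sharing this"]
X, terms, model = tfidf_vectorizer(comments, max_df=1.0)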

Example 9: test1

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def test1():
    n_samples = 2000
    n_features = 1000
    print("Loading dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  #sparse matrix, [n_samples, n_features],Tf-idf-weighted document-term matrix.
    tfidf_vectorizer.get_feature_names()  # the matrix above is samples x features; this call returns the feature names; each sample is one document
Developer: zhangweijiqn, Project: testPython, Lines: 17, Source: testTFIDF.py

Example 10: rocchio

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def rocchio(request):
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer
	from sklearn.utils.extmath import randomized_svd
	from sklearn import feature_selection
	import pandas as pd
	document_index = []
	s = SessionStore()
	sessionData = db.sessionHistory.find_one({"session_key":s.session_key})
	urls_visited = sessionData['url_visited']
	urls = []
	for url in urls_visited:
		urls.append(url[0])
	bodyContentList = db.crawledCollection.find({'url':{"$in":urls}}, {'body':1})
	body = []
	terms = []
	for x in bodyContentList:
		body.append(re.sub(r'[!@#$%^&*()\[\]./<>?\\|`~=_+0-9-]', '', x['body']))  # strip punctuation and digits

	# Turning the body content into a bag of words
	top_features=[]
	
	vectorizer = TfidfVectorizer(stop_words = 'english')
	X = vectorizer.fit_transform(body)
	indices = np.argsort(vectorizer.idf_)[::-1]
	features = vectorizer.get_feature_names()
	top_n = 10
	top_features.append([features[i] for i in indices[:top_n]])

	print(top_features)
	
	vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
	dtm = vectorizer.fit_transform(body)

	index=pd.DataFrame(dtm.toarray(),index=body,columns=vectorizer.get_feature_names())
	indexterms=vectorizer.get_feature_names()
	
	transform=TfidfTransformer()
	tfidf=transform.fit_transform(dtm)
	
	U, Sigma, V = randomized_svd(tfidf, n_components=5,
                                      n_iter=5, transpose=True,
                                      random_state=None)
	

	#getting the highest count of words and adding it to the query
	return HttpResponse(top_features)
Developer: ashiq-techie, Project: lsiSearch, Lines: 50, Source: views.py

Example 11: cluster

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def cluster(data, k):

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=['nfl','game','team'])

    td_matrix = vectorizer.fit_transform(data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_jobs=-1)
    km.fit(td_matrix)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    def count(acc,value):
        acc[value] += 1
        return acc

    cluster_counts = reduce(count, km.labels_, [0]*k)

    #_max = (0,0)
    #for i in range(0,len(cluster_counts)):
    #    if _max[1] < cluster_counts[i]:
    #        _max = (i,cluster_counts[i])

    #print _max[0], _max[1], float(_max[1]) / len(data)
    # print counts

    result = []

    for i in reversed(numpy.array(cluster_counts).argsort()):
        x = [float(cluster_counts[i])/len(data)]
        for ind in order_centroids[i, :10]:
            x.append(terms[ind])
        result.append(x)

    return result
Developer: maroy, Project: TSTA, Lines: 35, Source: nfl_clusterer_sqlite.py

Example 12: get_tfidf_model

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
	def get_tfidf_model(self, dirname):
		data = Sentences(dirname)
		tfidf_vectorizer = TfidfVectorizer(stop_words='english')
		tfidf_matrix = tfidf_vectorizer.fit_transform(data)
		mat_array = tfidf_matrix.toarray()
		fn = tfidf_vectorizer.get_feature_names()
		return tfidf_vectorizer
Developer: sagar3LOQ, Project: utils, Lines: 9, Source: model_topN.py

Example 13: tfidf_word_match_share

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def tfidf_word_match_share(question1, question2):
    qs = question1 + question2
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(qs)
    feature_names = tfidf_vectorizer.get_feature_names()
    # dense = tfidf_matrix.todense()
    # word_index_dict = dict((j, i) for i, j in enumerate(feature_names))

    tf_idf = []
    for q1, q2 in zip(question1, question2):
        q1words = {}
        q2words = {}
        for word in str(q1).lower().split():
            if word not in stops:  # `stops` is assumed to be a module-level set of stopwords
                q1words[word] = 1
        for word in str(q2).lower().split():
            if word not in stops:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            tf_idf.append([0])
        else:
            q1_tfidf = tfidf_vectorizer.transform([" ".join(q1words.keys())])
            q2_tfidf = tfidf_vectorizer.transform([" ".join(q2words.keys())])
            inter = np.intersect1d(q1_tfidf.indices, q2_tfidf.indices)
            shared_weights = 0
            for word_index in inter:
                shared_weights += (q1_tfidf[0, word_index] + q2_tfidf[0, word_index])
            total_weights = q1_tfidf.sum() + q2_tfidf.sum()
            if np.sum(total_weights) == 0:
                tf_idf.append([0])
            else:
                score = np.sum(shared_weights) / np.sum(total_weights)
                tf_idf.append([round(score, 2)])
    print("Created tf_idf features feature")
    return np.array(tf_idf)
Developer: andra-pumnea, Project: Thesis, Lines: 37, Source: feature_module.py
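A hypothetical call (the question pairs are invented, and `stops` must be defined at module level as noted in the code):

q1 = ["how do i learn python fast", "is python good for nlp"]
q2 = ["best way to learn python", "how does tf idf work in python"]
scores = tfidf_word_match_share(q1, q2)  # shape (2, 1) array of shared tf-idf weight ratios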

Example 14: get_salience_matrix

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
def get_salience_matrix(keys, salient_set):
    """ run test set on salient terms """
    salient_feats = []
    tfidf = TfidfVectorizer(stop_words="english")
    top_n = 100
    for key in keys:
        salience_test = []
        top_terms = []
        history = clean(tweets[str(key)]["audience"]["user"]["history"])[1:]
        # print len(history)
        try:
            teeeff = tfidf.fit_transform(history)
            indices = np.argsort(tfidf.idf_)[::-1]
            features = tfidf.get_feature_names()
            top_terms = [features[i] for i in indices[:top_n]]
        except Exception:  # an empty history can leave the vectorizer with no vocabulary
            top_terms = []

        for term in salient_set:
            if term in top_terms:
                salience_test.append(1)
            else:
                salience_test.append(0)
        salient_feats.append(salience_test)
    return np.array(salient_feats)
Developer: GavinNaiz, Project: Thesis, Lines: 27, Source: ahst.py

Example 15: __init__

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import get_feature_names [as alias]
class Train:
    """Using non-negative matrix factorization to learn the vector of a document"""
    def __init__(self,filename_in):
        self.text = []
        for line in open(filename_in,'rb'):
            self.text.append(line.strip().decode('utf-8'))

    def train(self,n_topics=10):
        self.vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.6)
        tfidf = self.vectorizer.fit_transform(self.text)
        n_samples = len(self.text)
        print("Fitting the NMF model with n_samples=%d and n_features=%d..."
            % (n_samples,n_topics))
        self.nmf = NMF(n_components = n_topics, random_state = 1).fit(tfidf)

    def show_result(self,n_top_words=10):
        feature_names = self.vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    def __str__(self):
        return '%d topics' % np.shape(self.nmf.components_)[0]  # components_ has shape (n_topics, n_features)
Developer: panjinbo, Project: JD_CV_Match, Lines: 27, Source: Train.py

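Compatibility note: get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2, so on a current release every example above needs a one-line change (a sketch, not part of the original projects):

feature_names = vectorizer.get_feature_names_out()  # returns an ndarray of str instead of a list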

Note: The sklearn.feature_extraction.text.TfidfVectorizer.get_feature_names examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and distribution and use are governed by each project's License. Do not repost without permission.