

Python TfidfVectorizer.fit_transform Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.fit_transform. If you have been wondering what TfidfVectorizer.fit_transform does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of sklearn.feature_extraction.text.TfidfVectorizer, the class this method belongs to.


The following shows 15 code examples of TfidfVectorizer.fit_transform, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
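Before the individual examples, here is a minimal, self-contained sketch of the basic fit_transform call. The three-document corpus is invented purely for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer

# a tiny illustrative corpus
docs = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs and cats make good pets",
]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)  # learn the vocabulary and IDF, return a sparse TF-IDF matrix
print(X.shape)  # (3, n_features): one row per document, one column per vocabulary term
print(vectorizer.get_feature_names_out())  # vocabulary in column order (get_feature_names() in the older scikit-learn used by the examples below)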

Example 1: main

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def main():
    # create tweets dataframe
    tweets = tfidf.build_corpus_from_csv(dataFile)
    # create just a list of tweets
    tweets_only = [tweet for tweet in tweets['Tweet']]
    # define stopset
    stopset = set(stopwords.words('english'))
    # tokenize the tweets in place
    tweets['Tweet'] = tfidf.tokenize_corpus(tweets['Tweet'], stopset)
    # print the 10 most frequent words for each tweet
    get_most_frequent_words(tweets, 10)

    ##############################

    # create vectorizer
    vectorizer = TfidfVectorizer(input='content', stop_words=stopset)
    # fit the vectorizer
    vectorizer.fit_transform(tweets_only)
    # get feature names
    tweet_features = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0

    # Generate frequency distributions for each tweet
    freqs = []
    indices = []
    for (num, entry) in tweets.iterrows():
        freqs.append(FreqDist(entry['Tweet']))
        indices.append(num)
    # loop over the features and insert frequencies into the dataframe
    for feature in tweet_features:
        tweets[feature] = pd.Series(
            [fd[feature] for fd in freqs],
            index=indices
        )
    # output a csv
    tweets.to_csv('frequencies.csv')
Author: csbailey5t, Project: nobles, Lines: 37, Source: count.py

Example 2: get_features

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # Compute the n most important topics of the bodies. Each topic ranks all words
        # by importance; the more words of a given topic a body contains, the higher the
        # body's value for that topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)  # n_topics was renamed n_components in newer scikit-learn

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the LDA trained on the body topics to transform the headlines => if the
        # headlines and bodies share topics, their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
Author: paris5020, Project: athene_system, Lines: 34, Source: topic_models.py
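As a side note, the per-pair loop at the end of this example can be collapsed into a single vectorized call. A sketch with made-up topic matrices (a suggested alternative, not code from the original project):

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

# made-up LDA outputs: one topic distribution per headline/body pair
lda_head_matrix = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
lda_body_matrix = np.array([[0.6, 0.3, 0.1], [0.2, 0.1, 0.7]])

# row-wise cosine distance between each headline and its corresponding body
X = paired_cosine_distances(lda_head_matrix, lda_body_matrix).reshape(-1, 1).tolist()
print(X)  # one single-element list per pair, matching the format built by the loop above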

Example 3: readFile

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def readFile(filename):
    
    global vectorizer
    
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    
    
    
    clean_train = []
    for i in range(0, train_size):
        # filter() here is a project-local text-cleaning helper, not the builtin
        clean_train.append(filter(train_data['review'][i]))
        #if i % 1000 == 0:
        #    print('%d reviews processed...' % i)
   
    
    #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features=50000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        # reuse the training vocabulary so the test features align with the train features
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
        

    print(train_data_feature.shape)
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
Author: OliverKehl, Project: word2vec, Lines: 32, Source: better.py
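This example (like examples 4 and 6 below) aligns test features with training features by building a second TfidfVectorizer from the fitted vocabulary_. That keeps the columns aligned but recomputes IDF from the test corpus; the more common pattern is to keep the fitted vectorizer and call transform, which reuses both the vocabulary and the training IDF weights. A minimal sketch with hypothetical data:

from sklearn.feature_extraction.text import TfidfVectorizer

# hypothetical stand-ins for the cleaned review lists used in this example
train_texts = ["a great movie", "a terrible movie", "great acting overall"]
test_texts = ["terrible acting", "a great film"]

vectorizer = TfidfVectorizer(sublinear_tf=True)
X_train = vectorizer.fit_transform(train_texts)  # fit vocabulary and IDF on the training data
X_test = vectorizer.transform(test_texts)        # reuse the vocabulary AND the learned IDF weights
print(X_train.shape, X_test.shape)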

Example 4: feature_tfidf

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def feature_tfidf(train_lines, test_lines, train_text_index, test_text_index):
    start = time.time()
    train_text_arr, forward_train, comment_train, like_train = file_to_arr(train_lines, train_text_index, 'train')

    test_text_arr = file_to_arr(test_lines, test_text_index, 'test')
    end = time.time()
    print('train and test file to array finished in: ' + str(end - start))
    start = time.time()
    # debug start
    # train_text_arr_nozero = []
    # comment_train_nozero = []
    # for i in range(len(comment_train)):
    #     if int(comment_train[i]) != 0:
    #         train_text_arr_nozero.append(train_text_arr[i])
    #         comment_train_nozero.append(comment_train[i])
    # train_text_arr = train_text_arr_nozero
    # comment_train = comment_train_nozero
    # debug end

    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    tfidf_train = tv.fit_transform(train_text_arr)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test = tv2.fit_transform(test_text_arr)
    end = time.time()
    print('train and test array to tfidf feature finished in: ' + str(end - start))
    return tfidf_train, tfidf_test, forward_train, comment_train, like_train
Author: uotter, Project: weibo, Lines: 28, Source: lang.py

Example 5: createTDIDF

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]

    train_set = sorted(list(set(train_set1 + train_set2)))
    # Create dictionary to find movie
    dictTrain = dict()
    for i,movie in enumerate(train_set):
        dictTrain[movie] = i

    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    ## Character tri-grams (note: the 'Bigrams' names below actually hold trigrams, since lenGram = 3)
    lenGram = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i+lenGram] for i in range(len(mov)-1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))

    train_setBigrams = sorted(list(set(train_setBigrams)))
    dictTrainBigrams = dict()
    for i,movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)

    return [tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram]
Author: jgarciab, Project: matchString, Lines: 35, Source: matchStrings.py
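Incidentally, TfidfVectorizer can build the character trigrams itself via analyzer='char'. A sketch with made-up titles (a suggested alternative; the resulting features are close to, though not byte-for-byte identical with, the manually sliced ones above):

from sklearn.feature_extraction.text import TfidfVectorizer

titles = ["the matrix", "toy story"]  # made-up movie titles

# analyzer='char' + ngram_range=(3, 3) replaces the manual slicing loop
tfidf_vectorizerTrigrams = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
tfidf_matrix_trainTrigrams = tfidf_vectorizerTrigrams.fit_transform(titles)
print(tfidf_vectorizerTrigrams.get_feature_names_out())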

Example 6: readFile

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def readFile(filename):
    global vectorizer
    
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    
    clean_train = []
    for i in range(0, train_size):
        # filter() here is a project-local text-cleaning helper, not the builtin
        clean_train.append(filter(train_data['review'][i]))
        if i % 1000 == 0:
            print('%d reviews processed...' % i)
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.9, ngram_range=(1, 3), max_features=100000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        # reuse the training vocabulary so the test features align with the train features
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
        

    print(train_data_feature.shape)
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
Author: OliverKehl, Project: word2vec, Lines: 28, Source: grid_search_svm.py

Example 7: get_bow_vect_data_test

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def get_bow_vect_data_test(classif_data):
	vect = TfidfVectorizer()
	vect.fit_transform([classif_data["corpus"]])

	#Before we begin, get rid of any test articles with no topic
	vect_token_sets = []
	vect_test_sets = []

	#Transform testing and training data
	for i in classif_data["train_tokens"]:
		vect_token_sets.append(vect.transform([i]).toarray())

	for i in classif_data["test_tokens"]:
		vect_test_sets.append(vect.transform([i]).toarray())


	train_set = []
	test_set = []
	for i in vect_token_sets:
		train_set.append(i[0])
	for i in vect_test_sets:
		test_set.append(i[0])

	return {
		"vectorizer": vect,
		"train_vect": train_set,
		"test_vect": test_set
	}
Author: tangohead, Project: CS909-Project, Lines: 30, Source: helper.py

Example 8: classify_svm

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def classify_svm(text):

	coarse_X = sets['coarse_training_qs']
	coarse_Y = sets['coarse_training_targets']
	fine_X = sets['fine_training_qs']
	fine_Y = sets['fine_training_targets']

	# fit a separate vectorizer per task: refitting a single vectorizer re-learns the
	# vocabulary, leaving coarse_X misaligned with the vectors built at predict time
	vectz_coarse = TfidfVectorizer(min_df=2, decode_error="ignore")
	vectz_fine = TfidfVectorizer(min_df=2, decode_error="ignore")
	coarse_X = vectz_coarse.fit_transform(coarse_X)
	fine_X = vectz_fine.fit_transform(fine_X)
	coarse_to_classify = vectz_coarse.transform([text]).toarray()
	fine_to_classify = vectz_fine.transform([text]).toarray()

	
	# coarse
	svm_coarse = SVC(C=1000, gamma = 0.001, kernel='rbf')
	svm_coarse.fit(coarse_X, coarse_Y)
	# predict
	coarse_predict = svm_coarse.predict(coarse_to_classify)

	# fine
	svm_fine = SVC(C=1000, gamma = 0.001, kernel='rbf')
	svm_fine.fit(fine_X, fine_Y)
	# predict
	fine_predict = svm_fine.predict(fine_to_classify)

	results={}
	results['coarse_class'] = coarse_predict[0] 
	results['fine_class'] = fine_predict[0]

	return results
Author: el9335, Project: QUAILS_1.0, Lines: 32, Source: serv.py

Example 9: getNewsContext

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def getNewsContext(newsObj,ent_ind,ents,vocab,window):          
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''

    sentencesIn = []            
    sentencesInObj= []            
    entsIn = []

    # binary matrix
    
    indices = []
    indptr = [0]
    for news in newsObj:
        h_ent = news.h_ent
        s = makeEntText(h_ent,ent_text,ent_ind,indices,indptr,window)
        if s:
            sentencesIn.append( s )
            sentencesInObj.append(Sentence(s,news.created_at,h_ent,news.title))
        b_ent = news.b_ent
        for sentence in sent_detector.tokenize(b_ent.strip()):
            s = makeEntText(sentence,ent_text,ent_ind,indices,indptr,window)
            if s:
                sentencesIn.append( s )
                sentencesInObj.append(Sentence(s,news.created_at,sentence,news.title))
    newsVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False,
        tokenizer=lambda text: news_tokenizer(text,'reg'))
    XN = newsVectorizer.fit_transform(sentencesIn)

    for e in ents:
        entsIn.append(ent_text[e])
    XEn = newsVectorizer.fit_transform(entsIn)    

    NEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(sentencesIn),len(ents) ))
    return XN,XEn,NEb,sentencesIn,sentencesInObj,ent_text
Author: alwayforver, Project: tweetNews, Lines: 37, Source: eknot_utils.py

Example 10: getTweetContext

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def getTweetContext(tweetsObj,ent_ind,ents,vocab,window):          
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''

    t0 = time()
    tweetsIn = []            
    tweetsInObj = []            
    entsIn = []
    indices = []
    indptr = [0]
    for i in tweetsObj:
        tweet = tweetsObj[i]
        tokens_ent = tweet.tokens_ent
        t = makeEntText(tokens_ent,ent_text,ent_ind,indices,indptr,window)
        if t:
            tweetsIn.append( t )
            tweetsInObj.append( tweet )

    print( "append in "+str(time() - t0))
    t0 = time()
    tweetVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False,
        tokenizer=lambda text: tweet_tokenizer(text,'reg'))
    XT = tweetVectorizer.fit_transform(tweetsIn) 
    print( "vectorize in "+str(time() - t0))
    t0 = time()
    for e in ents:
        entsIn.append(ent_text[e])
    XEt = tweetVectorizer.fit_transform(entsIn)    
    print( "ents append + vec in "+str(time() - t0))

    TEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(tweetsIn),len(ents) ))
    return XT,XEt,TEb,tweetsIn,tweetsInObj,ent_text
Author: alwayforver, Project: tweetNews, Lines: 35, Source: eknot_utils.py
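Examples 9 and 10 both assemble the binary entity-incidence matrix straight from CSR components (data, indices, indptr). A minimal sketch of that construction with made-up indices:

import numpy as np
from scipy.sparse import csr_matrix

# three rows: row 0 references entities 0 and 2, row 1 references entity 1, row 2 references none
indices = [0, 2, 1]           # column index of every nonzero, laid out row by row
indptr = [0, 2, 3, 3]         # row i's nonzeros live in indices[indptr[i]:indptr[i+1]]
data = np.ones(len(indices))  # every nonzero is 1, giving a binary incidence matrix

NEb = csr_matrix((data, indices, indptr), shape=(3, 3))
print(NEb.toarray())
# [[1. 0. 1.]
#  [0. 1. 0.]
#  [0. 0. 0.]]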

Example 11: Q3Transformer

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
class Q3Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        instance attributes: self.col, self.vectorizer
    '''
    def __init__(self):      
        self.col = 'categories' # initialize the column name

    def fit(self, X, y=None):
        # pick the column
        pick_category = pick(self.col, X)
        category_train = [' '.join(list(pick_category[i].values())[0]) for i in range(0, len(pick_category))]
        
        # transform the training records
        self.vectorizer = TfidfVectorizer(min_df=1)  
        self.vectorizer.fit_transform(category_train)
        
        return self

    
    def transform(self, X):
        # transform the test record
        if type(X) is list:
            pick_category = pick(self.col, X)
            category_X = [' '.join(list(pick_category[i].values())[0]) for i in range(0, len(pick_category))]
        else:
            category_X = [' '.join(X[self.col])]
        
        X_trans = self.vectorizer.transform(category_X)
        return X_trans 
Author: FangMath, Project: MachineLearning_Mini_Project, Lines: 31, Source: Models_ml.py

Example 12: Classifier

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
class Classifier(object):
	def __init__(self):

		self.classifier = LogisticRegression(intercept_scaling=100)
		self.vectorizer = TfidfVectorizer()
	
	def trainvectorizer(self,corpus):
		
		self.vectorizer.fit_transform(corpus)
		file1 = open("feature_names.txt","w")
		names = self.vectorizer.get_feature_names()
		print len(names)
		for name in names:
			file1.write(name.encode('utf8')+"\n")
		file1.close()
		print "vectrizer train is over...."


	def trainclassifier(self,train_X,train_Y):
		
		self.classifier.fit(train_X,train_Y)
		print "classifier train is over ...."

	def getfeature(self,text):#return a feature array
		matrx = self.vectorizer.transform([text]).toarray()
		array = matrx[0]
		return array
		
	def getresult(self,feature):#return true or false
		
		return self.classifier.predict(feature)
Author: zhoujiaxing, Project: Classifier1, Lines: 33, Source: classifier.py

Example 13: doTFIDF

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def doTFIDF(train, test1, test2):
	steemedTrain = stemIt(train)
	steemedTest1 = stemIt(test1)
	steemedTest2 = stemIt(test2)
	print "done stemming tweets"

	regTrain = processIt(train)
	regTest1 = processIt(test1)
	regTest2 = processIt(test2)

	vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

	X = vectorizer.fit_transform(regTrain) 
	Xtest1 = vectorizer.transform(regTest1)
	Xtest2 = vectorizer.transform(regTest2)
	scipy.io.mmwrite('train_reg_dataM',X, field='real')
	scipy.io.mmwrite('test1_reg_dataM',Xtest1, field='real')
	scipy.io.mmwrite('test2_reg_dataM',Xtest2, field='real')

	vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

	X = vectorizer.fit_transform(steemedTrain) 
	Xtest1 = vectorizer.transform(steemedTest1)
	Xtest2 = vectorizer.transform(steemedTest2)
	scipy.io.mmwrite('train_stem_dataM',X, field='real')
	scipy.io.mmwrite('test1_stem_dataM',Xtest1, field='real')
	scipy.io.mmwrite('test2_stem_dataM',Xtest2, field='real')
Author: Krimit, Project: ml_project, Lines: 29, Source: ingest.py

Example 14: tfidf_score

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def tfidf_score(train_set, test_set):

    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    #Remove all the None entries from the input datasets
    train_set = list(filter(None, train_set))
    test_set = list(filter(None, test_set))
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)  # note: smatrix is already TF-IDF weighted, so this applies a second round of IDF weighting
    #print("IDF scores:", tfidf.idf_)
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:  # note: mlist/cos_sim rows come from test_set, so train_set and test_set must align one-to-one
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
Author: nAk123, Project: mailgist, Lines: 29, Source: FExtractor.py

Example 15: classify

# Required module import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def classify(good_deals,bad_deals,dictionary):
    # dictionary is expected to be a collections.Counter; elements() yields each word repeated by its count
    word_with_low_freq = [word for word in dictionary.elements() if dictionary[word] < 1]
    for word in word_with_low_freq:
        del dictionary[word]
    
    tfidf_vectorizer = TfidfVectorizer(vocabulary=dictionary)
    good_tfidf = tfidf_vectorizer.fit_transform(good_deals)
    # note: refitting recomputes IDF on bad_deals; the fixed vocabulary keeps the columns aligned
    bad_tfidf = tfidf_vectorizer.fit_transform(bad_deals)
    good_tfidf = good_tfidf.todense()
    bad_tfidf = bad_tfidf.todense()
    svm_data = []
    svm_data.append(good_tfidf)
    svm_data.append(bad_tfidf)
    svm_data = np.concatenate(svm_data)
    svm_pos_lables = np.ones(len(good_tfidf))
    svm_neg_lables = np.zeros(len(bad_tfidf))
    labels= []
    labels.append(svm_pos_lables)
    labels.append(svm_neg_lables)
    svm_labels  = np.concatenate(labels)
    
    param_grid = [
                  {'C': [1, 10, 100, 1000], 'gamma': [1,0.1,0.001, 0.0001],'kernel': ['linear']},
                  {'C': [1, 10, 100, 1000], 'gamma': [1,0.1,0.001, 0.0001], 'kernel': ['rbf']},
                  ]
    svc = svm.SVC()
    clf = grid_search.GridSearchCV(estimator=svc, param_grid=param_grid, n_jobs=1)  # sklearn.grid_search became sklearn.model_selection in newer scikit-learn
    print "Training SVM classifier for grid of C and gamma values to select best parameter\n"
    clf.fit(svm_data,svm_labels)
    print "svm score",clf.best_score
    print "svm gamma value",clf.best_estimator.gamma
    print "svm C value",clf.best_estimator.C
    print "svm kernel",clf.best_estimator.kernel
    return clf
Author: jatinbhikadiya, Project: textClassification, Lines: 36, Source: task3.py


Note: The sklearn.feature_extraction.text.TfidfVectorizer.fit_transform method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors. For distribution and use, please refer to the corresponding project's License. Do not reproduce without permission.