This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.fit_transform. If you are wondering how exactly TfidfVectorizer.fit_transform works, how to call it, or where to find real examples of it, the curated code samples here may help. You can also explore further usage examples of the class the method belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
Below are 15 code examples of the TfidfVectorizer.fit_transform method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
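Before the collected examples, here is a minimal, self-contained sketch of a typical fit_transform call. The corpus and parameter choices are purely illustrative and are not taken from any of the projects quoted below.

from sklearn.feature_extraction.text import TfidfVectorizer

# Illustrative corpus; any iterable of strings works as input.
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "never jump over the lazy dog quickly",
]

vectorizer = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and IDF weights from the corpus and
# returns a sparse (n_documents, n_features) matrix of TF-IDF scores.
X = vectorizer.fit_transform(corpus)
print(X.shape)
print(sorted(vectorizer.vocabulary_))  # terms kept after stop-word removal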
Example 1: main
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def main():
    # create the tweets dataframe (tfidf is the project's own helper module)
    tweets = tfidf.build_corpus_from_csv(dataFile)
    # keep just a list of the raw tweet texts
    tweets_only = [tweet for tweet in tweets['Tweet']]
    # define the stop-word set
    stopset = set(stopwords.words('english'))
    # tokenize the tweets in place
    tweets['Tweet'] = tfidf.tokenize_corpus(tweets['Tweet'], stopset)
    # print the 10 most frequent words for each tweet
    get_most_frequent_words(tweets, 10)
    ##############################
    # create the vectorizer
    vectorizer = TfidfVectorizer(input='content', stop_words=stopset)
    # fit the vectorizer on the raw tweet texts
    vectorizer.fit_transform(tweets_only)
    # get the learned feature names
    tweet_features = vectorizer.get_feature_names()
    # generate a frequency distribution for each tweet
    freqs = []
    indices = []
    for (num, entry) in tweets.iterrows():
        freqs.append(FreqDist(entry['Tweet']))
        indices.append(num)
    # loop over the features and insert the frequencies into the dataframe
    for feature in tweet_features:
        tweets[feature] = pd.Series(
            [fd[feature] for fd in freqs],
            index=indices
        )
    # write the result out as a csv
    tweets.to_csv('frequencies.csv')
Example 2: get_features
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)
    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # Calculates the n most important topics of the bodies. Each topic contains all
    # words, ordered by importance. The more important topic words of a certain topic
    # a body contains, the higher its value for this topic.
    # (n_topics was renamed to n_components in newer scikit-learn releases.)
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)
    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # use the LDA trained on the body topics on the headlines => if the headlines
    # and bodies share topics, their topic vectors should be similar
    lda_head_matrix = lda_body.transform(X_train_head)
    #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate the cosine distance between each body and its headline
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # a 1-d array is deprecated as input
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
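The reshape((1, -1)) calls in Example 2 exist because cosine_distances expects 2-D inputs. Below is a tiny standalone sketch of just that step; the topic distributions are made-up numbers rather than real LDA output.

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

head_topics = np.array([0.7, 0.2, 0.1]).reshape((1, -1))  # shape (1, n_topics)
body_topics = np.array([0.6, 0.3, 0.1]).reshape((1, -1))

# A small distance means the headline and body lean toward the same topics.
dist = cosine_distances(head_topics, body_topics).flatten()
print(dist.tolist())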
Example 3: readFile
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def readFile(filename):
    global vectorizer
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    clean_train = []
    for i in range(0, train_size):
        # filter is presumably the project's own review-cleaning helper
        # (the builtin filter takes two arguments)
        clean_train.append(filter(train_data['review'][i]))
        #if i % 1000 == 0:
        #    print('%d reviews processed...' % i)
    #vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features=50000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        # reuse the vocabulary learned on the training set; note that this
        # recomputes the IDF weights on the new data rather than reusing the trained ones
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
    print(train_data_feature.shape)
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
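Example 3 (and Examples 4 and 6 below) rebuilds a second TfidfVectorizer from the training vocabulary and calls fit_transform on the new data, which recomputes the IDF weights from that data. When the fitted training vectorizer is still in scope, the more common route is to call its transform method, which reuses both the vocabulary and the training IDF weights. A minimal sketch with illustrative data:

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["great movie", "terrible movie", "great acting"]  # illustrative data
test_docs = ["terrible acting", "great terrible movie"]

vectorizer = TfidfVectorizer(sublinear_tf=True)
X_train = vectorizer.fit_transform(train_docs)  # learns vocabulary and IDF weights
X_test = vectorizer.transform(test_docs)        # reuses both on the new documents
print(X_train.shape, X_test.shape)              # same number of feature columns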
Example 4: feature_tfidf
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def feature_tfidf(train_lines, test_lines, train_text_index, test_text_index):
    start = time.time()
    train_text_arr, forward_train, comment_train, like_train = file_to_arr(train_lines, train_text_index, 'train')
    test_text_arr = file_to_arr(test_lines, test_text_index, 'test')
    end = time.time()
    print('train and test file to array finished in: ' + str(end - start))
    start = time.time()
    # debug start
    # train_text_arr_nozero = []
    # comment_train_nozero = []
    # for i in range(len(comment_train)):
    #     if int(comment_train[i]) != 0:
    #         train_text_arr_nozero.append(train_text_arr[i])
    #         comment_train_nozero.append(comment_train[i])
    # train_text_arr = train_text_arr_nozero
    # comment_train = comment_train_nozero
    # debug end
    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    tfidf_train = tv.fit_transform(train_text_arr)
    # reuse the training vocabulary for the test set (the IDF weights are recomputed on the test data)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test = tv2.fit_transform(test_text_arr)
    end = time.time()
    print('train and test array to tfidf feature finished in: ' + str(end - start))
    return tfidf_train, tfidf_test, forward_train, comment_train, like_train
Example 5: createTDIDF
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]
    train_set = sorted(list(set(train_set1 + train_set2)))
    # Create a dictionary to look up a movie's index
    dictTrain = dict()
    for i, movie in enumerate(train_set):
        dictTrain[movie] = i
    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)
    ## Tri-grams (character 3-grams; the variables are named "Bigrams" but lenGram is 3)
    lenGram = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i + lenGram] for i in range(len(mov) - 1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))
    train_setBigrams = sorted(list(set(train_setBigrams)))
    dictTrainBigrams = dict()
    for i, movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)
    return [tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram]
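Example 5 assembles the character tri-gram strings by hand before vectorizing them. As a side note, TfidfVectorizer can produce character n-grams itself via analyzer='char' and ngram_range; the sketch below shows that built-in route on made-up titles. It is not what the original project does, and the tokenization differs slightly (for instance, whitespace is included in the n-grams).

from sklearn.feature_extraction.text import TfidfVectorizer

titles = ["the matrix", "the matrix reloaded", "blade runner"]  # illustrative data

# analyzer='char' makes the vectorizer split each string into overlapping
# character tri-grams instead of words.
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
tfidf_chars = char_vectorizer.fit_transform(titles)
print(tfidf_chars.shape)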
Example 6: readFile
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def readFile(filename):
    global vectorizer
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    clean_train = []
    for i in range(0, train_size):
        # filter is presumably the project's own review-cleaning helper
        clean_train.append(filter(train_data['review'][i]))
        if i % 1000 == 0:
            print('%d reviews processed...' % i)
    from sklearn.feature_extraction.text import TfidfVectorizer
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.9, ngram_range=(1, 3), max_features=100000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        # reuse the training vocabulary on the new data
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
    print(train_data_feature.shape)
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
Example 7: get_bow_vect_data_test
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def get_bow_vect_data_test(classif_data):
    vect = TfidfVectorizer()
    vect.fit_transform([classif_data["corpus"]])
    # Before we begin, get rid of any test articles with no topic
    vect_token_sets = []
    vect_test_sets = []
    # Transform testing and training data
    for i in classif_data["train_tokens"]:
        vect_token_sets.append(vect.transform([i]).toarray())
    for i in classif_data["test_tokens"]:
        vect_test_sets.append(vect.transform([i]).toarray())
    train_set = []
    test_set = []
    for i in vect_token_sets:
        train_set.append(i[0])
    for i in vect_test_sets:
        test_set.append(i[0])
    return {
        "vectorizer": vect,
        "train_vect": train_set,
        "test_vect": test_set
    }
Example 8: classify_svm
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def classify_svm(text):
    coarse_X = sets['coarse_training_qs']
    coarse_Y = sets['coarse_training_targets']
    fine_X = sets['fine_training_qs']
    fine_Y = sets['fine_training_targets']
    vectz = TfidfVectorizer(min_df=2, decode_error="ignore")
    coarse_X = vectz.fit_transform(coarse_X)
    # note: this second fit_transform refits the same vectorizer on the fine questions,
    # so the coarse features stay consistent only if both sets share the same vocabulary
    fine_X = vectz.fit_transform(fine_X)
    array_to_classify = vectz.transform([text]).toarray()
    # coarse classifier
    svm_coarse = SVC(C=1000, gamma=0.001, kernel='rbf')
    svm_coarse.fit(coarse_X, coarse_Y)
    # predict
    coarse_predict = svm_coarse.predict(array_to_classify)
    # fine classifier
    svm_fine = SVC(C=1000, gamma=0.001, kernel='rbf')
    svm_fine.fit(fine_X, fine_Y)
    # predict
    fine_predict = svm_fine.predict(array_to_classify)
    results = {}
    results['coarse_class'] = coarse_predict[0]
    results['fine_class'] = fine_predict[0]
    return results
Example 9: getNewsContext
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def getNewsContext(newsObj, ent_ind, ents, vocab, window):
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''
    sentencesIn = []
    sentencesInObj = []
    entsIn = []
    # binary matrix
    indices = []
    indptr = [0]
    for news in newsObj:
        h_ent = news.h_ent
        s = makeEntText(h_ent, ent_text, ent_ind, indices, indptr, window)
        if s:
            sentencesIn.append(s)
            sentencesInObj.append(Sentence(s, news.created_at, h_ent, news.title))
        b_ent = news.b_ent
        for sentence in sent_detector.tokenize(b_ent.strip()):
            s = makeEntText(sentence, ent_text, ent_ind, indices, indptr, window)
            if s:
                sentencesIn.append(s)
                sentencesInObj.append(Sentence(s, news.created_at, sentence, news.title))
    newsVectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab,  # use_idf=False,
                                     tokenizer=lambda text: news_tokenizer(text, 'reg'))
    XN = newsVectorizer.fit_transform(sentencesIn)
    for e in ents:
        entsIn.append(ent_text[e])
    XEn = newsVectorizer.fit_transform(entsIn)
    NEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(sentencesIn), len(ents)))
    return XN, XEn, NEb, sentencesIn, sentencesInObj, ent_text
Example 10: getTweetContext
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def getTweetContext(tweetsObj, ent_ind, ents, vocab, window):
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''
    t0 = time()
    tweetsIn = []
    tweetsInObj = []
    entsIn = []
    indices = []
    indptr = [0]
    for i in tweetsObj:
        tweet = tweetsObj[i]
        tokens_ent = tweet.tokens_ent
        t = makeEntText(tokens_ent, ent_text, ent_ind, indices, indptr, window)
        if t:
            tweetsIn.append(t)
            tweetsInObj.append(tweet)
    print("append in " + str(time() - t0))
    t0 = time()
    tweetVectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab,  # use_idf=False,
                                      tokenizer=lambda text: tweet_tokenizer(text, 'reg'))
    XT = tweetVectorizer.fit_transform(tweetsIn)
    print("vectorize in " + str(time() - t0))
    t0 = time()
    for e in ents:
        entsIn.append(ent_text[e])
    XEt = tweetVectorizer.fit_transform(entsIn)
    print("ents append + vec in " + str(time() - t0))
    TEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(tweetsIn), len(ents)))
    return XT, XEt, TEb, tweetsIn, tweetsInObj, ent_text
Example 11: Q3Transformer
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
class Q3Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
    class variables: self.col; self.vectorizer
    '''
    def __init__(self):
        self.col = 'categories'  # initialize the column name

    def fit(self, X, y=None):
        # pick the column
        pick_category = pick(self.col, X)
        category_train = [' '.join(list(pick_category[i].values())[0]) for i in range(0, len(pick_category))]
        # fit the vectorizer on the training records
        self.vectorizer = TfidfVectorizer(min_df=1)
        self.vectorizer.fit_transform(category_train)
        return self

    def transform(self, X):
        # transform the test record(s)
        if type(X) is list:
            pick_category = pick(self.col, X)
            category_X = [' '.join(list(pick_category[i].values())[0]) for i in range(0, len(pick_category))]
        else:
            category_X = [' '.join(X[self.col])]
        X_trans = self.vectorizer.transform(category_X)
        return X_trans
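Example 11 wraps TfidfVectorizer in a custom transformer so that it plugs into scikit-learn's fit/transform protocol. For reference, when the input is already a plain list of strings, the vectorizer can sit directly in a Pipeline; a small illustrative sketch with made-up documents and labels:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

docs = ["cheap pizza places", "fine dining", "late night pizza", "romantic dinner"]
labels = [0, 1, 0, 1]  # illustrative labels

# The pipeline fits the vectorizer and the classifier in one call.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1)),
    ('clf', LogisticRegression()),
])
pipeline.fit(docs, labels)
print(pipeline.predict(["pizza for dinner"]))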
Example 12: Classifier
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
class Classifier(object):
    def __init__(self):
        self.classifier = LogisticRegression(intercept_scaling=100)
        self.vectorizer = TfidfVectorizer()

    def trainvectorizer(self, corpus):
        self.vectorizer.fit_transform(corpus)
        names = self.vectorizer.get_feature_names()
        print(len(names))
        with open("feature_names.txt", "w", encoding="utf8") as file1:
            for name in names:
                file1.write(name + "\n")
        print("vectorizer training is over....")

    def trainclassifier(self, train_X, train_Y):
        self.classifier.fit(train_X, train_Y)
        print("classifier training is over....")

    def getfeature(self, text):  # return a feature array
        matrx = self.vectorizer.transform([text]).toarray()
        array = matrx[0]
        return array

    def getresult(self, feature):  # return True or False
        return self.classifier.predict(feature)
Example 13: doTFIDF
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def doTFIDF(train, test1, test2):
    steemedTrain = stemIt(train)
    steemedTest1 = stemIt(test1)
    steemedTest2 = stemIt(test2)
    print("done stemming tweets")
    regTrain = processIt(train)
    regTest1 = processIt(test1)
    regTest2 = processIt(test2)
    # word-level 1- to 3-gram TF-IDF on the unstemmed tweets
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(regTrain)
    Xtest1 = vectorizer.transform(regTest1)
    Xtest2 = vectorizer.transform(regTest2)
    scipy.io.mmwrite('train_reg_dataM', X, field='real')
    scipy.io.mmwrite('test1_reg_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_reg_dataM', Xtest2, field='real')
    # the same features computed on the stemmed tweets
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(steemedTrain)
    Xtest1 = vectorizer.transform(steemedTest1)
    Xtest2 = vectorizer.transform(steemedTest2)
    scipy.io.mmwrite('train_stem_dataM', X, field='real')
    scipy.io.mmwrite('test1_stem_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_stem_dataM', Xtest2, field='real')
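The matrices Example 13 writes with scipy.io.mmwrite (which appends the .mtx extension when none is given) can be loaded back with scipy.io.mmread, which returns a sparse COO matrix. A short sketch, assuming the training matrix above was written to disk:

import scipy.io

# mmread returns a scipy.sparse COO matrix; convert to CSR for row slicing.
X_train = scipy.io.mmread('train_reg_dataM.mtx').tocsr()
print(X_train.shape)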
Example 14: tfidf_score
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def tfidf_score(train_set, test_set):
    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    # Remove all the None entries from the input datasets
    train_set = list(filter(None, train_set))
    test_set = list(filter(None, test_set))
    vectorizer.fit_transform(train_set)
    #print("Word Index is {0} \n".format(vectorizer.vocabulary_))
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print("IDF scores:", tfidf.idf_)
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
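The pairwise_similarity product in Example 14 works because L2-normalized TF-IDF rows turn dot products into cosine similarities. A minimal illustrative sketch of that shortcut on its own, relying on TfidfVectorizer's default norm='l2':

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)   # rows are L2-normalized by default

# Dot products of unit-length rows are cosine similarities.
pairwise_similarity = X * X.T
print(pairwise_similarity.toarray())  # diagonal entries equal 1 (each document with itself)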
Example 15: classify
# Required import module: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit_transform [as alias]
def classify(good_deals, bad_deals, dictionary):
    # drop very low-frequency words from the vocabulary
    word_with_low_freq = [word for word in dictionary.elements() if dictionary[word] < 1]
    for word in word_with_low_freq:
        del dictionary[word]
    tfidf_vectorizer = TfidfVectorizer(vocabulary=dictionary)
    good_tfidf = tfidf_vectorizer.fit_transform(good_deals)
    bad_tfidf = tfidf_vectorizer.fit_transform(bad_deals)
    good_tfidf = good_tfidf.todense()
    bad_tfidf = bad_tfidf.todense()
    svm_data = []
    svm_data.append(good_tfidf)
    svm_data.append(bad_tfidf)
    svm_data = np.concatenate(svm_data)
    svm_pos_labels = np.ones(len(good_tfidf))
    svm_neg_labels = np.zeros(len(bad_tfidf))
    labels = []
    labels.append(svm_pos_labels)
    labels.append(svm_neg_labels)
    svm_labels = np.concatenate(labels)
    param_grid = [
        {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['rbf']},
    ]
    svc = svm.SVC()
    # sklearn.grid_search is the pre-0.18 module path; newer releases use sklearn.model_selection
    clf = grid_search.GridSearchCV(estimator=svc, param_grid=param_grid, n_jobs=1)
    print("Training SVM classifier over a grid of C and gamma values to select the best parameters\n")
    clf.fit(svm_data, svm_labels)
    print("svm score", clf.best_score_)
    print("svm gamma value", clf.best_estimator_.gamma)
    print("svm C value", clf.best_estimator_.C)
    print("svm kernel", clf.best_estimator_.kernel)
    return clf
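Example 15 uses the old sklearn.grid_search module, which newer scikit-learn releases replace with sklearn.model_selection. A small self-contained sketch of the same kind of grid search with the newer import, using illustrative data in place of the deals corpus:

import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

good_deals = ["half price laptop", "free shipping on shoes"]   # illustrative data
bad_deals = ["extended warranty upsell", "sign up for spam"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(good_deals + bad_deals)
y = np.array([1, 1, 0, 0])

# Search over C, gamma and kernel; cv=2 keeps the folds valid for this tiny dataset.
param_grid = {'C': [1, 10, 100], 'gamma': [1, 0.1, 0.001], 'kernel': ['linear', 'rbf']}
clf = GridSearchCV(svm.SVC(), param_grid, cv=2)
clf.fit(X, y)
print(clf.best_score_, clf.best_params_)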