

Python TfidfVectorizer.transform Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.transform. If you are wondering what TfidfVectorizer.transform does, how to call it, or what real-world usage looks like, the curated examples below may help. You can also explore further usage examples of the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.


Below are 15 code examples of the TfidfVectorizer.transform method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
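Before diving into the examples, here is a minimal, self-contained sketch of the fit/transform workflow that every example below follows (the corpus strings are made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus (hypothetical): fit() learns the vocabulary and IDF weights
train_docs = ["the cat sat on the mat", "dogs and cats living together"]
test_docs = ["the dog sat"]

vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
vectorizer.fit(train_docs)                   # learn vocabulary + IDF from training text only
X_train = vectorizer.transform(train_docs)   # sparse matrix: documents x vocabulary
X_test = vectorizer.transform(test_docs)     # same vocabulary applied to unseen text
print(X_train.shape, X_test.shape)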

Example 1: TFID

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def TFID(data, choice):
    # Again, removing stop words increased the efficiency; same for Snowball

    if choice == 1:
        tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 3),
                              use_idf=1, smooth_idf=1, sublinear_tf=1)
        # , tokenizer=Snowball()
        print("fitting pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    elif choice == 2:
        print("Fitting char pipeline")
        tfvc = TfidfVectorizer(norm='l2', min_df=3, max_df=1.0, strip_accents='unicode',
                               analyzer='char', ngram_range=(2, 7),
                               use_idf=1, smooth_idf=1, sublinear_tf=1)
        tfvc.fit(data)
        vectc = tfvc.transform(data)
        print('vectc', vectc.shape)
        return vectc
    elif choice == 3:
        tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 2),
                              use_idf=1, smooth_idf=1, sublinear_tf=1,
                              tokenizer=LancasterTokenizer())
        # , tokenizer=Snowball()
        print("fitting pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    elif choice == 4:
        # This branch also needs: from sklearn.feature_extraction.text import CountVectorizer
        tfv = CountVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{2,}', ngram_range=(1, 3),
                              binary=True)
        print("fitting count pipeline and transforming for", len(data), "entries")
        tfv.fit(data)
        vect = tfv.transform(data)
        print(vect.shape)
        return vect
    else:
        return []
Author: apurva3000, Project: python-kaggle-hashtag, Lines: 37, Source: hashtag.py

Example 2: tfidf_ize

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1,1))
    vectorizer.fit(node_info.abstract.values)  # .as_matrix() was removed in pandas 1.0; .values is the replacement
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    vectorizer = TfidfVectorizer(ngram_range=(2,2))
    vectorizer.fit(node_info.abstract.values)  # .as_matrix() was removed in pandas 1.0; .values is the replacement
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    return train, test
Author: Leobouloc, Project: mva_link_prediction, Lines: 34, Source: main.py
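The minimum(...).sum(1) pattern in this example computes the histogram intersection kernel, k(a, b) = sum_i min(a_i, b_i), between row-aligned sparse TF-IDF matrices. A minimal sketch with made-up values:

import numpy as np
from scipy.sparse import csr_matrix

# Two row-aligned feature matrices (hypothetical values)
A = csr_matrix(np.array([[0.2, 0.0, 0.5], [0.1, 0.3, 0.0]]))
B = csr_matrix(np.array([[0.4, 0.1, 0.2], [0.0, 0.3, 0.7]]))

# Element-wise minimum, then row sums: one kernel value per row pair
k = A.minimum(B).sum(1)
print(k)  # [[0.4], [0.3]]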

Example 3: trainTFIDF2

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def trainTFIDF2(bow21features, bow2kfold, test):
    idx = (test[0][:, 0]).astype(int)
    tfv = TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='ascii', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words='english')
    pipeline = Pipeline(
        [('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)),
         ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
         ('svm',
          SVC(C=10.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001,
              cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))])
    tfidf2CrossValidationTest = None
    if toTestModel:
        tfidf2CrossValidationTest = tfidfCrossValidation(tfv, pipeline, bow2kfold)
    trainData, lblsTrain, testData, lblstest = bow21features
    tfv.fit(trainData)
    X_train = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    if isinstance(lblsTrain, list):
        lblsTrain = lblsTrain[0]
    lblsTrain = (lblsTrain.astype(int))
    pipeline.fit(X_train, lblsTrain)
    predictions = pipeline.predict(X_test)
    finalResults = pd.DataFrame({"id": idx, "prediction": predictions})
    return tfidf2CrossValidationTest, finalResults
Author: Ilya-Simkin, Project: NLP-crowdflower-assignment, Lines: 27, Source: modelCreation.py

Example 4: word_count_transform

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def word_count_transform(X_train, X_test):
	from sklearn.feature_extraction.text import TfidfVectorizer
	tfidf_vectorizer = TfidfVectorizer(stop_words='english')
	tfidf_vectorizer.fit(X_train)
	X_train_tfidf = tfidf_vectorizer.transform(X_train).todense()
	X_test_tfidf = tfidf_vectorizer.transform(X_test).todense()
	return X_train_tfidf, X_test_tfidf
Author: smartyining, Project: Yelp, Lines: 9, Source: train.py
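A hypothetical call (the texts are placeholders). Note that .todense() materializes the full document-term matrix in memory, which is only practical for small vocabularies and corpora:

train_texts = ["great food and friendly service", "terrible wait and cold food"]
test_texts = ["friendly service"]
X_train_tfidf, X_test_tfidf = word_count_transform(train_texts, test_texts)
print(X_train_tfidf.shape, X_test_tfidf.shape)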

Example 5: extract

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def extract(max_gram, feat_dims, save_model=False):
    print("extract feature")

    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=None,
                                 ngram_range=(1, max_gram), sublinear_tf=True)

    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print("size of original train features", feats_train_ori.shape)

    for feat_dim in feat_dims:
        print("perform feature selection")

        fselect = SelectKBest(chi2, k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)

        print("save features")
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim),
                 feats_train=feats_train, feats_test=feats_test,
                 labels_train=labels_train, labels_test=labels_test)

        if save_model:
            print("save models")
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)

            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
Author: Jewelryland, Project: sentiment, Lines: 32, Source: feature.py
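One caveat worth noting: np.savez stores scipy sparse matrices as pickled object arrays, which modern numpy refuses to load unless allow_pickle=True. A more robust sketch for persisting sparse features (scipy.sparse.save_npz handles one matrix per file; the matrix here is randomly generated for illustration):

import scipy.sparse

# Hypothetical stand-in for feats_train from the pipeline above
feats_train = scipy.sparse.random(100, 50, density=0.1, format='csr')
scipy.sparse.save_npz("feats_train.npz", feats_train)     # one sparse matrix per file
feats_train_back = scipy.sparse.load_npz("feats_train.npz")
print((feats_train != feats_train_back).nnz)              # 0 -> the round trip is exact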

Example 6: num_feat_select

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def num_feat_select(n, k):
    tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode',
                            tokenizer=MyTokenizer(), analyzer='word')

    tfidf.fit(train['tweet'])
    trainf = tfidf.transform(train['tweet'])
    testf = tfidf.transform(test['tweet'])
    trainlab = np.array(train.iloc[:, 4:])  # .ix is removed in modern pandas; .iloc selects by position
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(trainf, trainlab)
    print('here')
    tim = time.time()

    n = 10
    pred = []
    for i in range(0, n):
        pred.extend(knn.predict(testf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    print("time: " + str(time.time() - tim))

    # RMSE:
    testlab = np.array(test.iloc[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) / (testf.shape[0] * 24.0))))
    print(err)
Author: jefftn, Project: kaggle-twitter, Lines: 27, Source: knn.py

Example 7: processEssay

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
    def processEssay(self, testidx, trainidx):
        # process essay
        self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
        self.trdata = self.rawdata['essay'].loc[trainidx]   # .ix is removed in modern pandas; .loc selects by label
        self.testdata = self.rawdata['essay'].loc[testidx]
        trainessay = np.array(self.trdata.fillna('Missing'))
        testessay = np.array(self.testdata.fillna('Missing'))
        tfidfEs = TfidfVectorizer(min_df=4, max_features=500)
        tfidfEs.fit(trainessay)
        # =======================================================================
        # # process need statement
        # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
        # self.trdata = self.rawdata['need_statement'].loc[trainidx]
        # self.testdata = self.rawdata['need_statement'].loc[testidx]
        # trainneedst = np.array(self.trdata.fillna('Missing'))
        # testneedst = np.array(self.testdata.fillna('Missing'))
        # tfidfNs = TfidfVectorizer(min_df=3, max_features=20)
        # tfidfNs.fit(trainneedst)
        #
        # # process short desc
        # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
        # self.trdata = self.rawdata['short_description'].loc[trainidx]
        # self.testdata = self.rawdata['short_description'].loc[testidx]
        # trainshortd = np.array(self.trdata.fillna('Missing'))
        # testshortd = np.array(self.testdata.fillna('Missing'))
        # tfidfSd = TfidfVectorizer(min_df=3, max_features=20)
        # tfidfSd.fit(trainshortd)
        #
        # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay), tfidfNs.transform(trainneedst), tfidfSd.transform(trainshortd)))
        # self.exdata_test = sp.hstack((tfidfEs.transform(testessay), tfidfNs.transform(testneedst), tfidfSd.transform(testshortd)))
        # =======================================================================
        self.exdata_train = tfidfEs.transform(trainessay)  # only use the essay
        self.exdata_test = tfidfEs.transform(testessay)
Author: thusithaC, Project: KDD2014, Lines: 35, Source: dataProcess.py

Example 8: ridge_003

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def ridge_003():
    print('*** CLEANING ***')
    tfidf_wrd = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 3),
                                lowercase=True, stop_words='english', min_df=3, max_df=0.5)
    tfidf_wrd.fit(train_set['tweet'])
    X_train_wrd = tfidf_wrd.transform(train_set['tweet'])
    X_test_wrd = tfidf_wrd.transform(test_set['tweet'])

    tfidf_char = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='char', ngram_range=(4, 10),
                                lowercase=True, stop_words='english', min_df=3, max_df=0.5)
    tfidf_char.fit(train_set['tweet'])
    X_train_char = tfidf_char.transform(train_set['tweet'])
    X_test_char = tfidf_char.transform(test_set['tweet'])

    y_train = np.array(train_set.iloc[:, 4:])  # .ix is removed in modern pandas; .iloc selects by position

    print('*** TRAINING ***')
    mdl_wrd = model.ridge(X_train_wrd, y_train)
    mdl_char = model.ridge(X_train_char, y_train)

    print('*** PREDICTING ***')
    test_prediction_wrd = mdl_wrd.predict(X_test_wrd)
    test_prediction_char = mdl_char.predict(X_test_char)

    test_prediction = (test_prediction_wrd + test_prediction_char) / 2

    print('*** OUTPUTTING ***')
    output('results/ridge_003.csv', test_prediction)
Author: cyberport-kaggle, Project: hashtag-weather, Lines: 30, Source: run.py

Example 9: __init__

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
class NaiveBayes:
    def __init__(self):
        self.clf = MultinomialNB()
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True,
                                     stop_words='english', token_pattern=self.pattern,
                                     ngram_range=(2, 2))

    def train(self, fileName):
        print("Naive Bayes classifier is being trained")
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = []
        for item in table.cat:
            Y_train.append(int(item))
        self.clf.fit(X_train, Y_train)
        print("Naive Bayes classifier has been trained")

    def classify(self, cFileName, rFileName):
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print("Data have been classified")
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        print("Data have been validated! Precision={}".format((totalNum - errorCount) / float(totalNum)))
Author: richelite, Project: classify, Lines: 37, Source: lib.py
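A hypothetical usage sketch (the file names are placeholders; the training and validation files are tab-separated label/message rows, and the classification input has one message per line):

nb = NaiveBayes()
nb.train("train.tsv")                          # fit TF-IDF + MultinomialNB on labeled data
nb.classify("unlabeled.txt", "predictions.txt")  # write one predicted label per line
nb.validate("holdout.tsv")                     # print precision on a labeled holdout set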

Example 10: train_and_predict_m5

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def train_and_predict_m5 (train, test, labels) :
    # Beautiful soup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True, pretag = 'full')
    testData = modified_cleanup(test, stemmer, is_train = False, pretag = 'full')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Multinominal Naive Bayes...")
    clf = MultinomialNB(alpha = 0.03)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # param_grid = {'alpha' : [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha' : [0.01, 0.03]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay, Project: Kaggle-CrowdFlowerSRR, Lines: 30, Source: classifier.py

Example 11: train_and_predict_m6

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def train_and_predict_m6 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM6, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p = 2, n_neighbors = 5)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors' : [3, 4, 5, 6, 7], 'weights' : ['uniform', 'distance'], 'leaf_size' : [1, 3, 5, 10] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay, Project: Kaggle-CrowdFlowerSRR, Lines: 28, Source: classifier.py

Example 12: train_and_predict_m3

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def train_and_predict_m3 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM3, stemmer_type = 'porter')

    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
				
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200],  'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay, Project: Kaggle-CrowdFlowerSRR, Lines: 33, Source: classifier.py

Example 13: train_and_predict_m4

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def train_and_predict_m4 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM4, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = LogisticRegression(random_state = randomState, penalty = 'l2', C = 12, class_weight = 'auto')
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    #param_grid = {'C' : [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty' : ['l2']}
    param_grid = {'C' : [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty' : ['l2']}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay, Project: Kaggle-CrowdFlowerSRR, Lines: 27, Source: classifier.py

Example 14: train_and_predict_m1

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def train_and_predict_m1 (train, test, labels) :
    print ("Training M1 (randomState = %d) ..." % randomState)
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM1, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    vectorizer = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    vectorizer.fit(trainData)
    X =  vectorizer.transform(trainData)
    X_test = vectorizer.transform(testData)

    ## Use Stemmer post TF-IDF to check if things change
    # print (X)
    print ("X.shape: ", X.shape)
    print ("X_test.shape: ", X_test.shape)

    ## Create the pipeline
    # 07/02 - RandomizedPCA/PCA does not work on sparse input (so cannot be applied on output of Vectorizer)
    # JimingYe says LDA did not give much benefit.
    clf = Pipeline([('svd', TruncatedSVD(random_state = randomState, n_components = 330)),
                    ('scl', StandardScaler()),
                    ('svm', SVC(random_state = randomState, cache_size = 500, C = 12))])

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'svd__n_components' : [200, 250, 300], 'svm__C': [10, 12]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay, Project: Kaggle-CrowdFlowerSRR, Lines: 28, Source: classifier.py

Example 15: doTFIDF

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import transform [as alias]
def doTFIDF(train, test1, test2):
    stemmedTrain = stemIt(train)
    stemmedTest1 = stemIt(test1)
    stemmedTest2 = stemIt(test2)
    print("done stemming tweets")

    regTrain = processIt(train)
    regTest1 = processIt(test1)
    regTest2 = processIt(test2)

    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

    X = vectorizer.fit_transform(regTrain)
    Xtest1 = vectorizer.transform(regTest1)
    Xtest2 = vectorizer.transform(regTest2)
    scipy.io.mmwrite('train_reg_dataM', X, field='real')
    scipy.io.mmwrite('test1_reg_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_reg_dataM', Xtest2, field='real')

    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

    X = vectorizer.fit_transform(stemmedTrain)
    Xtest1 = vectorizer.transform(stemmedTest1)
    Xtest2 = vectorizer.transform(stemmedTest2)
    scipy.io.mmwrite('train_stem_dataM', X, field='real')
    scipy.io.mmwrite('test1_stem_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_stem_dataM', Xtest2, field='real')
Author: Krimit, Project: ml_project, Lines: 29, Source: ingest.py
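The saved Matrix Market files can be read back with scipy.io.mmread (mmwrite appends the .mtx extension when it is missing). A short sketch, assuming the files written above exist:

import scipy.io

X = scipy.io.mmread('train_reg_dataM.mtx').tocsr()       # mmread returns a COO matrix
Xtest1 = scipy.io.mmread('test1_reg_dataM.mtx').tocsr()
print(X.shape, Xtest1.shape)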


Note: The sklearn.feature_extraction.text.TfidfVectorizer.transform examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright remains with the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.