

Python TfidfVectorizer.fit Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.fit. If you are unsure what TfidfVectorizer.fit does or how to use it, the curated code examples below may help. You can also explore further usage examples of the enclosing class, sklearn.feature_extraction.text.TfidfVectorizer.


The following 15 code examples of TfidfVectorizer.fit are shown below, ordered by popularity.
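As a primer, here is a minimal sketch of the fit/transform contract: fit learns the vocabulary and IDF weights from a corpus, and transform maps documents onto that fixed vocabulary (the two toy documents are made up for illustration).

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog chased the cat"]

vec = TfidfVectorizer()
vec.fit(docs)            # learn the vocabulary and IDF weights from the corpus
X = vec.transform(docs)  # encode documents against that fixed vocabulary
print(X.shape)           # (2, n_features): one sparse row per document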

Example 1: train_and_predict_m3

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m3 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM3, stemmer_type = 'porter')

    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
				
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    # Note: n_iter and class_weight = 'auto' are older scikit-learn parameters
    # (renamed max_iter and 'balanced' in later releases)
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200],  'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 33 | Source: classifier.py
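Example 1 calls a project helper, perform_grid_search, that is not shown in the excerpt. As a hedged sketch only (not the project's actual code), such a helper could be built from GridSearchCV with a quadratic-weighted-kappa scorer, matching the comment above:

from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV

def perform_grid_search(clf, param_grid, X, labels):
    # Hypothetical reconstruction of the helper defined elsewhere in the project
    kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
    search = GridSearchCV(clf, param_grid, scoring=kappa_scorer, cv=5, n_jobs=-1)
    search.fit(X, labels)
    print(search.best_params_, search.best_score_)
    return search.best_estimator_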

Example 2: MedicalKeywordTfIdf

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer


class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        # One space-separated "document" per row, naming the keyword columns set to 1
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)

        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)

        return self._vec.transform(data_arr).toarray()
Author: haisland0909 | Project: PrudentialLifeInsuranceAssessment | Lines: 27 | Source: features.py
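A quick usage sketch for the transformer above, with a hypothetical toy DataFrame of the 48 binary Medical_Keyword_i columns it expects:

import pandas as pd

cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(0, index=range(3), columns=cols)
df.loc[0, "Medical_Keyword_3"] = 1
df.loc[1, ["Medical_Keyword_3", "Medical_Keyword_7"]] = 1

feats = MedicalKeywordTfIdf().fit(df).transform(df)
print(feats.shape)  # (3, 1): only the keyword present in >= 2 rows survives min_df=2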

Example 3: compute_tf_idf_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
# Note: also requires `import pickle` and `from nltk.corpus import stopwords`;
# loadData is a project-specific helper.
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories",
                              save_path="exsum/tf_idf_vectorizer_200_05.pickle",
                              min_df=200, max_df=0.5):
    """
    Params:
        data_path: data directory
        save_path: where the fitted vectorizer is pickled; the suffix 200_05
                   encodes min_df=200, max_df=0.5
        min_df: lower document-frequency bound
        max_df: upper document-frequency bound (fraction of the documents)
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content and content[-1] != ".":  # guard against empty strings
                content += "."
            content_str += " " + content
        documents.append(content_str)

    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df,min_df=min_df,stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)

    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer,f)

    print ("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary))
Author: lngvietthang | Project: das | Lines: 29 | Source: utils.py
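The point of pickling the fitted vectorizer is that it can be reloaded later and applied to new text without refitting; a short sketch, assuming the pickle written above exists:

import pickle

with open("exsum/tf_idf_vectorizer_200_05.pickle", "rb") as f:
    tf_idf_vectorizer = pickle.load(f)

vectors = tf_idf_vectorizer.transform(["a new document to score ."])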

Example 4: num_feat_select

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def num_feat_select(n, k):
    tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode',
                            tokenizer=MyTokenizer(), analyzer='word')

    tfidf.fit(train['tweet'])
    trainf = tfidf.transform(train['tweet'])
    testf = tfidf.transform(test['tweet'])
    trainlab = np.array(train.iloc[:, 4:])  # .iloc replaces the removed .ix
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(trainf, trainlab)
    print('here')
    tim = time.time()

    # Predict in chunks of 1000 rows to bound memory use
    n_chunks = 10  # renamed from n, which shadowed the max_features parameter
    pred = []
    for i in range(n_chunks):
        pred.extend(knn.predict(testf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    print("time: " + str(time.time() - tim))

    # RMSE over all 24 label columns
    testlab = np.array(test.iloc[:, 4:])
    err = np.sqrt(np.sum((np.array(pred) - testlab) ** 2) / (testf.shape[0] * 24.0))
    print(err)
Author: jefftn | Project: kaggle-twitter | Lines: 27 | Source: knn.py

Example 5: processEssay

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
    def processEssay(self, testidx, trainidx):
        # Process the essay field (.loc replaces the removed .ix; use .iloc
        # instead if trainidx/testidx are positional rather than label indices)
        self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
        self.trdata = self.rawdata['essay'].loc[trainidx]
        self.testdata = self.rawdata['essay'].loc[testidx]
        trainessay = np.array(self.trdata.fillna('Missing'))
        testessay = np.array(self.testdata.fillna('Missing'))
        tfidfEs = TfidfVectorizer(min_df=4, max_features=500)
        tfidfEs.fit(trainessay)
        #=======================================================================
        # #process need statement
        # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
        # self.trdata = self.rawdata['need_statement'].loc[trainidx]
        # self.testdata = self.rawdata['need_statement'].loc[testidx]
        # trainneedst = np.array(self.trdata.fillna('Missing'))
        # testneedst = np.array(self.testdata.fillna('Missing'))
        # tfidfNs = TfidfVectorizer(min_df=3, max_features=20)
        # tfidfNs.fit(trainneedst)
        #
        # #process short desc
        # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
        # self.trdata = self.rawdata['short_description'].loc[trainidx]
        # self.testdata = self.rawdata['short_description'].loc[testidx]
        # trainshortd = np.array(self.trdata.fillna('Missing'))
        # testshortd = np.array(self.testdata.fillna('Missing'))
        # tfidfSd = TfidfVectorizer(min_df=3, max_features=20)
        # tfidfSd.fit(trainshortd)
        #
        # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay), tfidfNs.transform(trainneedst), tfidfSd.transform(trainshortd)))
        # self.exdata_test = sp.hstack((tfidfEs.transform(testessay), tfidfNs.transform(testneedst), tfidfSd.transform(testshortd)))
        #=======================================================================
        self.exdata_train = tfidfEs.transform(trainessay)  # only the essay is used
        self.exdata_test = tfidfEs.transform(testessay)
Author: thusithaC | Project: KDD2014 | Lines: 35 | Source: dataProcess.py
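The commented-out section hints at combining several per-field vectorizers with scipy.sparse.hstack. A self-contained toy sketch of that pattern (the field values below are made up):

import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

essays = np.array(["needs books for class", "art supplies wanted"])
needs = np.array(["books", "supplies"])

tfidf_essay = TfidfVectorizer().fit(essays)
tfidf_need = TfidfVectorizer().fit(needs)

# Stack the two sparse feature blocks side by side, one row per sample
combined = sp.hstack((tfidf_essay.transform(essays), tfidf_need.transform(needs)))
print(combined.shape)  # (2, n_essay_terms + n_need_terms)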

Example 6: _train

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
    def _train(self, train_data, resources):
        sample_length = len(train_data)
        dict_status_path = os.path.join(root_dic,
                                        'dict_vectorizer_{}.status'.
                                        format(sample_length))
        if os.path.isfile(dict_status_path):
            dictVectorizer = joblib.load(dict_status_path)
        else:
            dictVectorizer = DictVectorizer()
            dictVectorizer.fit(train_data[self.features].
                               fillna(0).
                               to_dict('records'))
            joblib.dump(dictVectorizer, dict_status_path)

        tfidf_status_path = os.path.join(root_dic,
                                         'tfidf_vectorizer_{}.status'.
                                         format(sample_length))
        if os.path.isfile(tfidf_status_path):
            tfidf = joblib.load(tfidf_status_path)
        else:
            tfidf = TfidfVectorizer(min_df=40, max_features=300)
            tfidf.fit(train_data.essay)
            joblib.dump(tfidf, tfidf_status_path)

        resources['dictVectorizer'] = dictVectorizer
        resources['tfidf'] = tfidf
        print('Head Processing Completed')
        return train_data, resources
Author: yelite | Project: KDD2014 | Lines: 30 | Source: decomposition.py

Example 7: __init__

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


class TfidfBuilder:

    def __init__(self, filtered_out_words=None):
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.get_tokens)
        # None instead of a mutable [] default avoids shared-state surprises
        self.filtered_out_words = filtered_out_words or []

    def filter(self, word):
        return word not in self.filtered_out_words

    def get_tokens(self, text):
        all_tokens = nltk.word_tokenize(text)
        filtered_tokens = [word for word in all_tokens if self.filter(word)]
        lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in filtered_tokens]
        return lemmatized_tokens

    def to_tfidf(self, documents):
        self.tfidf.fit(documents)
        return self.tfidf

    def to_tfidf_vector(self, document):
        return self.tfidf.transform([document]).toarray()
Author: josip-u | Project: alien-vs-sex-predator | Lines: 27 | Source: tfidf_builder.py
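A usage sketch for the builder above; it assumes the NLTK punkt and wordnet data packages are installed, and the example sentences are made up:

builder = TfidfBuilder(filtered_out_words=["the", "a"])
tfidf = builder.to_tfidf(["The cats are sleeping.", "A dog barks at cats."])
vector = builder.to_tfidf_vector("sleeping dogs")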

Example 8: vectorize

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(data, new_doc, local = False):
    """
    Vectorize the data as described in file docstring.
    """
    # Generator for all glossaries
    glossaries = lambda: (data.tag_glossary(t) for t in data.tags())

    # Create the bag of words descriptors for each glossary
    vectorizer = TfidfVectorizer(use_idf=True)
    vectorizer.fit(glossaries())
    tag_bows = dict(zip(data.tags(), vectorizer.transform(glossaries())))

    # Count the number of occurrences for each tag
    tag_counter = Counter()
    for i in data.items():
        tag_counter.update(data.item(i)['tags'])

    # Generator for lists of tags for each item
    item_tags = (data.item(i)['tags'] for i in data.items())

    # The number of dimensions in the bow vector
    v_dim = len(vectorizer.get_feature_names())
    # lambda function to create descriptors
    create_desc = lambda x: create_descriptor(x, tag_bows, tag_counter, 
                                              v_dim, len(data.data['items']))

    # Create descriptors for all known documents and new document
    item_descriptors = [create_desc(tags) for tags in  item_tags]
    new_doc_descriptor = create_desc(new_doc['tags'])
    
    # For analysis or use in other vectorizers, also return the vectorizer itself
    if local:
        return (zip(data.items(), item_descriptors), new_doc_descriptor, vectorizer)

    # Associate document ids with descriptors and return.
    return (zip(data.items(), item_descriptors), new_doc_descriptor)
Author: PerceptumNL | Project: TweedejaarsProject | Lines: 37 | Source: weighted_tagvectorizer.py

Example 9: tfIDFeats

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def tfIDFeats(ids, data):
    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
            stop_words='english')
    # Fit TF-IDF
    tfv.fit(data)
    X = tfv.transform(data)

    # Initialize SVD (applied below only when the vocabulary is large enough)
    svd = TruncatedSVD(n_components=350)

    # Initialize the standard scaler (with_mean=False keeps sparse input sparse;
    # ids passed as y is ignored by StandardScaler)
    scl = StandardScaler(with_mean=False)

    if X.shape[1] > 350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
Author: mostafaelaraby | Project: articles-clustering | Lines: 28 | Source: clusterRelated.py
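The same TF-IDF, SVD, scaling chain can be expressed as a single scikit-learn Pipeline. This is only a sketch: the original function applies SVD conditionally (only when more than 350 features exist), which a fixed pipeline does not replicate. TruncatedSVD is used rather than PCA because it accepts the sparse TF-IDF matrix directly:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lsa_pipeline = make_pipeline(
    TfidfVectorizer(min_df=3, ngram_range=(1, 5), sublinear_tf=True, stop_words='english'),
    TruncatedSVD(n_components=350),
    StandardScaler(with_mean=False),
)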

Example 10: train_and_predict_m7

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m7 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Passive-Aggressive Classifer...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 29 | Source: classifier.py

Example 11: train_and_predict_m8

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m8 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Ridge Classifer...")
    clf = RidgeClassifier(class_weight = 'auto', alpha = 1, normalize = True)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha' : [0.1, 0.3, 1, 3, 10], 'normalize' : [True, False]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 27 | Source: classifier.py

Example 12: train_and_predict_m6

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m6 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM6, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p = 2, n_neighbors = 5)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors' : [3, 4, 5, 6, 7], 'weights' : ['uniform', 'distance'], 'leaf_size' : [1, 3, 5, 10] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 28 | Source: classifier.py

Example 13: train_and_predict_m5

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m5 (train, test, labels) :
    # Beautiful soup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True, pretag = 'full')
    testData = modified_cleanup(test, stemmer, is_train = False, pretag = 'full')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Multinominal Naive Bayes...")
    clf = MultinomialNB(alpha = 0.03)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # param_grid = {'alpha' : [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha' : [0.01, 0.03]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 30 | Source: classifier.py

Example 14: train_and_predict_m4

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
def train_and_predict_m4 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM4, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode',
                          analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6),
                          smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = LogisticRegression(random_state = randomState, penalty = 'l2', C = 12, class_weight = 'auto')
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    #param_grid = {'C' : [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty' : ['l2']}
    param_grid = {'C' : [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty' : ['l2']}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Author: sathishrvijay | Project: Kaggle-CrowdFlowerSRR | Lines: 27 | Source: classifier.py

Example 15: create_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer
    def create_vectorizer(self, names):
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english')
        # tokenize and build vocab
        vectorizer.fit(names)

        return vectorizer
Author: Rosdex | Project: rest-category-classificator-study | Lines: 9 | Source: classificator_study_module.py
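create_vectorizer is a thin wrapper around TfidfVectorizer; a standalone sketch of the same steps, with made-up product names:

from sklearn.feature_extraction.text import TfidfVectorizer

names = ["organic green tea", "black tea 100 bags", "stainless steel kettle"]
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(names)             # tokenize and build the vocabulary
X = vectorizer.transform(names)   # sparse TF-IDF matrix over that vocabulary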


Note: The sklearn.feature_extraction.text.TfidfVectorizer.fit method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from community-contributed open-source projects; copyright of the source code remains with the original authors, and redistribution or use should follow the corresponding project's License. Do not reproduce without permission.