This page collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.fit. If you have been wondering what exactly TfidfVectorizer.fit does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of the class the method belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
A total of 15 code examples of TfidfVectorizer.fit are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; that feedback helps surface better Python code samples.
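Before the collected examples, here is a minimal, self-contained sketch of the basic pattern (the toy corpus is made up purely for illustration): fit learns the vocabulary and IDF weights from a corpus, and transform then maps documents onto that fixed vocabulary as a sparse tf-idf matrix.

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, invented only to show the call pattern
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs can be friends",
]

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(corpus)            # learn vocabulary and IDF weights from the corpus
X = vectorizer.transform(corpus)  # sparse matrix of shape (n_documents, n_terms)
print(X.shape)
print(sorted(vectorizer.vocabulary_))  # the terms learned during fit

Calling fit_transform(corpus) is equivalent to the two calls above when the same documents are both fitted and transformed.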
Example 1: train_and_predict_m3

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m3(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM3, stemmer_type = 'porter')
    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200], 'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
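Several of the train_and_predict_m* examples on this page call a project-specific perform_grid_search helper that is not shown, guarded by a gridSearch flag, and note that parameters are optimized for quadratic weighted kappa. As a rough, hypothetical reconstruction (not the original author's code), such a helper could be written with GridSearchCV and a kappa scorer along these lines:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer

def perform_grid_search(clf, param_grid, X, labels):
    # Quadratic weighted kappa, the metric these examples say they optimize for
    qwk_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
    grid = GridSearchCV(clf, param_grid, scoring=qwk_scorer, cv=5, n_jobs=-1)
    grid.fit(X, labels)
    print("Best QWK: %.4f with %s" % (grid.best_score_, grid.best_params_))
    return grid.best_estimator_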
Example 2: MedicalKeywordTfIdf

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
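MedicalKeywordTfIdf turns a block of 48 binary Medical_Keyword_* columns into space-joined pseudo-documents of column names and tf-idf encodes them. A quick usage sketch with a made-up DataFrame follows; note that TfidfVectorizer.get_feature_names, which the class relies on, was removed in scikit-learn 1.2 in favour of get_feature_names_out, so the class as written targets an older release.

import numpy as np
import pandas as pd

# Made-up frame with the 48 binary keyword columns the transformer expects
rng = np.random.RandomState(0)
columns = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(rng.randint(0, 2, size=(100, 48)), columns=columns)

transformer = MedicalKeywordTfIdf()
features = transformer.fit(df).transform(df)
print(features.shape)  # one dense row of keyword tf-idf weights per record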
Example 3: compute_tf_idf_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories", save_path="exsum/tf_idf_vectorizer_200_05.pickle", min_df=200, max_df=0.5):
    """
    Fit a TfidfVectorizer on the story corpus and pickle it.
    Params:
        data_path: data directory
        save_path: where the fitted vectorizer is saved; the suffix 200_05 means min_df=200, max_df=0.5 (fraction of documents)
        min_df: lower document-frequency bound
        max_df: upper document-frequency bound
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content[-1] != ".":
                content += "."
            content_str += " " + content
        documents.append(content_str)
    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)
    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer, f)
    print("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary_))
Example 4: num_feat_select

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def num_feat_select(n, k):
    tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode',
                            tokenizer=MyTokenizer(), analyzer='word')
    tfidf.fit(train['tweet'])
    trainf = tfidf.transform(train['tweet'])
    testf = tfidf.transform(test['tweet'])
    trainlab = np.array(train.ix[:, 4:])
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(trainf, trainlab)
    print('here')
    tim = time.time()
    n = 10
    pred = []
    for i in range(0, n):
        pred.extend(knn.predict(testf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    print("time: " + str(time.time() - tim))
    # RMSE:
    testlab = np.array(test.ix[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) / (testf.shape[0] * 24.0))))
    print(err)
Example 5: processEssay

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def processEssay(self, testidx, trainidx):
    # process essay
    self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
    self.trdata = self.rawdata['essay'].ix[trainidx]
    self.testdata = self.rawdata['essay'].ix[testidx]
    trainessay = np.array(self.trdata.fillna('Missing'))
    testessay = np.array(self.testdata.fillna('Missing'))
    tfidfEs = TfidfVectorizer(min_df=4, max_features=500)
    tfidfEs.fit(trainessay)
    #=======================================================================
    # # process need statement
    # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
    # self.trdata = self.rawdata['need_statement'].ix[trainidx]
    # self.testdata = self.rawdata['need_statement'].ix[testidx]
    # trainneedst = np.array(self.trdata.fillna('Missing'))
    # testneedst = np.array(self.testdata.fillna('Missing'))
    # tfidfNs = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfNs.fit(trainneedst)
    #
    # # process short desc
    # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
    # self.trdata = self.rawdata['short_description'].ix[trainidx]
    # self.testdata = self.rawdata['short_description'].ix[testidx]
    # trainshortd = np.array(self.trdata.fillna('Missing'))
    # testshortd = np.array(self.testdata.fillna('Missing'))
    # tfidfSd = TfidfVectorizer(min_df=3, max_features=20)
    # tfidfSd.fit(trainshortd)
    #
    # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay), tfidfNs.transform(trainneedst), tfidfSd.transform(trainshortd)))
    # self.exdata_test = sp.hstack((tfidfEs.transform(testessay), tfidfNs.transform(testneedst), tfidfSd.transform(testshortd)))
    #=======================================================================
    self.exdata_train = tfidfEs.transform(trainessay)  # only use the essay
    self.exdata_test = tfidfEs.transform(testessay)
Example 6: _train

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def _train(self, train_data, resources):
    sample_length = len(train_data)
    dict_status_path = os.path.join(root_dic,
                                    'dict_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(dict_status_path):
        dictVectorizer = joblib.load(dict_status_path)
    else:
        dictVectorizer = DictVectorizer()
        dictVectorizer.fit(train_data[self.features].fillna(0).to_dict('record'))
        joblib.dump(dictVectorizer, dict_status_path)
    tfidf_status_path = os.path.join(root_dic,
                                     'tfidf_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(tfidf_status_path):
        tfidf = joblib.load(tfidf_status_path)
    else:
        tfidf = TfidfVectorizer(min_df=40, max_features=300)
        tfidf.fit(train_data.essay)
        joblib.dump(tfidf, tfidf_status_path)
    resources['dictVectorizer'] = dictVectorizer
    resources['tfidf'] = tfidf
    print('Head Processing Completed')
    return train_data, resources
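The _train helper only caches the fitted vectorizers (with joblib) and registers them in resources. A companion step at prediction time (hypothetical, not part of the snippet) would reuse them so that new data is transformed with the vocabulary learned during training, for example:

import scipy.sparse as sp

# Hypothetical companion to _train: transform a new data frame with the cached vectorizers
def _prepare_features(self, data, resources):
    dense_dict_part = resources['dictVectorizer'].transform(
        data[self.features].fillna(0).to_dict('records'))
    essay_tfidf_part = resources['tfidf'].transform(data.essay)
    return sp.hstack([dense_dict_part, essay_tfidf_part])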
Example 7: __init__

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
class TfidfBuilder:
    def __init__(self, filtered_out_words=[]):
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.get_tokens)
        self.filtered_out_words = filtered_out_words

    def filter(self, word):
        result = True
        if word in self.filtered_out_words:
            result = False
        return result

    def get_tokens(self, text):
        all_tokens = nltk.word_tokenize(text)
        filtered_tokens = [word for word in all_tokens if self.filter(word)]
        lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in filtered_tokens]
        return lemmatized_tokens

    def to_tfidf(self, documents):
        self.tfidf.fit(documents)
        return self.tfidf

    def to_tfidf_vector(self, document):
        return self.tfidf.transform([document]).toarray()
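A short usage sketch for TfidfBuilder (the sentences are made up; word_tokenize and WordNetLemmatizer need the NLTK punkt and wordnet resources, and resource names can vary slightly between NLTK versions):

import nltk

# One-time downloads for the tokenizer and lemmatizer (safe to re-run)
nltk.download('punkt')
nltk.download('wordnet')

builder = TfidfBuilder(filtered_out_words=['the', 'a', 'an'])
fitted = builder.to_tfidf(["The cats are sleeping.", "A dog barks at the cats."])
vector = builder.to_tfidf_vector("sleeping dogs")  # 1 x vocabulary-size dense array
print(vector.shape)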
Example 8: vectorize

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def vectorize(data, new_doc, local=False):
    """
    Vectorize the data as described in the file docstring.
    """
    # Generator for all glossaries
    glossaries = lambda: (data.tag_glossary(t) for t in data.tags())
    # Create the bag-of-words descriptors for each glossary
    vectorizer = TfidfVectorizer(use_idf=True)
    vectorizer.fit(glossaries())
    tag_bows = dict(zip(data.tags(), vectorizer.transform(glossaries())))
    # Count the number of occurrences of each tag
    tag_counter = Counter()
    for i in data.items():
        tag_counter.update(data.item(i)['tags'])
    # Generator for lists of tags for each item
    item_tags = (data.item(i)['tags'] for i in data.items())
    # The number of dimensions in the bow vector
    v_dim = len(vectorizer.get_feature_names())
    # lambda function to create descriptors
    create_desc = lambda x: create_descriptor(x, tag_bows, tag_counter,
                                              v_dim, len(data.data['items']))
    # Create descriptors for all known documents and the new document
    item_descriptors = [create_desc(tags) for tags in item_tags]
    new_doc_descriptor = create_desc(new_doc['tags'])
    # For analysis or use in other vectorizers, also return the vectorizer itself
    if local:
        return (zip(data.items(), item_descriptors), new_doc_descriptor, vectorizer)
    # Associate document ids with descriptors and return.
    return (zip(data.items(), item_descriptors), new_doc_descriptor)
Example 9: tfIDFeats

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def tfIDFeats(ids, data):
    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
                          stop_words='english')
    # Fit TFIDF
    tfv.fit(data)
    X = tfv.transform(data)
    # Initialize SVD
    svd = TruncatedSVD(n_components=350)
    # Initialize the standard scaler
    scl = StandardScaler(with_mean=False)
    if X.shape[1] > 350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X, ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X, ids)
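The tf-idf, TruncatedSVD, and scaling sequence above is essentially latent semantic analysis. A rough pipeline equivalent (illustrative only; unlike the snippet, it always applies the SVD step instead of checking the feature count first) could look like this:

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# Rough LSA-style equivalent of tfIDFeats as a single pipeline
lsa = make_pipeline(
    TfidfVectorizer(min_df=3, ngram_range=(1, 5), sublinear_tf=True, stop_words='english'),
    TruncatedSVD(n_components=350),
    StandardScaler(with_mean=False),
)
# reduced = lsa.fit_transform(documents)  # documents: an iterable of raw text strings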
Example 10: train_and_predict_m7

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m7(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Passive-Aggressive Classifier...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 11: train_and_predict_m8

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m8(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM7, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Ridge Classifier...")
    clf = RidgeClassifier(class_weight = 'auto', alpha = 1, normalize = True)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha' : [0.1, 0.3, 1, 3, 10], 'normalize' : [True, False]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 12: train_and_predict_m6

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m6(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM6, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p = 2, n_neighbors = 5)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors' : [3, 4, 5, 6, 7], 'weights' : ['uniform', 'distance'], 'leaf_size' : [1, 3, 5, 10]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 13: train_and_predict_m5

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m5(train, test, labels):
    # Beautiful soup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True, pretag = 'full')
    testData = modified_cleanup(test, stemmer, is_train = False, pretag = 'full')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    print("Fitting Multinomial Naive Bayes...")
    clf = MultinomialNB(alpha = 0.03)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # param_grid = {'alpha' : [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha' : [0.01, 0.03]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 14: train_and_predict_m4

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def train_and_predict_m4(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM4, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    clf = LogisticRegression(random_state = randomState, penalty = 'l2', C = 12, class_weight = 'auto')

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    #param_grid = {'C' : [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty' : ['l2']}
    param_grid = {'C' : [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty' : ['l2']}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
Example 15: create_vectorizer

# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import fit [as alias]
def create_vectorizer(self, names):
    # create the transform
    vectorizer = TfidfVectorizer(stop_words='english')
    # tokenize and build vocab
    vectorizer.fit(names)
    return vectorizer
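Usage follows the same fit-then-transform pattern as the other examples. Assuming obj is an instance of the class that defines create_vectorizer, and with made-up name strings:

# Build the vectorizer from training names, then map new names onto the learned vocabulary
vectorizer = obj.create_vectorizer(["Alpha General Hospital", "Beta Clinic", "Gamma Medical Center"])
matrix = vectorizer.transform(["Beta Clinic annex"])
print(matrix.shape)  # (1, number of vocabulary terms)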