This article collects typical usage examples of the Python class sklearn.feature_extraction.text.TfidfVectorizer. If you are wondering what TfidfVectorizer does, how to use it, or what it looks like in real code, the selected examples below may help.
Fifteen code examples of the TfidfVectorizer class are shown below, ordered by popularity by default.
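Before the collected examples, here is a minimal, self-contained sketch of the basic TfidfVectorizer workflow. The corpus is made up for illustration; get_feature_names_out is the vocabulary accessor in recent scikit-learn releases (older versions expose get_feature_names instead).

from sklearn.feature_extraction.text import TfidfVectorizer

# A toy corpus, purely for illustration.
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs and cats are pets",
]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf_matrix = vectorizer.fit_transform(corpus)    # sparse matrix, shape (n_docs, n_terms)

print(tfidf_matrix.shape)
print(vectorizer.get_feature_names_out())          # vocabulary learned from the corpus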
Example 1: genfeature
def genfeature(self, ls_x):
    '''
    a. Shallow features
    1. number of words in the sentence (normalized)
    2. average number of characters in the words
    3. percentage of stop words
    4. minimum, maximum and average inverse document frequency
    :param ls_x: sentences X without labels
    :return: list of feature vectors, one per sentence
    '''
    vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True, sublinear_tf=False,
                                 use_idf=True)
    tfidf = vectorizer.fit_transform(ls_x)
    array = tfidf.toarray()
    X = []
    append = X.append
    maxtoken = 0
    for idx, l in enumerate(ls_x):
        ws = l.split()
        maxtoken = max(len(ws), maxtoken)
        stops = 0.0
        try:
            # fraction of tokens that are stop words (on Python 3, reduce must be imported from functools)
            stops = round(reduce(lambda x, y: x + 1 if y in self.tweetmanager.stop else x, ws, 0) / (len(ws) + 1e-10), 2)
        except Exception:
            pass
        append([len(ws), self.avgch(ws), stops,
                min(array[idx]), max(array[idx]), sum(array[idx]) / len(array[idx])])
    # normalize the word count by the longest sentence seen
    return [[round(x[0] * 1.0 / maxtoken, 2)] + x[1:] for x in X]
Example 2: get_data
def get_data(self, abstract=False):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    if abstract:
        only_text = self.get_data_with_abstract(data)
    else:
        only_text = [doc['text'] for doc in data]
    only_labels = [doc['label'] for doc in data]
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                       max_features=200000,
                                       min_df=2,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print 'After tfidf vectorizer: found %s documents and %s terms' \
        % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])
    dict_out = {}
    for l in sorted(set(only_labels)):
        dict_out[l] = {
            'docs': [],
            'fscore': ''
        }
    for doc in data:
        dict_out[doc['label']]['docs'].append(doc['id_doc'])
    return tfidf_matrix, dict_out
Example 3: tfidf_ize
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    vectorizer.fit(node_info.abstract.as_matrix())
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1)  # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
            + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    return train, test
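The "intersection kernel" used above compares two TF-IDF vectors by summing their element-wise minimums; large values mean the two texts share heavily weighted terms. A small sketch of the same idea on two made-up abstracts (scipy sparse matrices provide the .minimum method used here):

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical abstracts, for illustration only.
docs = ["graph neural networks for citation prediction",
        "citation prediction with graph embeddings"]

vec = TfidfVectorizer(ngram_range=(1, 1)).fit(docs)
a, b = vec.transform([docs[0]]), vec.transform([docs[1]])

# element-wise minimum of the two sparse rows, summed into a single similarity score
intersection = a.minimum(b).sum()
print(intersection)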
Example 4: t_test_accuracy
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
    """ Test if accuracy for estimators with given parameters is
    significantly better than that of the first estimator in the tuple
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    vectorizer = TfidfVectorizer()
    text_similarity = cosine_similarity(vectorizer.fit_transform(texts))
    accuracy_arrays = []
    for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
        stop_idx = votes_per_doc * len(texts)
        # Now get n_runs accuracies and put them into numpy arrays
        accuracies = Parallel(n_jobs=4)(delayed(get_accuracy_sequence)(estimator, stop_idx, texts,
            vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs))
        accuracy_arrays.append(np.array(filter(lambda x: x is not None, accuracies)))
    # Baseline
    result_row = []
    result_row.append("%0.2f" % np.mean(accuracy_arrays[0]))
    # T-tests
    for accuracy_array in accuracy_arrays[1:]:
        _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
        significance_indicator = lambda p: "*" if p < 0.01 else " "
        is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
        result_row.append("%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better))
    return "|".join(result_row)
Example 5: tfidf
def tfidf(synopses):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2,
                                       stop_words='english', use_idf=True,
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms = tfidf_vectorizer.get_feature_names()
    print("terms:", terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the vocabulary and the TF-IDF matrix
Example 6: read_examples
def read_examples(filename, sparm):
    """Parses an input file into an example sequence."""
    # This reads example files of the type read by SVM^multiclass.
    examples = []
    text = []
    count = 0
    # Open the file and read each example.
    for line in file(filename):
        # Get rid of comments.
        if line.find('#'): line = line[:line.find('#')]
        target, tokens = line.split('::')[0], line.split('::')[1:]
        # If the line is empty, who cares?
        if not tokens: continue
        # Get the target.
        text.append(target)
        # Get the features.
        tokens = [t.split(':') for t in tokens]
        features = [(0, 1)] + [(int(k), float(v)) for k, v in tokens]
        # Add the example to the list
        examples.append((svmapi.Sparse(features), count))
        count += 1
    # Print out some very useful statistics.
    vectorizer = TfidfVectorizer(stop_words='english')
    global tf_idf_transformed_matrix
    tf_idf_transformed_matrix = vectorizer.fit_transform(text)
    print len(examples), 'examples read'
    return examples
Example 7: get_features
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)
    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)
    # Calculates the n most important topics of the bodies. Each topic contains all words, ordered by
    # importance. The more important topic words of a certain topic a body contains, the higher its
    # value for that topic.
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)
    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))
    print("latent_dirichlet_allocation_cos: transform head")
    # apply the LDA trained on body topics to the headlines => if the headlines and bodies share topics,
    # their vectors should be similar
    lda_head_matrix = lda_body.transform(X_train_head)
    #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)
    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
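The function above depends on module-level names (headlines, bodies, n_topics) and on the older n_topics argument, which newer scikit-learn releases spell n_components. Below is a minimal, self-contained sketch of the same head/body topic-similarity idea, using made-up data:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Hypothetical toy data: one headline paired with one body.
headlines = ["stocks rally after earnings report"]
bodies = ["the company reported strong earnings and its stock rallied sharply"]

vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
X_body = vectorizer.fit_transform(bodies)           # learn the vocabulary on the bodies
X_head = vectorizer.transform(headlines)            # reuse it for the headlines

lda = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=0)
body_topics = lda.fit_transform(X_body)             # topic distribution per body
head_topics = lda.transform(X_head)                 # same topic space for the headlines

# One cosine distance per headline/body pair; small values suggest shared topics.
dists = [cosine_distances(h.reshape(1, -1), b.reshape(1, -1))[0, 0]
         for h, b in zip(head_topics, body_topics)]
print(dists)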
Example 8: gen_data
def gen_data(self, fname):
    """
    :fname : input file, one data point per line
    :rtype : List[List[float]]: data matrix
    """
    lines = [self.langConvHandler.convert(line.strip().lower())
             for line in codecs.open(fname, "rb", "utf-8") if len(line) > 6]
    # lines = list(set(lines))  # remove duplicates
    logging.info("number of data %d " % len(lines))
    cut_lines = [" ".join(jieba.cut(line)) for line in lines]
    # transform to a TF-IDF matrix
    tfidfVec = TfidfVectorizer(max_features=3000)
    tfidf_data = tfidfVec.fit_transform(cut_lines)
    tfidf_data = tfidf_data.toarray()
    # save the original text
    with open("./output/origin_lines.txt", "wb") as fw:
        json.dump(lines, fw)
    # save the vectorized data
    np.save("./output/tfidf.corpus.npy", tfidf_data)
    self.lines = lines
    self.tfidf_data = tfidf_data
Example 9: simple_tfidf_alldocs
def simple_tfidf_alldocs():
    qs = Posts.objects.all()
    docs, post_index_map = vectorize_docs(n_samples=n_samples, log_batch_size=log_batch_size, qs=qs)  # get the doc bodies
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=n_features_init,
                                       ngram_range=(1, n_gram), max_df=0.8)
    tfidf_matrix_raw = tfidf_vectorizer.fit_transform(docs)  # docs x n-gram features
    tfidf_matrix_scaled = scale(tfidf_matrix_raw, with_mean=False)  # can't use sparse matrices unless with_mean=False
    return tfidf_matrix_scaled, post_index_map
Example 10: train_classifier
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier."""
    if download:
        download_wiki()
    path = os.path.join(l.TOPDIR, 'train.json')
    training_set = json.load(open(path))
    path = os.path.join(l.TOPDIR, 'wiki.json')
    wiki_set = json.load(open(path))
    target_names = list(set([i['unit'] for i in training_set + wiki_set]))
    train_data, train_target = [], []
    for example in training_set + wiki_set:
        train_data.append(clean_text(example['text']))
        train_target.append(target_names.index(example['unit']))
    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')
    matrix = tfidf_model.fit_transform(train_data)
    if parameters is None:
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}
    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    pickle.dump(obj, open(path, 'w'))
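A rough sketch of how the pickled object produced above might be loaded and used for prediction. The l.TOPDIR path, the clean_text helper, and the pickle layout are assumed to match the function above:

import os
import pickle

# Assumed: same directory layout and module context as train_classifier above.
path = os.path.join(l.TOPDIR, 'clf.pickle')
with open(path, 'rb') as f:
    obj = pickle.load(f)

def predict_unit(text):
    """Return the predicted unit name for a piece of text (assumed pickle layout)."""
    features = obj['tfidf_model'].transform([clean_text(text)])
    label_index = obj['clf'].predict(features)[0]
    return obj['target_names'][label_index]

print(predict_unit("the river is three miles long"))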
Example 11: get_top_terms
def get_top_terms(self, stops=STOPS):
    # vectorize using 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 3))
    tfidf = vectorizer.fit_transform(self.docs)
    # enumerate feature names, i.e. the actual words
    self.feature_names = vectorizer.get_feature_names()
    # convert to a dense array
    dense = tfidf.todense()
    # container for top terms per doc
    self.features = []
    for doc in dense:
        doc = doc.tolist()[0]
        # creates a list of tuples, (term_id, score)
        phrase_scores = [pair for pair in zip(range(0, len(doc)), doc) if pair[1] > 0]
        # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)
        doc_features = []
        for f_ in phrase_scores:
            fname = self.feature_names[f_[0]]
            fscore = f_[1]
            doc_features.append((fscore, fname))
        top_terms = sorted(doc_features, reverse=True)  # [:n_terms]
        # top_terms = ",".join([x[1] for x in top_terms])
        self.features.append(top_terms)
Example 12: classify
def classify(clf, chapter_contents_train, y_train, chapter_contents_test, k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array)  # dbg
    # print "X_train_array[0] length: ", len(X_train_array[0])  # dbg
    # use only the best k features according to chi-squared selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)
    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names, chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]
    # train the classifier
    clf.fit(X_train, y_train)
    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)
    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
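A hypothetical call to the function above, with a multinomial Naive Bayes classifier and made-up chapter texts; it assumes a scikit-learn release where vectorizer.get_feature_names() (used inside classify) is still available:

from sklearn.naive_bayes import MultinomialNB

# Made-up training/test data for illustration only.
train_texts = ["the hero meets the wizard", "a battle rages in the valley",
               "the wizard casts a spell", "armies clash at dawn"]
y_train = [0, 1, 0, 1]          # e.g. 0 = magic chapter, 1 = battle chapter
test_texts = ["the spell fails", "the siege begins"]

preds, selected_features, model = classify(MultinomialNB(), train_texts, y_train, test_texts, k=5)
print(preds, selected_features)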
Example 13: tfidf_covariance
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input=str,
                                     strip_accents='ascii',
                                     analyzer='word',
                                     max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)
    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1 / (sigma ** 2) * np.eye(len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
Example 14: fit
def fit(self, docs, clean=False):
    '''
    pipeline: clean, tokenize, tfidf, nmf, kmeans
    '''
    if clean:
        print 'cleaning raw docs ......'
        clean_docs = self.clean(docs)
    else:
        clean_docs = docs
    print 'running tfidf ......'
    if 'tokenizer' not in self.kw_tfidf:
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                     **self.kw_tfidf)
    else:
        self.tfidf = TfidfVectorizer(**self.kw_tfidf)
    X = self.tfidf.fit_transform(clean_docs)
    print 'running NMF ......'
    self.nmf = NMF(**self.kw_nmf)
    H = self.nmf.fit_transform(X)
    W = self.nmf.components_
    print 'fetching top 50 words for each topic ......'
    self.top_n_words(50, W)
    return X, H, W
Example 15: MedicalKeywordTfIdf
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        # join the names of the keyword columns set to 1 into one pseudo-document per row
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
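A minimal usage sketch for this transformer, assuming a pandas DataFrame with binary Medical_Keyword_1 … Medical_Keyword_48 columns. The data below is made up; min_df=2 means a keyword must occur in at least two rows to survive, and the get_feature_names call mirrors the class above, which relies on an older scikit-learn release (newer ones expose get_feature_names_out):

import numpy as np
import pandas as pd

# Hypothetical toy frame: 4 rows, 48 binary keyword columns.
rng = np.random.RandomState(0)
columns = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(rng.randint(0, 2, size=(4, 48)), columns=columns)

transformer = MedicalKeywordTfIdf()
features = transformer.fit(df).transform(df)    # dense TF-IDF matrix over keyword names
print(features.shape)
print(transformer.get_feature_names()[:5])      # e.g. ['medical_keyword_10_TFIDF', ...]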