本文整理匯總了Python中sklearn.feature_extraction.text.TfidfTransformer方法的典型用法代碼示例。如果您正苦於以下問題:Python text.TfidfTransformer方法的具體用法?Python text.TfidfTransformer怎麼用?Python text.TfidfTransformer使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類sklearn.feature_extraction.text的用法示例。
在下文中一共展示了text.TfidfTransformer方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: write_topics
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    """Fit an NMF topic model on TF-IDF features and persist the results.

    Three files are written: the top words of every topic as UTF-8 text (one
    topic per line), the pickled vocabulary, and the pickled topic-word
    matrix (``nmf.components_``).

    Parameters
    ----------
    ftopics : str
        Output path for the human-readable topic lines.
    fwords : str
        Output path for the pickled list of feature names.
    ftopics_words : str
        Output path for the pickled ``nmf.components_`` array.
    poem_words : iterable
        Documents handed to ``count_vect.fit_transform``.
        NOTE(review): presumably one whitespace-joined token string per
        poem -- confirm against the caller.
    n_topic : int
        Number of NMF components (topics) to fit.
    n_topic_words : int
        Number of highest-weighted words written for each topic.

    Notes
    -----
    Relies on a module-level ``count_vect`` vectorizer that is defined
    outside this function.
    """
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and keep a plain list so the pickle written
    # below has the same shape on every version.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out().tolist()
    else:
        feature_names = count_vect.get_feature_names()

    # Context managers guarantee each handle is closed even if a write fails.
    with codecs.open(ftopics, 'w', 'utf-8') as fw:
        for topic in nmf.components_:
            # argsort is ascending, so walk it backwards to take the
            # n_topic_words largest weights first.
            top_ids = topic.argsort()[:-n_topic_words - 1:-1]
            fw.write(' '.join([feature_names[i] for i in top_ids]) + '\n')
    print('Write topics done.')

    with codecs.open(fwords, 'wb') as fw:
        pickle.dump(feature_names, fw)
    print('Write words done.')

    with codecs.open(ftopics_words, 'wb') as fw:
        pickle.dump(nmf.components_, fw)
    print('Write topic_words done.')
示例2: get_logistic_regression_coefs_l2
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def get_logistic_regression_coefs_l2(self, category,
                                     clf=None):
    ''' Scores terms with an L2-regularised linear classifier on TF-IDF features.

    Despite the method name, the default estimator is ``RidgeClassifierCV``
    rather than a logistic regression.

    Parameters
    ----------
    category : str
        category name to score
    clf : estimator, optional
        classifier exposing ``fit`` and ``coef_``.  When omitted a fresh
        ``RidgeClassifierCV()`` is created for this call, so that no fitted
        state is shared between invocations.

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    # Only a missing module should trigger the fallback; a bare ``except``
    # would also swallow KeyboardInterrupt and unrelated errors.
    try:
        from sklearn.model_selection import cross_val_predict
    except ImportError:
        # Very old scikit-learn releases only ship the legacy module.
        from sklearn.cross_validation import cross_val_predict
    if clf is None:
        clf = RidgeClassifierCV()
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    # Fit on the full data for the coefficients; the cross-validated
    # predictions below are used only for the accuracy figures.
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
示例3: test_main
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def test_main(self):
    """Smoke-test fitting a classifier on a term-document matrix and scoring a new document.

    The test has no assertions; it passes as long as building the matrix,
    fitting the model and featurising an unseen document do not raise.
    """
    def clean_function(text):
        # Documents that start with '[' are blanked out entirely.
        return '' if text.startswith('[') else text

    categories, documents = get_docs_categories()
    entity_types = {'GPE'}

    factory = TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
    )
    term_doc_mat = factory.build()

    clf = PassiveAggressiveClassifier()

    # Featuriser for unseen text that reuses the term index of the fitted matrix.
    doc_featurizer = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=clean_function,
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
    )
    fdc = doc_featurizer.set_nlp(_testing_nlp)

    tfidf = TfidfTransformer(norm='l1')
    train_features = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(train_features, term_doc_mat._y)

    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    # NOTE(review): unlike predict() above, the raw (un-transformed) features
    # are passed here -- confirm whether that is intentional.
    dec = clf.decision_function(X_to_predict)
示例4: tdidf_sim
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def tdidf_sim(sentences):
    """Build a TF-IDF weighted term matrix for *sentences*.

    The sentences are tokenised with ``jieba.cut`` and vectorised as uni- and
    bi-grams by ``TfidfVectorizer``; the result is then passed through a
    default ``TfidfTransformer`` once more.

    NOTE(review): the vectorizer output is already TF-IDF weighted, so the
    second transformer re-weights it.  Kept as-is because callers may depend
    on the resulting values -- confirm whether the double weighting is wanted.

    :param sentences: iterable of raw sentence strings
    :return: sparse matrix returned by ``TfidfTransformer.fit_transform``
    """
    # TF-IDF vectorisation.
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),  # unigrams and bigrams
                            stop_words=[' ', '\t', '\n'],  # drop whitespace tokens
                            max_features=10000,
                            # NOTE(review): scikit-learn documents that
                            # token_pattern is not used when a custom
                            # tokenizer is supplied; kept unchanged.
                            token_pattern=r"(?u)\b\w+\b",
                            min_df=1,
                            max_df=0.9,
                            # Real booleans: recent scikit-learn validates
                            # parameter types and rejects the integer 1.
                            use_idf=True,  # apply inverse-document-frequency weighting
                            smooth_idf=True,  # add-one smoothing of document frequencies
                            sublinear_tf=True)  # use 1 + log(tf) instead of raw tf
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
示例5: test_tf_idf_smoothing
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def test_tf_idf_smoothing():
    """Check that smoothed, L2-normalised TF-IDF is non-negative and unit-length per row."""
    dense_counts = [
        [1, 1, 1],
        [1, 1, 0],
        [1, 0, 0],
    ]
    transformer = TfidfTransformer(smooth_idf=True, norm='l2')
    weights = transformer.fit_transform(dense_counts).toarray()
    assert (weights >= 0).all()

    # check normalization: every row must have a squared L2 norm of one
    squared_row_norms = (weights ** 2).sum(axis=1)
    assert_array_almost_equal(squared_row_norms, [1., 1., 1.])

    # this is robust to features with only zeros (third column is empty)
    counts_with_empty_feature = [
        [1, 1, 0],
        [1, 1, 0],
        [1, 0, 0],
    ]
    transformer = TfidfTransformer(smooth_idf=True, norm='l2')
    weights = transformer.fit_transform(counts_with_empty_feature).toarray()
    assert (weights >= 0).all()
示例6: extract_characters
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def extract_characters(sentences: list, dimension: int):
    """Vectorise *sentences* with TF-IDF and project the result with PCA.

    NOTE(review): the ``TfidfVectorizer`` output is already TF-IDF weighted
    and is passed through a ``TfidfTransformer`` a second time; this is kept
    unchanged because the returned values depend on it.

    :param sentences: list of documents to vectorise
    :param dimension: number of PCA components to keep
    :return: ``(weight, training_data)`` -- the dense TF-IDF matrix and its
        PCA projection onto ``dimension`` components
    """
    print("Vetorizier...")
    # Build the weighted term matrix; max_df drops terms that appear in more
    # than 46% of the documents.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    transformer = TfidfTransformer()
    # Inner call builds the document-term matrix, outer call re-weights it.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences))
    # weight[i][j] is the weight of term j in document i.
    weight = tfidf.toarray()
    # The column count equals the vocabulary size, so the feature-name list
    # (whose accessor get_feature_names() was removed in scikit-learn 1.2)
    # does not need to be materialised just to be counted.
    print('Features length:' + str(weight.shape[1]))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data
示例7: build_vectorization_pipeline
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """
    Assemble the scikit-learn vectorization steps for this field.

    Field-based machine learning derives the value of one field from the
    values of other fields of the same document (currently only choice
    fields can be detected this way).  To do so, a feature vector is built
    for every dependency of the detected field and the vectors are combined
    into one; this method supplies the steps for a single field.
    See FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..) for how the
    complete pipeline is put together.

    :return: Tuple of: 1. List of (name, transformer) steps to add to a Pipeline()
                       2. The value produced by ``_wrap_get_feature_names`` for
                          the count vectorizer (feature-name accessor).
    """
    count_vectorizer = CountVectorizer(
        strip_accents='unicode',
        analyzer='word',
        stop_words=self._build_stop_words(),
    )
    steps = [
        # Missing values are replaced by an empty string before counting.
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', count_vectorizer),
        ('tfidf', TfidfTransformer()),
    ]
    feature_names_getter = self._wrap_get_feature_names(count_vectorizer)
    return steps, feature_names_getter
示例8: get_topic_idf
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def get_topic_idf(self, sentences):
    """Return the words whose normalised summed TF-IDF weight exceeds ``self.topic_threshold``.

    The un-normalised, un-smoothed TF-IDF weights of every sentence are
    summed per term into a centroid vector, which is then scaled so that its
    largest entry is 1.

    :param sentences: iterable of sentence strings
    :return: list of vocabulary words above the threshold
    """
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)
    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix).toarray()

    # Column sums give one aggregate weight per term; dividing by the maximum
    # makes the threshold comparison relative to the strongest term.
    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    relevant_vector_indices = np.where(centroid_vector > self.topic_threshold)[0]
    word_list = list(np.array(feature_names)[relevant_vector_indices])
    return word_list
示例9: get_keyword_from_tf
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def get_keyword_from_tf(sentences, p):
    """
    Return the trailing fraction ``p`` of the vocabulary found in *sentences*.

    Each sentence is segmented with jieba, the segments are joined with
    spaces, and a ``CountVectorizer`` is fitted on the result.

    NOTE(review): the original description calls these "hot words", but no
    term frequencies are computed here -- the words are taken from the end of
    the feature-name list in the order the vectorizer reports them.  Confirm
    whether ranking by frequency was intended.

    :param sentences: list, raw sentences to segment
    :param p: float, rate, 0 < p < 1
    :return: list, words (empty when ``int(len(vocabulary) * p)`` is 0)
    """
    sentence_cut_list = [" ".join(jieba.cut(text.strip(), cut_all=False, HMM=True)) for text in sentences]
    # The explicit token_pattern also keeps single-character tokens, which
    # the default pattern would drop.
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    vectorizer.fit(sentence_cut_list)
    # get_feature_names() was removed in scikit-learn 1.2; keep a plain list
    # so the return type is the same on every version.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out().tolist()
    else:
        word = vectorizer.get_feature_names()
    keep = int(len(word) * p)
    if keep == 0:
        # word[-0:] would be word[0:], i.e. the whole vocabulary instead of
        # nothing, so the zero case has to be handled explicitly.
        return []
    return word[-keep:]
示例10: load_data_ssl
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def load_data_ssl(data_name):
    """Load a graph dataset and reshape it for semi-supervised learning.

    Wraps ``load_data`` and returns a dense adjacency matrix, dense node
    features (TF-IDF weighted unless the dataset is pubmed), and, for each of
    the train / validation / test splits, a column of node indices together
    with a column of integer class labels.

    :param data_name: dataset identifier forwarded to ``load_data``
    :return: ``(adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test)``
    """
    def _selected_indices(mask):
        # Positions where the mask is set, as an (n, 1) column.
        return np.reshape(np.arange(len(mask))[mask], (-1, 1))

    def _class_ids(onehot, mask):
        # Multiply each selected row by [0, 1, ..., n_classes - 1] and sum it,
        # which turns a one-hot row into its class index; result is (n, 1).
        picked = np.asarray(onehot[mask], dtype=np.int32)
        class_grid = np.tile(np.arange(picked.shape[1]), (np.sum(mask), 1))
        return np.reshape(np.sum(class_grid * picked, axis=1), (-1, 1))

    (adj_csr, features,
     y_train, y_val, y_test,
     train_mask, val_mask, test_mask) = load_data(data_name)

    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)

    x_tr = _selected_indices(train_mask)
    x_val = _selected_indices(val_mask)
    x_test = _selected_indices(test_mask)

    y_tr = _class_ids(y_train, train_mask)
    y_val = _class_ids(y_val, val_mask)
    y_test = _class_ids(y_test, test_mask)

    node_features = features.toarray()
    if data_name.lower() != 'pubmed':  # pubmed already comes with tf-idf
        tfidf = TfidfTransformer(smooth_idf=True)
        node_features = tfidf.fit_transform(node_features).toarray()

    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test
示例11: test_nlp_not_padded_invalid
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def test_nlp_not_padded_invalid(self):
    """Fitting CXPlain on variable-length (un-padded) sequences must raise ValueError."""
    num_words = 1024
    (x_train, y_train), _unused_test_split = \
        TestUtil.get_random_variable_length_dataset(max_value=num_words)

    # Model to be explained: counts -> TF-IDF -> random forest.
    forest = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    explained_model = Pipeline([
        ('counts', CountVectoriser(num_words)),
        ('tfidf', TfidfTransformer()),
        ('model', forest),
    ])
    explained_model.fit(x_train, y_train)

    # Explanation model and its supporting pieces.
    model_builder = RNNModelBuilder(
        embedding_size=num_words, with_embedding=True,
        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128,
    )
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    with self.assertRaises(ValueError):
        explainer.fit(x_train, y_train)
示例12: transform_bag_of_words
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def transform_bag_of_words(filename, n_dimensions, out_fn):
    """Convert a gzipped bag-of-words file into a dense, reduced train/test dataset.

    The input starts with three header lines -- the first two are read as the
    number of documents and the number of words, the third is skipped --
    followed by one ``doc word count`` triple per line with 1-based ids.
    The counts are TF-IDF weighted, reduced to ``n_dimensions`` with a
    Gaussian random projection, split into train and test parts and written
    with ``write_output`` using the 'angular' distance label.

    :param filename: path of the gzip-compressed input file
    :param n_dimensions: target dimensionality of the random projection
    :param out_fn: output path forwarded to ``write_output``
    """
    import gzip
    from itertools import islice
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    # Stream the file instead of loading every line with readlines(), and
    # release the handle before the expensive transforms start.
    with gzip.open(filename, 'rb') as f:
        header = list(islice(f, 3))  # consume the three header lines
        entries = int(header[0])
        words = int(header[1])
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in f:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            # Ids in the file are 1-based; matrix indices are 0-based.
            A[doc - 1, word - 1] = cnt

    print("normalizing matrix entries with tfidf...")
    B = TfidfTransformer().fit_transform(A)
    print("reducing dimensionality...")
    C = random_projection.GaussianRandomProjection(
        n_components=n_dimensions).fit_transform(B)
    X_train, X_test = train_test_split(C)
    write_output(numpy.array(X_train), numpy.array(
        X_test), out_fn, 'angular')
示例13: question_classifier
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def question_classifier(data):
    """Train a text classifier that predicts the SQL type of a question.

    For every row of *data*, element 1 is used as the question text and the
    second space-separated token of element 2 is used as its SQL type.
    The classifier is evaluated on its own training data only, and that
    accuracy is printed.

    :param data: sequence of rows indexable at positions 1 and 2
    :return: ``(sql_type_to_indices, text_clf)`` -- a dict mapping each SQL
        type to the row indices that carry it, and the fitted pipeline
    """
    questions = [row[1] for row in data]
    sql_type = [row[2].split(' ')[1] for row in data]
    sql_type_set = set(sql_type)

    # Assign an integer label to each distinct type (avoids shadowing the
    # builtin ``type`` as the loop variable).
    sql_classes = {kind: label for label, kind in enumerate(sql_type_set)}
    target = np.array([sql_classes[kind] for kind in sql_type])

    # Group row indices by type in one pass instead of rescanning the whole
    # list once per type.
    sql_type_to_indices = {kind: [] for kind in sql_type_set}
    for idx, kind in enumerate(sql_type):
        sql_type_to_indices[kind].append(idx)

    # Build classifier
    # TODO better ones
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.LinearSVC())])
    text_clf.fit(questions, target)
    predicted = text_clf.predict(questions)
    print('Training Acc.: %f' % (np.mean(predicted == target)))
    return sql_type_to_indices, text_clf
示例14: removeSimilarSentences
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80,):
    """Drop generated sentences that are too similar to any original sentence.

    Generated and original sentences are vectorised together; a generated
    sentence is discarded when its highest linear-kernel score against the
    original sentences reaches ``threshold``.

    :param generatedSentences: sequence of ``(sentence, score)`` pairs
    :param originalSentences: iterable of original sentence strings
    :param stopwords: stop words forwarded to ``StemmedTfidfVectorizer``
    :param threshold: similarity at or above which a sentence is removed
    :return: list of the ``(sentence, score)`` pairs that were kept
    """
    docs = [sent for sent, _score in generatedSentences]
    n_generated = len(docs)
    docs.extend(originalSentences)

    # With nothing to filter, or nothing to compare against, no sentence can
    # be "too similar"; returning early also avoids max() on an empty row.
    if n_generated == 0 or len(docs) == n_generated:
        return list(generatedSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # The rows of the original sentences never change, so slice them once.
    original_rows = normalized[n_generated:]

    finalGen = []
    # ``enumerate`` replaces the Python-2-only ``xrange`` index loop.
    for i, generated in enumerate(generatedSentences):
        simGeneratedScores = linear_kernel(normalized[i], original_rows).flatten()
        if not max(simGeneratedScores) >= threshold:
            finalGen.append(generated)
    return finalGen
示例15: __init__
# 需要導入模塊: from sklearn.feature_extraction import text [as 別名]
# 或者: from sklearn.feature_extraction.text import TfidfTransformer [as 別名]
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    Set up the vectorize -> TF-IDF -> normalize pipeline.

    `min_df` filters out extremely rare words so that they do not dominate
    the distance metric.
    `max_df` filters out extremely common words, which carry little
    information.
    Both are passed only to the counting vectorizer; they are not used when
    `hash` is true.
    `tokenizer` is a class (or factory) that is instantiated and wrapped.
    `hash` selects a HashingVectorizer instead of a CountVectorizer.
    """
    # Wrap the specified tokenizer
    wrapped_tokenizer = Tokenizer(tokenizer())

    # Options common to both vectorizer flavours.
    shared_options = dict(input='content', stop_words='english',
                          lowercase=True, tokenizer=wrapped_tokenizer)
    if hash:
        vectorizer = HashingVectorizer(**shared_options)
    else:
        vectorizer = CountVectorizer(min_df=min_df, max_df=max_df, **shared_options)

    # The TF-IDF step leaves vectors un-normalised (norm=None); normalisation
    # is done by the dedicated final step.
    self.pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False)),
    ])
    self.trained = False