This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfTransformer. If you are wondering what text.TfidfTransformer does, how to use it, or what it looks like in real code, the curated examples below may help. You can also read more about the containing module, sklearn.feature_extraction.text.
Below are 15 code examples of text.TfidfTransformer, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
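Before the examples, a minimal sketch of the canonical usage pattern may help (standard scikit-learn API; the corpus is made up): CountVectorizer builds a sparse term-count matrix, and TfidfTransformer reweights it into l2-normalized tf-idf vectors.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ['the cat sat on the mat', 'the dog sat on the log']
counts = CountVectorizer().fit_transform(corpus)  # sparse document-term count matrix
tfidf = TfidfTransformer().fit_transform(counts)  # tf-idf weighted, l2-normalized rows
print(tfidf.shape)  # (2, number_of_vocabulary_terms)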
Example 1: write_topics

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
import codecs
import pickle

from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# The original snippet relies on a module-level `count_vect`;
# a plain CountVectorizer is assumed here.
count_vect = CountVectorizer()

def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()
    # Write the top words of each topic, one topic per line
    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1]]) + '\n')
    fw.close()
    print('Write topics done.')
    fw = codecs.open(fwords, 'wb')
    pickle.dump(feature_names, fw)
    fw.close()
    print('Write words done.')
    fw = codecs.open(ftopics_words, 'wb')
    pickle.dump(nmf.components_, fw)
    fw.close()
    print('Write topic_words done.')
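The slice topic.argsort()[:-n_topic_words - 1:-1] above is what selects each topic's top words: argsort returns indices in ascending weight order, and the reversed slice takes the n_topic_words largest. A toy check with made-up weights:

import numpy as np

topic = np.array([0.1, 0.7, 0.3, 0.5])
n_topic_words = 2
print(topic.argsort()[:-n_topic_words - 1:-1])  # [1 3]: indices of the two largest weights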
Example 2: get_logistic_regression_coefs_l2

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def get_logistic_regression_coefs_l2(self, category,
                                     clf=RidgeClassifierCV()):
    '''Computes a cross-validated, l2-penalized (ridge) classification score.

    Parameters
    ----------
    category : str
        category name to score

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    try:
        from sklearn.cross_validation import cross_val_predict
    except ImportError:  # the module moved in sklearn >= 0.18
        from sklearn.model_selection import cross_val_predict
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
Example 3: test_main

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def test_main(self):
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier()
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
Example 4: tdidf_sim

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
import jieba

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

def tdidf_sim(sentences):
    """
    Build the tf-idf matrix used for sentence similarity.
    :param sentences: list of str
    :return: sparse matrix, one l2-normalized tf-idf row per sentence
    """
    # tf-idf vectorization
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,       # weight by inverse document frequency
                            smooth_idf=1,    # smooth the idf weights
                            sublinear_tf=1)  # sublinear tf scaling: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
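Because the returned rows are l2-normalized, plain dot products between them are cosine similarities. A sketch of how the result might be consumed (toy Chinese sentences; assumes jieba is installed):

from sklearn.metrics.pairwise import linear_kernel

matrix_norm = tdidf_sim(['今天天气很好', '今天天气不错', '我在图书馆读书'])
sim = linear_kernel(matrix_norm, matrix_norm)  # pairwise cosine similarity
print(sim.round(2))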
Example 5: test_tf_idf_smoothing

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
from numpy.testing import assert_array_almost_equal

from sklearn.feature_extraction.text import TfidfTransformer

def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()
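With smooth_idf=True, scikit-learn computes idf as ln((1 + n) / (1 + df)) + 1, i.e. as if one extra document containing every term had been seen, which keeps the weights finite even for terms that occur in every document. A quick check against the fitted idf_ attribute:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

X = np.array([[1, 1, 1],
              [1, 1, 0],
              [1, 0, 0]])
tr = TfidfTransformer(smooth_idf=True).fit(X)
n = X.shape[0]
df = (X > 0).sum(axis=0)  # document frequency of each term
print(np.allclose(tr.idf_, np.log((1 + n) / (1 + df)) + 1))  # True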
Example 6: extract_characters

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def extract_characters(sentences: list, dimension: int):
    """
    Vectorize sentences and reduce the tf-idf weights to `dimension` components.
    :param sentences: list
    :param dimension: int
    :return: weight, training_data
    """
    print("Vectorizing...")
    # Build the frequency matrix a[i][j]: frequency of word j in document i
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    # vectorizer = CountVectorizer()
    # compute the tf-idf weights
    transformer = TfidfTransformer()
    # the inner call builds the document-term matrix, the outer call reweights it with tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences))
    # all words in the bag-of-words vocabulary
    words_bag = vectorizer.get_feature_names()
    # weight[i][j] is the weight of word j in document i
    weight = tfidf.toarray()
    print('Features length: ' + str(len(words_bag)))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data
Example 7: build_vectorization_pipeline

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """
    Build an SKLearn vectorization pipeline for this field.
    This is used in field-based machine learning when we calculate the value of one field
    based on the values of other fields of this document.
    We are only able to detect choice fields this way at the moment.
    To do this we need to build a feature vector of all dependencies of the field being detected.
    This feature vector is built as a union of the feature vectors of each dependency.
    See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)
    :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                       2. List of str feature names or a function returning a list of str feature names.
    """
    vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                           stop_words=self._build_stop_words())
    return [('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', vect),
            ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
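The returned steps are meant to be concatenated into a full Pipeline by the caller; a hypothetical sketch of that assembly (field, texts and labels are made up here, and the classifier step is an arbitrary choice, not the project's):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

steps, get_feature_names = field.build_vectorization_pipeline()  # hypothetical field object
clf = Pipeline(steps + [('clf', LogisticRegression())])
clf.fit(texts, labels)
print(get_feature_names()[:10])  # assumes the second return value is the callable form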
Example 8: get_topic_idf

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def get_topic_idf(self, sentences):
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix)
    tfidf = tfidf.toarray()

    # Sum the tf-idf rows into a centroid vector and rescale it to [0, 1]
    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())

    # Keep the words whose centroid weight exceeds the topic threshold
    feature_names = vectorizer.get_feature_names()
    relevant_vector_indices = np.where(centroid_vector > self.topic_threshold)[0]
    word_list = list(np.array(feature_names)[relevant_vector_indices])
    return word_list
Example 9: get_keyword_from_tf

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
import jieba

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_keyword_from_tf(sentences, p):
    """
    Get the hot words of the corpus for a given category.
    :param sentences: list, cut sentences, separated by " "
    :param p: float, rate, 0 < p < 1
    :return: list, words
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    # token_pattern sets the pattern used for counting term frequencies; the default,
    # as for English, skips single-character tokens
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    # norm=None: do not normalize the term-frequency results
    # use_idf=False: we use the tf-idf machinery but only want raw term frequency,
    # so the idf computation is skipped
    transformer = TfidfTransformer(norm=None, use_idf=False)
    vectorizer.fit_transform(sentence_cut_list)
    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
    word = vectorizer.get_feature_names()
    # weight = tf.toarray()
    return word[-int(len(word) * p):]
Example 10: load_data_ssl

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def load_data_ssl(data_name):
    adj_csr, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(data_name)
    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)
    x_tr = np.reshape(np.arange(len(train_mask))[train_mask], (-1, 1))
    x_val = np.reshape(np.arange(len(val_mask))[val_mask], (-1, 1))
    x_test = np.reshape(np.arange(len(test_mask))[test_mask], (-1, 1))
    # Convert one-hot label rows into integer class indices
    y_tr = np.asarray(y_train[train_mask], dtype=np.int32)
    y_tr = np.reshape(np.sum(np.tile(np.arange(y_tr.shape[1]), (np.sum(train_mask), 1)) * y_tr, axis=1), (-1, 1))
    y_val = np.asarray(y_val[val_mask], dtype=np.int32)
    y_val = np.reshape(np.sum(np.tile(np.arange(y_val.shape[1]), (np.sum(val_mask), 1)) * y_val, axis=1), (-1, 1))
    y_test = np.asarray(y_test[test_mask], dtype=np.int32)
    y_test = np.reshape(np.sum(np.tile(np.arange(y_test.shape[1]), (np.sum(test_mask), 1)) * y_test, axis=1), (-1, 1))
    node_features = features.toarray()
    if data_name.lower() != 'pubmed':  # pubmed already comes with tf-idf features
        transformer = TfidfTransformer(smooth_idf=True)
        node_features = transformer.fit_transform(node_features).toarray()
    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test
Example 11: test_nlp_not_padded_invalid

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def test_nlp_not_padded_invalid(self):
    num_words = 1024
    (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    # Variable-length (unpadded) inputs must be rejected
    with self.assertRaises(ValueError):
        explainer.fit(x_train, y_train)
Example 12: transform_bag_of_words

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
# Note: numpy, train_test_split and write_output are module-level imports/helpers in the source repo.
def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection
    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip the three header lines
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt
        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)
        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(
            n_components=n_dimensions).fit_transform(B)
        X_train, X_test = train_test_split(C)
        write_output(numpy.array(X_train), numpy.array(
            X_test), out_fn, 'angular')
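The file parsed above appears to follow the UCI bag-of-words layout: the first line is the number of documents, the second the vocabulary size, the third the number of non-zero counts, followed by one "docID wordID count" triple per line. A fabricated miniature file in that layout, for reference only:

# 3        <- number of documents
# 5        <- vocabulary size
# 4        <- number of (doc, word, count) triples that follow
# 1 2 3
# 1 5 1
# 2 1 2
# 3 4 1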
Example 13: question_classifier

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
import numpy as np

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

def question_classifier(data):
    questions = [i[1] for i in data]
    sql_type = [i[2].split(' ')[1] for i in data]
    sql_type_set = set(sql_type)
    sql_classes = dict([(type, i) for i, type in enumerate(sql_type_set)])
    target = np.array([sql_classes[i] for i in sql_type])
    sql_type_to_indices = {}
    for type in sql_type_set:
        sql_type_to_indices[type] = [idx for idx, i in enumerate(sql_type) if i == type]
    # Build classifier
    # TODO better ones
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.LinearSVC())])
    text_clf.fit(questions, target)
    predicted = text_clf.predict(questions)
    print('Training Acc.: %f' % (np.mean(predicted == target)))
    return sql_type_to_indices, text_clf
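A hypothetical call, with made-up rows shaped as (table_id, question, sql) so that i[2].split(' ')[1] yields the token right after SELECT:

data = [(0, 'How many singers are there?', 'SELECT count(*) FROM singer'),
        (1, 'List the names of all singers.', 'SELECT name FROM singer'),
        (2, 'How many albums exist?', 'SELECT count(*) FROM album'),
        (3, 'List all album titles.', 'SELECT title FROM album')]
sql_type_to_indices, text_clf = question_classifier(data)
print(sql_type_to_indices)  # e.g. {'count(*)': [0, 2], 'name': [1], 'title': [3]}
print(text_clf.predict(['How many concerts are there?']))  # integer class index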
Example 14: removeSimilarSentences

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs = []
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    # Drop any generated sentence that is too similar to one of the originals
    simindices = []
    for i in range(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)
    finalGen = [sentence for k, sentence in enumerate(generatedSentences) if k not in simindices]
    return finalGen
Example 15: __init__

# Required module import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfTransformer [as alias]
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    # Wrap the specified tokenizer
    t = Tokenizer(tokenizer())

    if hash:
        vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
    else:
        vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

    args = [
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False))
    ]

    self.pipeline = Pipeline(args)
    self.trained = False
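A sketch of how such an object might be used; the host class name is made up here (only its __init__ is shown above), and the default LemmaTokenizer typically requires extra NLP resources:

docs = ['The cats are sleeping.', 'A dog barked loudly.', 'Dogs and cats sleep.']
vec = SomeVectorizerClass(min_df=1, max_df=0.9)  # hypothetical host class
X = vec.pipeline.fit_transform(docs)  # tf-idf weighted, length-normalized vectors
print(X.shape)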