This article collects typical, real-world usage examples of Python's sklearn.feature_extraction.text.TfidfVectorizer. If you are unsure what TfidfVectorizer does, how to call it, or what working code looks like, the curated examples below should help. You can also explore the module that contains it, sklearn.feature_extraction.text, for related classes.
The following presents 15 code examples of TfidfVectorizer, sorted by popularity by default.
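Before the project-specific examples, here is a minimal, self-contained sketch of the core fit/transform workflow; the corpus is invented for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs",
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)          # sparse matrix of shape (3, n_terms)
print(X.shape)
print(vectorizer.get_feature_names_out())     # learned vocabulary (scikit-learn >= 1.0)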
Example 1: text_to_graph

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def text_to_graph(text):
    import networkx as nx
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import kneighbors_graph

    # use TF-IDF to transform the texts into feature vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(text)

    # build a fully connected graph: with n_neighbors equal to the number of
    # documents, every document is linked to every other one
    N = vectors.shape[0]
    mat = kneighbors_graph(vectors, N, metric='cosine', mode='distance', include_self=True)
    mat.data = 1 - mat.data  # convert cosine distances to similarities
    # note: from_scipy_sparse_matrix was removed in networkx 3.0;
    # use nx.from_scipy_sparse_array there instead
    g = nx.from_scipy_sparse_matrix(mat, create_using=nx.Graph())
    return g
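A quick usage sketch (the three documents are invented for illustration; text_to_graph is the function above):

docs = ["machine learning with python",
        "deep learning for text",
        "python text mining"]
g = text_to_graph(docs)
print(g.number_of_nodes())   # 3 -- one node per document
print(g.edges(data=True))    # cosine-similarity weights on every pair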
Example 2: load

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def load(self):
    import numpy as np
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer

    categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)

    vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)

    x1 = vectors
    y1 = newsgroups_train.target
    x2 = vectors_test
    y2 = newsgroups_test.target
    # densify and stack the train/test features into a single array
    x = np.array(np.r_[x1.todense(), x2.todense()])
    y = np.r_[y1, y2]
    return x, y
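A note on the densifying step: np.r_[x1.todense(), x2.todense()] materializes the full matrix in memory. For larger corpora a sparse vertical stack keeps memory bounded; a sketch (scipy ships as a scikit-learn dependency):

import scipy.sparse as sp
x = sp.vstack([x1, x2])   # same rows as the dense version, but still sparse
y = np.r_[y1, y2]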
Example 3: train

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def train(self, training_data) -> None:
    from collections import defaultdict
    from sklearn.feature_extraction.text import TfidfVectorizer

    questions = training_data[0]
    answers = training_data[1]

    # concatenate all question text that shares an answer into one document
    answer_docs = defaultdict(str)
    for q, ans in zip(questions, answers):
        text = ' '.join(q)
        answer_docs[ans] += ' ' + text

    x_array = []
    y_array = []
    for ans, doc in answer_docs.items():
        x_array.append(doc)
        y_array.append(ans)

    self.i_to_ans = {i: ans for i, ans in enumerate(y_array)}
    self.tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3), min_df=2, max_df=.9
    ).fit(x_array)
    self.tfidf_matrix = self.tfidf_vectorizer.transform(x_array)
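The matching inference method is not included in this excerpt. A hedged sketch of how a guess method could use the fitted state above (the method name and shape are assumptions, not the original code); because TfidfVectorizer L2-normalizes rows by default, the sparse dot product below is a cosine similarity:

def guess(self, questions):
    # hypothetical companion method, for illustration only
    reps = self.tfidf_vectorizer.transform(questions)
    sims = reps.dot(self.tfidf_matrix.T).toarray()   # cosine similarities
    return [self.i_to_ans[i] for i in sims.argmax(axis=1)]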
Example 4: test_ShapLinearExplainer

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def test_ShapLinearExplainer(self):
    # module-level imports in the original: shap, sklearn.linear_model, and
    # from sklearn.model_selection import train_test_split
    corpus, y = shap.datasets.imdb()
    corpus_train, corpus_test, y_train, y_test = train_test_split(
        corpus, y, test_size=0.2, random_state=7)

    vectorizer = TfidfVectorizer(min_df=10)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    model.fit(X_train, y_train)

    # LinearExplainer is the wrapper class under test, imported elsewhere in the
    # original test module; note that newer shap releases renamed the
    # feature_dependence argument to feature_perturbation
    shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
    shap_values = shapexplainer.explain_instance(X_test)
    print("Invoked Shap LinearExplainer")
    # assertions are commented out because Travis runs out of resources
Example 5: extract_bow

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv",
                analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1,
                max_features=10000, use_idf=True, to_preprocess=True):
    """Return sparse TF-IDF matrices for train q1/q2 and test q1/q2, plus the fitted vectorizer."""
    import pandas as pd
    import jieba
    from sklearn.feature_extraction.text import TfidfVectorizer

    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")

    # fit the vectorizer on the set of unique titles from both files
    df = pd.DataFrame()
    df['text'] = pd.Series(
        df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist()
        + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()
    ).unique()

    if to_preprocess:
        # preprocess is defined elsewhere in the original module
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)

    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df,
                               max_features=max_features, use_idf=use_idf)
    else:
        # word-level: tokenize the Chinese titles with jieba
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut,
                               ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
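Because TfidfVectorizer L2-normalizes rows by default (norm='l2'), the row-wise dot product of the two title matrices is a per-pair cosine similarity feature. A hedged sketch, assuming the CSV files exist at the default paths above:

import numpy as np
q1_train, q2_train, q1_test, q2_test, vect = extract_bow(ngram_range=(1, 2))
# element-wise product summed over columns == row-wise dot product
train_sim = np.asarray(q1_train.multiply(q2_train).sum(axis=1)).ravel()
print(train_sim[:5])   # cosine similarity of the first five title pairs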
Example 6: _init_word_ngram_tfidf

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
    # word-level TF-IDF; token_pattern r"\w{1,}" also keeps single-character tokens
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="word",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf
## char based
Example 7: _init_char_ngram_tfidf

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
    # char-level TF-IDF; token_pattern is ignored when analyzer="char"
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="char",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf
# ------------------------ LSA -------------------------------
Example 8: create_ngram_model

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced  # kept from the original; unused in this excerpt
        tweet = tweet.lower()
        # emo_repl_order, emo_repl and re_repl are module-level tables
        # defined elsewhere in the original script
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # was .iteritems() in the Python 2 original
            tweet = re.sub(r, repl, tweet)
        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline

Author: PacktPublishing, Project: Building-Machine-Learning-Systems-With-Python-Second-Edition, Lines: 23, Source: 03_clean.py
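create_ngram_model's preprocessor closes over three module-level tables defined elsewhere in the original script. A hedged usage sketch with minimal stand-in tables (the real script defines much richer emoticon and regex replacements), assuming the function and these imports live in one script:

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

emo_repl = {":)": " good ", ":(": " bad "}           # stand-in emoticon table
emo_repl_order = sorted(emo_repl, key=len, reverse=True)
re_repl = {r"\bdont\b": "do not"}                    # stand-in regex table

model = create_ngram_model()
model.fit(["I :) this movie", "I :( this movie"], [1, 0])
print(model.predict(["totally :)"]))                 # expected: [1]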
Example 9: create_union_model

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # was .iteritems() in the Python 2 original
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()  # custom transformer defined elsewhere in the script
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline

Author: PacktPublishing, Project: Building-Machine-Learning-Systems-With-Python-Second-Edition, Lines: 27, Source: 04_sent.py
Example 10: tfidf_fit

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def tfidf_fit(sentences):
    """
    Build a TF-IDF representation for similarity computations.
    :param sentences: iterable of text strings
    :return: sparse TF-IDF matrix with one row per sentence
    """
    model = TfidfVectorizer(ngram_range=(1, 2),            # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # whitespace "stop words"
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # also keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,        # enable IDF weighting
                            smooth_idf=1,     # smooth IDF (avoids zero division)
                            sublinear_tf=1)   # sublinear TF: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    return matrix
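Since norm='l2' is the TfidfVectorizer default, the returned rows are unit length, so pairwise cosine similarity is just a sparse matrix product. A small sketch with made-up, whitespace-tokenized sentences:

sentences = ["我 喜欢 自然 语言 处理", "我 喜欢 机器 学习", "今天 天气 不错"]
matrix = tfidf_fit(sentences)
sim = (matrix * matrix.T).toarray()   # sim[i, j] == cosine(sentence i, sentence j)
print(sim.round(2))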
Example 11: tfidf_fit

This example is byte-for-byte identical to Example 10 (the same helper function appears in more than one file of its source project), so the code is not repeated here.
Example 12: tdidf_sim

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def tdidf_sim(sentences):  # note: "tdidf" is a typo for "tfidf" in the original source
    """
    Build a TF-IDF representation for similarity computations (jieba-tokenized).
    :param sentences: iterable of raw sentences
    :return: L2-normalized sparse TF-IDF matrix
    """
    import jieba
    from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

    model = TfidfVectorizer(tokenizer=jieba.cut,   # tokenize Chinese text with jieba
                            ngram_range=(1, 2),    # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # whitespace "stop words"
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # ignored because tokenizer is set
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,        # enable IDF weighting
                            smooth_idf=1,     # smooth IDF (avoids zero division)
                            sublinear_tf=1)   # sublinear TF: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    # re-applies IDF and L2 normalization on top of the already-normalized
    # TF-IDF output; TfidfVectorizer alone would normally suffice here
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
Example 13: test_vectorizer_stop_words_inconsistent

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def test_vectorizer_stop_words_inconsistent():
    # from the scikit-learn test suite; CountVectorizer, TfidfVectorizer,
    # HashingVectorizer and the private _check_stop_words_consistency helper
    # live in sklearn.feature_extraction.text, while assert_warns_message and
    # assert_no_warnings come from the scikit-learn testing utilities
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(),
                TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False
        # only one warning per stop-word list
        assert_no_warnings(vec.fit_transform, ['hello world'])
        assert _check_stop_words_consistency(vec) is None
        # test caching of the inconsistency assessment
        vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
Example 14: build_language_classifier

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def build_language_classifier(texts, labels, verbose=False, random_state=None):
    """Train a text classifier with scikit-learn.

    The text classifier is composed of two elements assembled in a pipeline:

    - A text feature extractor (TfidfVectorizer) that extracts the relative
      frequencies of unigrams, bigrams and trigrams of characters in the text.
    - An instance of SGDClassifier for the classification itself. To speed
      up training it is recommended to enable early stopping.

    random_state is passed to the underlying SGDClassifier instance.
    """
    language_classifier = make_pipeline(
        TfidfVectorizer(analyzer="char", ngram_range=(1, 3),
                        min_df=2, max_df=0.9, norm="l2", dtype=np.float32),
        SGDClassifier(early_stopping=True, validation_fraction=0.2,
                      n_iter_no_change=3, max_iter=1000, tol=1e-3,
                      alpha=1e-5, penalty="l2", verbose=verbose,
                      random_state=random_state)
    )
    return language_classifier.fit(texts, labels)
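A hedged usage sketch with invented toy data; real use would need far more text per language, and the internal early-stopping split requires enough samples per class:

texts = ["the quick brown fox jumps", "a lazy dog sleeps here",
         "le renard brun saute", "un chien paresseux dort",
         "der schnelle braune fuchs", "ein fauler hund schlaeft"] * 5
labels = ["en", "en", "fr", "fr", "de", "de"] * 5
clf = build_language_classifier(texts, labels, random_state=0)
print(clf.predict(["the dog jumps", "le chien dort"]))  # expected: ['en' 'fr']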
Example 15: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
def __init__(self, language=None):
    """Initialize the stoplist builder, with an option for language-specific parameters.

    :type language: str
    :param language: language of the texts for which the stoplist will be built
    """
    if language:
        self.language = language.lower()
    self.numpy_installed = True  # Write utility for common import traps?
    self.sklearn_installed = True

    try:
        import numpy as np
        self.np = np
    except ImportError:
        self.numpy_installed = False

    try:
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        # self.vectorizer = CountVectorizer(input='content')  # Set df?
        # self.tfidf_vectorizer = TfidfVectorizer()
    except ImportError:
        self.sklearn_installed = False
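The guarded imports above only set capability flags; nothing else in this excerpt consumes them. A hedged sketch of how a later method might honor the flag (the method and its ranking heuristic are hypothetical, not part of the original class):

def build_tfidf_stoplist(self, docs, size=10):
    # hypothetical method: rank terms by mean TF-IDF across documents
    if not self.sklearn_installed:
        raise ImportError("scikit-learn is required for TF-IDF-based stoplists")
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(input='content')
    matrix = vectorizer.fit_transform(docs)
    mean_scores = matrix.mean(axis=0).A1           # average TF-IDF per term
    terms = vectorizer.get_feature_names_out()     # scikit-learn >= 1.0
    ranked = sorted(zip(mean_scores, terms), reverse=True)
    return [term for _, term in ranked[:size]]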