This article collects typical usage examples of the Python method sklearn.feature_extraction.text.CountVectorizer. If you are wondering how exactly to use text.CountVectorizer, what it is for, or what it looks like in practice, the curated code examples below may help. You can also explore further usage examples from the module it belongs to, sklearn.feature_extraction.text.
The following shows 15 code examples of text.CountVectorizer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
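Before the examples, here is a minimal, self-contained sketch (using only the public scikit-learn API; the sample texts are made up) of what CountVectorizer does: it tokenizes raw text and produces a sparse document-term count matrix.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat"]  # hypothetical toy corpus
vec = CountVectorizer()
X = vec.fit_transform(docs)      # sparse matrix of shape (n_docs, n_terms)
print(vec.get_feature_names())   # vocabulary terms; use get_feature_names_out() on scikit-learn >= 1.0
print(X.toarray())               # per-document term counts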
Example 1: test_validate_sklearn_sgd_with_text_cv
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_validate_sklearn_sgd_with_text_cv(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_CountVec_.pmml'
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example 2: read_relations
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def read_relations(file_name):
    bow = []
    count_vec = CountVectorizer()
    d = {}
    file = open(file_name)
    for line in file:
        index, name = line.strip().split('\t')
        d[name] = int(index)
        if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
            tokens = re.findall('[a-z]{2,}', name)
            bow.append(' '.join(tokens))
    file.close()
    if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
        bow = count_vec.fit_transform(bow)
        np.save('../data/' + args.dataset + '/bow.npy', bow.toarray())
    return d
Example 3: test_from_texts
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_from_texts():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels)
    assert_equal(actual.shape[1], 4)
    assert_equal(actual.shape[0], 9)
    assert_equal(cal.index2word_, {0: u'information',
                                   1: u'language',
                                   2: u'learning',
                                   3: u'machine',
                                   4: u'mining',
                                   5: u'natural',
                                   6: u'processing',
                                   7: u'retrieval',
                                   8: u'text'})
    assert_equal(cal.index2label_, {0: 'information retrieval'.split(),
                                    1: 'machine learning'.split(),
                                    2: 'natural language processing'.split(),
                                    3: 'text mining'.split()})
Example 4: _te_ss_t_build
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def _te_ss_t_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
Example 5: test_build
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_build(self):
    from sklearn.feature_extraction.text import CountVectorizer
    categories, docs = get_docs_categories_semiotic()
    idx_store = IndexStore()
    y = np.array([idx_store.getidx(c) for c in categories])
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(docs)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=y,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=idx_store.values()).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['that', 'march', 'did', 'majesty', 'sometimes'])
Example 6: __init__
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(
    self,
    anonymize=True,
    trim_window=5,
    lowercase=True,
    drop_stopwords=True,
    stem=True,
    ngram_range=(1, 3),
    **vectorizer_kwargs,
):
    self.anonymize = anonymize
    self.lowercase = lowercase
    self.drop_stopwords = drop_stopwords
    if drop_stopwords:
        nltk.download("stopwords")
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
    self.trim_window = trim_window
    self.stem = stem
    if stem:
        self.porter = nltk.PorterStemmer()
    self.vectorizer = CountVectorizer(
        ngram_range=ngram_range, binary=True, **vectorizer_kwargs
    )
Example 7: _n_grams
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def _n_grams():
    """
    Calculates various statistical features over the 1-, 2- and 3-grams of the suffix- and dot-free domain
    :return:
    """
    global __unigram
    feature = []
    for i in range(1, 4):
        ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(i, i))
        counts = ngram_vectorizer.build_analyzer()(__joined_dot_split_suffix_free)
        npa = numpy.array(list(Counter(counts).values()), dtype=int)
        if i == 1:
            __unigram = npa
        feature += __stats_over_n_grams(npa)
    return feature
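As a rough, self-contained illustration of the pattern used above (not part of the original project), the callable returned by build_analyzer() turns a string into character n-grams, which are then aggregated with Counter; a toy run on a hypothetical domain string might look like this:

from collections import Counter
import numpy
from sklearn.feature_extraction.text import CountVectorizer

domain = "examplemail"  # hypothetical suffix- and dot-free domain string
vec = CountVectorizer(analyzer='char', ngram_range=(2, 2))
bigrams = vec.build_analyzer()(domain)          # e.g. ['ex', 'xa', 'am', 'mp', ...]
npa = numpy.array(list(Counter(bigrams).values()), dtype=int)
print(npa.mean(), npa.std(), npa.max())         # the kind of statistics __stats_over_n_grams could aggregate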
Example 8: __init__
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(self, language=None):
    """ Initialize stoplist builder with option for language-specific parameters
    :type language: str
    :param language: language for which to build the stoplist
    """
    if language:
        self.language = language.lower()
    self.numpy_installed = True  # Write utility for common import traps?
    self.sklearn_installed = True
    try:
        import numpy as np
        self.np = np
    except ImportError:
        self.numpy_installed = False
    try:
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        # self.vectorizer = CountVectorizer(input='content')  # Set df?
        # self.tfidf_vectorizer = TfidfVectorizer()
    except ImportError:
        self.sklearn_installed = False
Example 9: tf_word_feature
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def tf_word_feature(self, data_set):
    """
    Get TF feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    feature_names = self.vectorizer.get_feature_names()
    logger.info('feature_names:%s' % feature_names[:20])
    logger.info(data_feature.shape)
    if not self.is_infer:
        save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
Example 10: test_04_lgbm_regressor
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def test_04_lgbm_regressor(self):
    print("\ntest 04 (lgbm regressor with preprocessing)\n")
    auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
    X = auto.drop(['mpg'], axis=1)
    y = auto['mpg']
    feature_names = [name for name in auto.columns if name not in ('mpg',)]
    target_name = 'mpg'
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
    pd.DataFrame(data=x_test, columns=feature_names).to_csv("test.csv", index=False)
    pipeline_obj = Pipeline([
        ('mapper', DataFrameMapper([
            ('car name', CountVectorizer()),
            (['displacement'], [StandardScaler()])
        ])),
        ('lgbmr', LGBMRegressor())
    ])
    pipeline_obj.fit(x_train, y_train)
    file_name = "test04lgbm.pmml"
    lgb_to_pmml(pipeline_obj, feature_names, 'mpg', file_name)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    predictions, _ = self.adapa_utility.score_in_zserver(model_name, "test.csv")
    predictions = numpy.array(predictions)
    model_pred = pipeline_obj.predict(x_test)
    self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
Example 11: word2vec
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    # stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)
    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    learning_method='batch',  # with a modest corpus used only for fitting, 'batch' works well and leaves fewer parameters to tune
                                    )
    # fit the model with variational Bayes
    lda.fit(tf)
    # feature names, used to list the keywords of each topic in turn
    tf_feature_names = tf_vectorizer.get_feature_names()
    return lda, tf, tf_feature_names, tf_vectorizer
# present the topics as a visualization
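The function returns the fitted LDA model and the vocabulary but does not itself print the per-topic keyword lists; a small follow-up sketch (relying only on the standard lda.components_ attribute; the helper name is made up) could list the top words per topic:

def print_top_words(lda, tf_feature_names, n_top_words=10):
    # lda.components_ has shape (n_topics, n_features); larger values mean a stronger word-topic association
    for topic_idx, topic in enumerate(lda.components_):
        top = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(top)))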
Example 12: build_vectorization_pipeline
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """
    Build SKLearn vectorization pipeline for this field.
    This is used in field-based machine learning when we calculate the value of one field based on the
    values of other fields of this document.
    We are able to detect only choice fields this way at the moment.
    To reach this we need to build a feature vector of all dependencies of the field being detected.
    This feature vector is built as a union of the feature vectors of each dependency.
    See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)
    :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                       2. List of str feature names or a function returning list of str feature names.
    """
    vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                           stop_words=self._build_stop_words())
    return [('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', vect),
            ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
Example 13: __init__
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def __init__(self):
    """Initializes the Encoder object and sets the internal tokenizer,
    labelEncoder and vectorizer using predefined objects.
    """
    self.tokenizer = BOWTokenizer(
        English()
    )  # the tokenizer must have a tokenize() and parse() function.
    self.labelEncoder = LabelEncoder()
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizer.tokenize, ngram_range=(1, 1)
    )
    self.decode_params = {}
    # The keep_ids flag is used by explain local in the explainer to decode
    # importances over raw features.
Example 14: create_random_forest_vectorizer
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def create_random_forest_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)])
Example 15: create_logistic_vectorizer
# Module import required: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import CountVectorizer [as alias]
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)])
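As a usage sketch (the texts and labels below are made up), either factory returns an ordinary scikit-learn Pipeline that can be fit directly on raw strings and then used for prediction:

texts = ["good movie", "bad movie", "great plot", "terrible acting"]  # hypothetical training data
labels = [1, 0, 1, 0]
clf = create_logistic_vectorizer()
clf.fit(texts, labels)
print(clf.predict(["good plot"]))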