本文整理汇总了Python中sklearn.feature_extraction.DictVectorizer类的典型用法代码示例。如果您正苦于以下问题：Python feature_extraction.DictVectorizer类的具体用法？Python feature_extraction.DictVectorizer怎么用？Python feature_extraction.DictVectorizer使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sklearn.feature_extraction的用法示例。
在下文中一共展示了feature_extraction.DictVectorizer类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self, estimator, dtype=float, sparse=True):
    """
    Store the estimator and create the label encoder and feature
    vectorizer that accompany it.

    :param estimator: scikit-learn classifier object; kept as ``_clf``.
    :param dtype: data type passed to the ``DictVectorizer`` that builds
        the feature array. scikit-learn estimators work exclusively on
        numeric data, so the default should be fine for almost all
        situations.
    :param sparse: Whether the ``DictVectorizer`` uses sparse matrices.
        The estimator must support these; not all scikit-learn
        classifiers do (see their respective documentation and look for
        "sparse matrix"). Defaults to True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    # Helper objects first, then the wrapped estimator itself.
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(sparse=sparse, dtype=dtype)
    self._clf = estimator
示例2: __init__
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self):
    """
    Create the text vectorizer, a pre-populated DictVectorizer and the
    Drugbank helper.

    The DictVectorizer is not fitted here; its ``feature_names_`` and
    ``vocabulary_`` attributes are assigned by hand.
    """
    self.vectorizer = HashingVectorizer(ngram_range=(1, 2))

    # These are set dynamically in training but fixed here to match the
    # end feature names in the trained model. If the model is retrained
    # then these may have to change.
    quintile_features = [
        'DocumentPositionQuintile0',
        'DocumentPositionQuintile1',
        'DocumentPositionQuintile2',
        'DocumentPositionQuintile3',
        'DocumentPositionQuintile4',
        'DocumentPositionQuintile5',
        'DocumentPositionQuintile6',
    ]
    dict_vectorizer = DictVectorizer()
    dict_vectorizer.feature_names_ = quintile_features
    # Map every feature name to its position in the list above.
    dict_vectorizer.vocabulary_ = dict(
        zip(quintile_features, range(len(quintile_features))))
    self.dict_vectorizer = dict_vectorizer

    self.drugbank = Drugbank()
示例3: test_dictvectorizer
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_dictvectorizer(sparse, dtype, sort, iterable):
    """Fit a DictVectorizer on three small dicts and verify the result.

    Checks the sparsity, shape and sum of the fitted matrix, that
    ``inverse_transform`` gives back the input dicts, that a second
    ``transform`` reproduces the fitted matrix, and, when ``sort`` is
    set, that ``feature_names_`` is in sorted order.

    :param sparse: forwarded to DictVectorizer; also the expected
        sparsity of the output.
    :param dtype: forwarded to DictVectorizer.
    :param sort: forwarded to DictVectorizer.
    :param iterable: when true the data is passed as an iterator
        instead of a list.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    def samples():
        # A new iterator per call: an iterator can only be consumed once.
        return iter(D) if iterable else D

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(samples())

    assert_equal(sp.issparse(X), sparse)
    assert_equal(X.shape, (3, 5))
    assert_equal(X.sum(), 14)
    assert_equal(v.inverse_transform(X), D)

    X_again = v.transform(samples())
    if sparse:
        # CSR matrices can't be compared for equality, so compare
        # their dense copies instead.
        assert_array_equal(X.toarray(), X_again.toarray())
    else:
        assert_array_equal(X, X_again)

    if sort:
        assert_equal(v.feature_names_, sorted(v.feature_names_))
示例4: test_unseen_or_no_features
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_unseen_or_no_features():
    """Transforming unknown or absent features must give an all-zero row.

    Runs once with sparse output and once with dense output. A dict with
    only an unseen key and an empty dict must both map to a (1, 2) zero
    matrix. Transforming an empty list is allowed to raise ValueError,
    in which case the message has to mention "empty"; not raising is
    accepted as well.
    """
    fitted_on = [{"camelot": 0, "spamalot": 1}]
    all_zero = np.zeros((1, 2))

    for sparse in (True, False):
        v = DictVectorizer(sparse=sparse).fit(fitted_on)

        for sample in ({"push the pram a lot": 2}, {}):
            X = v.transform(sample)
            assert_array_equal(X.toarray() if sparse else X, all_zero)

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
示例5: transform
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def transform(self, documents):
    """
    Yield one dictionary of text statistics per document, in advance of
    a DictVectorizer.

    Each document is a sequence of paragraphs, each paragraph a sequence
    of sentences, and each sentence an iterable of items whose first
    element is the token. The yielded dict holds the number of
    paragraphs, sentences, tokens ('words') and distinct tokens
    ('vocab').
    """
    for doc in documents:
        # Tally every token of the document, sentence by sentence.
        token_counts = Counter()
        for para in doc:
            for sent in para:
                token_counts.update(item[0] for item in sent)

        # Structured summary of the document.
        yield dict(
            paragraphs=len(doc),
            sentences=sum(map(len, doc)),
            words=sum(token_counts.values()),
            vocab=len(token_counts),
        )
##########################################################################
## Model Building Functions
##########################################################################
示例6: _consolidate_pipeline
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    """Restrict the pipeline's vectorizer and rebuild the pipeline.

    If the pipeline has a 'feature_selection' step, its ``support_mask``
    is passed to ``restrict`` on the 'dv' step. A KeyError raised while
    doing so is ignored. The pipeline is then handed to
    ``self._construct_pipeline`` together with ``final_model`` and the
    result of that call is returned.
    """
    # Have the DictVectorizer / DataFrameVectorizer output only the items
    # that passed the support mask. This speeds up computation, reduces
    # memory usage, folds several transforms into one step, and shrinks
    # dv.vocabulary_, which can get quite large.
    try:
        steps = transformation_pipeline.named_steps
        support_mask = steps['feature_selection'].support_mask
        steps['dv'].restrict(support_mask)
    except KeyError:
        pass

    # _construct_pipeline is overloaded: it creates a new pipeline from
    # scratch at the start of training, and it also walks a trained
    # pipeline in exactly the same order to take the dedicated
    # FeatureSelection model out. That keeps the ordering logic in a
    # single place.
    return self._construct_pipeline(
        trained_pipeline=transformation_pipeline,
        final_model=final_model,
    )
示例7: __init__
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Sets up the extractor and calls ``self.start()``.

    :param language: The language of the sentences that will be used;
     also passed to the POS tagger and the stop-word list.
    :param window_width: how many tokens to look before and after each
     token when building its features.
    :param collapse_fes: Whether to collapse FEs to a single token
     or to keep them split.
    :param target_size: when truthy, a ``TruncatedSVD(target_size)`` is
     created as ``self.reducer``; otherwise ``self.reducer`` is None.
    """
    # Plain settings.
    self.language = language
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.target_size = target_size
    self.unk_feature = 'UNK'

    # Helper objects.
    self.tagger = TTPosTagger(language)
    self.vectorizer = DictVectorizer()
    if target_size:
        self.reducer = TruncatedSVD(target_size)
    else:
        self.reducer = None

    # Containers that start out empty.
    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}

    # Lower-cased stop words for the chosen language.
    self.stopwords = {word.lower() for word in StopWords().words(language)}

    self.start()
示例8: test_model_dict_vectorizer
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_model_dict_vectorizer(self):
    """Convert a fitted DictVectorizer with string keys and dump it.

    The vectorizer is fitted on two small dicts, converted with
    ``convert_sklearn`` using a string-to-float dictionary input type,
    checked to be non-None, and handed to ``dump_data_and_model``.
    """
    samples = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
    vectorizer = DictVectorizer()
    vectorizer.fit_transform(samples)

    input_type = DictionaryType(StringTensorType([1]), FloatTensorType([1]))
    onnx_model = convert_sklearn(
        vectorizer, "dictionary vectorizer", [("input", input_type)])
    self.assertIsNotNone(onnx_model)

    # Version condition passed through as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.1.3') or "
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.3.0')")
    dump_data_and_model(
        samples, vectorizer, onnx_model,
        basename="SklearnDictVectorizer-OneOff-SkipDim1",
        allow_failure=failure_condition)
示例9: test_model_dict_vectorizer_sort_false
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_model_dict_vectorizer_sort_false(self):
    """Convert a dense, unsorted DictVectorizer with integer keys.

    The vectorizer is created with ``sparse=False, sort=False``, fitted
    on two small dicts, converted with ``convert_sklearn`` using an
    int64-to-float dictionary input type, checked to be non-None, and
    handed to ``dump_data_and_model``.
    """
    samples = [{1: 1.0, 2: 200.0}, {1: 3.0, 3: 1.0}]
    vectorizer = DictVectorizer(sparse=False, sort=False)
    vectorizer.fit_transform(samples)

    input_type = DictionaryType(Int64TensorType([1]), FloatTensorType([1]))
    onnx_model = convert_sklearn(
        vectorizer, "dictionary vectorizer", [("input", input_type)])
    self.assertIsNotNone(onnx_model)

    # Version condition passed through as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.1.3') or "
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.3.0')")
    dump_data_and_model(
        samples, vectorizer, onnx_model,
        basename="SklearnDictVectorizerSortFalse-OneOff-SkipDim1",
        allow_failure=failure_condition)
示例10: test_objectmapper
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_objectmapper(self):
    """Check that the ModelFrame accessor exposes the same objects as ``fe``.

    Every attribute reached through ``df.feature_extraction`` (top
    level, ``image`` and ``text``) must be the identical object found at
    the same path under ``fe``.
    """
    df = pdml.ModelFrame([])

    top_level_names = ('DictVectorizer', 'FeatureHasher')
    image_names = (
        'img_to_graph',
        'grid_to_graph',
        'extract_patches_2d',
        'reconstruct_from_patches_2d',
        'PatchExtractor',
    )
    text_names = (
        'CountVectorizer',
        'HashingVectorizer',
        'TfidfTransformer',
        'TfidfVectorizer',
    )

    for name in top_level_names:
        self.assertIs(getattr(df.feature_extraction, name),
                      getattr(fe, name))
    for name in image_names:
        self.assertIs(getattr(df.feature_extraction.image, name),
                      getattr(fe.image, name))
    for name in text_names:
        self.assertIs(getattr(df.feature_extraction.text, name),
                      getattr(fe.text, name))
示例11: get_feature_transformer
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Builds a FeatureUnion out of a TF-IDF pipeline and/or a grammar-count pipeline.
    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object, handed to PreTokenizer and GrammarTransformer
        run_grammar: include the grammar-count pipeline
        run_tfidf: include the TF-IDF pipeline
    returns:
        feature transformer: FeatureUnion

    Both pipelines are always constructed; the flags only decide which ones
    go into the union. When run_grammar is false the union holds only the
    TF-IDF pipeline, whatever the value of run_tfidf.
    '''
    tfidf_steps = [
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(max_features=3000, decode_error='replace')),
        ('clf', None),
    ]
    tfidf = Pipeline(tfidf_steps)

    grammar_steps = [
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None),
    ]
    grammar_counter = Pipeline(grammar_steps)

    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        branches = [("tfidf", tfidf), ('grammar_counter', grammar_counter)]
    elif not run_grammar:
        print('Running only TFIDF.')
        branches = [("tfidf", tfidf)]
    else:
        # Only reachable with run_grammar set and run_tfidf unset.
        print('Running only PCFGs.')
        branches = [('grammar_counter', grammar_counter)]

    return FeatureUnion(branches)
示例12: test_feature_selection
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_feature_selection():
    """Restricting a DictVectorizer to the chi2-selected features.

    Two feature dicts share twenty identical "useless" entries and
    differ only in two "useful" ones. After selecting the best two
    features and calling ``restrict``, only the useful names may remain.
    The check runs with the support given as indices and as a mask.
    """
    # Entries that are the same in both samples, hence useless for chi2.
    useless = [("useless%d" % i, 10) for i in range(20)]
    d1 = dict(useless, useful1=1, useful2=20)
    d2 = dict(useless, useful1=20, useful2=1)
    samples = [d1, d2]

    for indices in (True, False):
        v = DictVectorizer().fit(samples)
        X = v.transform(samples)
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = sel.get_support(indices=indices)
        v.restrict(support, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
示例13: test_one_of_k
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_one_of_k():
    """String-valued features must show up as "name=value" columns.

    Three dicts are vectorized; the result must have five columns, the
    first row must invert to ``{"version=1": 1, "ham": 2}``, and the
    feature names must contain "version=2" but not plain "version".
    """
    samples = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]

    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(samples)
    assert_equal(matrix.shape, (3, 5))

    first_row = vectorizer.inverse_transform(matrix)[0]
    assert_equal(first_row, {"version=1": 1, "ham": 2})

    feature_names = vectorizer.get_feature_names()
    assert "version=2" in feature_names
    assert "version" not in feature_names
示例14: test_deterministic_vocabulary
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_deterministic_vocabulary():
    """The fitted vocabulary must not depend on dict insertion order.

    Two equal dictionaries are built from the same 1000 items, one in
    sorted order and one after a seeded shuffle; fitting on either must
    give the same ``vocabulary_``.
    """
    # Equal dictionaries with different memory layouts.
    pairs = [("%03d" % i, i) for i in range(1000)]
    in_order = dict(pairs)
    Random(42).shuffle(pairs)
    shuffled = dict(pairs)

    # The memory layout must not impact the resulting vocabulary.
    vocab_in_order = DictVectorizer().fit([in_order]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([shuffled]).vocabulary_
    assert_equal(vocab_in_order, vocab_shuffled)
示例15: funcs_to_sparse
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def funcs_to_sparse(func_list):
    """Fit a new default DictVectorizer on ``func_list`` and vectorize it.

    Returns a ``(vectorizer, matrix)`` tuple: the fitted DictVectorizer
    and the result of its ``fit_transform`` call.
    """
    vectorizer = DictVectorizer()
    return vectorizer, vectorizer.fit_transform(func_list)