本文整理汇总了 Python 中 sklearn.feature_extraction.FeatureHasher 类的典型用法代码示例。如果您正苦于以下问题:Python feature_extraction.FeatureHasher 类的具体用法?Python feature_extraction.FeatureHasher 怎么用?Python feature_extraction.FeatureHasher 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 sklearn.feature_extraction 的用法示例。
在下文中一共展示了 feature_extraction.FeatureHasher 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_feature_hasher_strings
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_strings():
    """Hash mixed str/bytes tokens and check shape, row sums and sparsity.

    The first sample contains "foo" twice (once as ``str``, once as
    ``bytes``); the assertions expect it to be counted as a duplicate, so
    row 0 sums to 4 while the whole matrix stores only 6 non-zero entries.
    """
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]
    for exponent in (7, 9, 11, 16, 22):
        width = 2 ** exponent
        # Hand transform() a generator rather than the list itself.
        stream = (row for row in samples)
        hasher = FeatureHasher(width, input_type="string",
                               alternate_sign=False)
        hashed = hasher.transform(stream)

        n_rows, n_cols = hashed.shape[0], hashed.shape[1]
        assert_equal(n_rows, len(samples))
        assert_equal(n_cols, width)
        assert_equal(hashed[0].sum(), 4)
        assert_equal(hashed[1].sum(), 3)
        assert_equal(hashed.nnz, 6)
示例2: test_feature_hasher_pairs_with_string_values
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs_with_string_values():
    """Check ``input_type="pair"`` when some feature values are strings.

    The assertions expect every string-valued pair to show up with
    magnitude 1, and two identical samples to hash to identical rows.
    """
    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed = [{"foo": 1, "bar": "a"},
             {"baz": "abc", "quux": 4, "foo": -1}]
    row_a, row_b = hasher.transform(iter(d.items()) for d in mixed).toarray()
    # Signs depend on the hash, so compare sorted absolute values only.
    assert_equal([1, 1], sorted(np.abs(row_a[row_a != 0])))
    assert_equal([1, 1, 4], sorted(np.abs(row_b[row_b != 0])))

    repeated = [{"bax": "abc"},
                {"bax": "abc"}]
    row_a, row_b = hasher.transform(
        iter(d.items()) for d in repeated).toarray()
    assert_equal([1], np.abs(row_a[row_a != 0]))
    assert_equal([1], np.abs(row_b[row_b != 0]))
    assert_array_equal(row_a, row_b)
示例3: imports_features
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def imports_features(self, lief_binary):
    """Return hashed import-table features of ``lief_binary`` as a dict.

    Two hashed vectors are produced and flattened into one mapping:

    * ``Imports_libraries_hash_<i>`` -- 256 buckets over the lower-cased,
      de-duplicated, sorted library names.
    * ``Imports_entries_hash_<i>`` -- 1024 buckets over sorted
      ``"<library>:<entry>"`` strings, where an entry is either its name
      (truncated to 10000 characters) or ``"ordinal<N>"`` for ordinal
      imports.

    ``lief_binary`` is only required to expose ``imports``, each item having
    ``name`` and ``entries``; each entry is read through ``is_ordinal``,
    ``ordinal`` and ``name``.
    """
    from sklearn.feature_extraction import FeatureHasher

    # Group the imported symbols under the library that provides them.
    by_library = {}
    for library in lief_binary.imports:
        symbols = by_library.setdefault(library.name, [])
        for entry in library.entries:
            if entry.is_ordinal:
                symbols.append("ordinal" + str(entry.ordinal))
            else:
                symbols.append(entry.name[:10000])

    features_hashed = {}

    library_names = sorted({name.lower() for name in by_library})
    library_hasher = FeatureHasher(256, input_type='string')
    library_vector = library_hasher.transform([library_names]).toarray()[0]
    for i, x in enumerate(library_vector):
        features_hashed[f'Imports_libraries_hash_{i}'] = x

    qualified_entries = sorted(
        library.lower() + ':' + symbol
        for library, symbols in by_library.items()
        for symbol in symbols
    )
    entry_hasher = FeatureHasher(1024, input_type='string')
    entry_vector = entry_hasher.transform([qualified_entries]).toarray()[0]
    for i, x in enumerate(entry_vector):
        features_hashed[f'Imports_entries_hash_{i}'] = x

    return features_hashed
示例4: main
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def main(neg_rate, submission_num, n_iter, train_path):
    """Train a hashed-feature SGD classifier and write a submission file.

    Parameters
    ----------
    neg_rate : passed to ``ml.PartialFitter`` and used afterwards to shift
        the fitted intercept by ``log(neg_rate)``.
    submission_num : forwarded to ``pp.write_submission`` as ``number``.
    n_iter : forwarded to ``ml.PartialFitter``.
    train_path : location of the training data read through ``pp``.

    NOTE(review): ``pp`` and ``ml`` are project modules not visible here;
    the comments below describe only what this function passes to them.
    """
    test_path = 'original_data/test'

    # Row ids of the test set and click labels of the training set.
    ids = list(pp.get_int_field('id', test_path))
    clicks = pp.get_int_field('click', train_path)

    # Row generators for both data sets.
    train = pp.data_generator(pp.clean_parse_row, train_path)
    test = pp.data_generator(pp.clean_parse_row, test_path)

    # Estimators: hash (name, value) pairs into 2**20 columns, then SGD.
    hasher = FeatureHasher(n_features=2 ** 20, input_type='pair')
    classifier = SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2')

    pipeline = ml.PartialFitter(
        [hasher, classifier],
        batch_size=10000,
        logging=True,
        n_iter=n_iter,
        neg_rate=neg_rate,
    )
    pipeline.partial_fit(X=train, y=clicks)

    # Intercept correction -- presumably compensating for negative
    # down-sampling done inside PartialFitter; confirm against that class.
    final_step = pipeline.steps[-1]
    final_step.intercept_[0] += np.log(neg_rate)

    # Column 1 of predict_proba is written out as the prediction.
    preds = pipeline.predict_proba(newX=test)[:, 1]
    pp.write_submission(number=submission_num, ids=ids, preds=preds)
示例5: test_objectmapper
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_objectmapper(self):
    """Check that ``ModelFrame.feature_extraction`` exposes the ``fe`` objects.

    Every attribute reached through the accessor must be the very same
    object (identity, not equality) as the one reached through ``fe``.
    """
    df = pdml.ModelFrame([])

    for name in ("DictVectorizer", "FeatureHasher"):
        self.assertIs(getattr(df.feature_extraction, name),
                      getattr(fe, name))

    image_names = (
        "img_to_graph",
        "grid_to_graph",
        "extract_patches_2d",
        "reconstruct_from_patches_2d",
        "PatchExtractor",
    )
    for name in image_names:
        self.assertIs(getattr(df.feature_extraction.image, name),
                      getattr(fe.image, name))

    text_names = (
        "CountVectorizer",
        "HashingVectorizer",
        "TfidfTransformer",
        "TfidfVectorizer",
    )
    for name in text_names:
        self.assertIs(getattr(df.feature_extraction.text, name),
                      getattr(fe.text, name))
示例6: test_feature_hasher_strings
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_strings():
    """Hash mixed str/bytes tokens with ``non_negative=True``.

    The first sample contains "foo" twice (once as ``str``, once as
    ``bytes``); the assertions expect it to be counted as a duplicate, so
    row 0 sums to 4 while the whole matrix stores only 6 non-zero entries,
    all of them strictly positive.
    """
    samples = [
        ["foo", "bar", "baz", "foo".encode("ascii")],
        ["bar".encode("ascii"), "baz", "quux"],
    ]
    for exponent in (7, 9, 11, 16, 22):
        width = 2 ** exponent
        # Hand transform() a generator rather than the list itself.
        stream = (row for row in samples)
        hasher = FeatureHasher(width, non_negative=True, input_type="string")
        hashed = hasher.transform(stream)

        n_rows, n_cols = hashed.shape[0], hashed.shape[1]
        assert_equal(n_rows, len(samples))
        assert_equal(n_cols, width)
        assert_true(np.all(hashed.data > 0))
        assert_equal(hashed[0].sum(), 4)
        assert_equal(hashed[1].sum(), 3)
        assert_equal(hashed.nnz, 6)
示例7: test_feature_hasher_pairs_with_string_values
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs_with_string_values():
    """Check ``input_type="pair"`` when some feature values are strings.

    The assertions expect every string-valued pair to show up with
    magnitude 1, and two identical samples to hash to identical rows.
    """
    hasher = FeatureHasher(n_features=16, input_type="pair")

    mixed = [{"foo": 1, "bar": "a"},
             {"baz": u"abc", "quux": 4, "foo": -1}]
    row_a, row_b = hasher.transform(iter(d.items()) for d in mixed).toarray()
    # Signs depend on the hash, so compare sorted absolute values only.
    assert_equal([1, 1], sorted(np.abs(row_a[row_a != 0])))
    assert_equal([1, 1, 4], sorted(np.abs(row_b[row_b != 0])))

    repeated = [{"bax": "abc"},
                {"bax": "abc"}]
    row_a, row_b = hasher.transform(
        iter(d.items()) for d in repeated).toarray()
    assert_equal([1], np.abs(row_a[row_a != 0]))
    assert_equal([1], np.abs(row_b[row_b != 0]))
    assert_array_equal(row_a, row_b)
示例8: test_hasher_alternate_sign
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_alternate_sign():
    """Exercise the four ``alternate_sign`` / ``non_negative`` combinations.

    A single sample of one-character tokens is hashed with each combination
    and the sign of the stored values is checked.
    """
    sample = [list("Thequickbrownfoxjumped")]

    def hash_sample(**flags):
        # All variants share the input type and the sample; only the
        # sign-related flags differ.
        return FeatureHasher(input_type='string', **flags).fit_transform(sample)

    mixed_sign = hash_sample(alternate_sign=True, non_negative=False)
    assert mixed_sign.data.min() < 0 and mixed_sign.data.max() > 0

    forced_positive = hash_sample(alternate_sign=True, non_negative=True)
    assert forced_positive.data.min() > 0

    unsigned_clipped = hash_sample(alternate_sign=False, non_negative=True)
    assert unsigned_clipped.data.min() > 0

    unsigned_raw = hash_sample(alternate_sign=False, non_negative=False)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(unsigned_clipped.data, unsigned_raw.data)
示例9: test_hash_collisions
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hash_collisions():
    """Force every token into one bucket and inspect the accumulated value.

    With ``n_features=1`` all tokens collide. The assertions expect partial
    cancellation whenever ``alternate_sign=True`` and a plain token count
    when ``alternate_sign=False``.
    """
    sample = [list("Thequickbrownfoxjumped")]
    n_tokens = len(sample[0])

    def single_bucket(**flags):
        hasher = FeatureHasher(n_features=1, input_type='string', **flags)
        return hasher.fit_transform(sample).data[0]

    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(single_bucket(alternate_sign=True, non_negative=False)) < n_tokens

    assert abs(single_bucket(alternate_sign=True, non_negative=True)) < n_tokens

    assert single_bucket(alternate_sign=False, non_negative=True) == n_tokens
示例10: __call__
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def __call__(self, binary):
    """Return a flat hashed feature vector describing ``binary``'s imports.

    The result concatenates a 256-bucket hash of the lower-cased library
    names with a 1024-bucket hash of the imported functions, and is cast to
    ``self.dtype``. ``binary`` must expose ``libraries`` and ``imports``
    (each import having ``name`` and ``entries``).
    """
    library_names = [name.lower() for name in binary.libraries]

    # One string per imported function, e.g. "kernel32.dll:CreateFileMappingA".
    qualified_imports = [
        library.name.lower() + ':' + entry.name
        for library in binary.imports
        for entry in library.entries
    ]

    # Two separate groups: libraries alone, then fully-qualified names.
    library_hasher = FeatureHasher(256, input_type="string", dtype=self.dtype)
    library_block = library_hasher.transform([library_names]).toarray()

    import_hasher = FeatureHasher(1024, input_type="string", dtype=self.dtype)
    import_block = import_hasher.transform([qualified_imports]).toarray()

    combined = np.concatenate([library_block, import_block], axis=-1)
    return combined.flatten().astype(self.dtype)
示例11: test_feature_hasher_dicts
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_dicts():
    """Check that dict input is the default and matches equivalent pair input."""
    default_hasher = FeatureHasher(n_features=16)
    assert_equal("dict", default_hasher.input_type)

    samples = [
        {"foo": "bar", "dada": 42, "tzara": 37},
        {"foo": "baz", "gaga": "string1"},
    ]
    from_dicts = FeatureHasher(n_features=16).transform(samples)

    # The same samples, presented as iterators of (key, value) pairs.
    pair_stream = (iter(sample.items()) for sample in samples)
    pair_hasher = FeatureHasher(n_features=16, input_type="pair")
    from_pairs = pair_hasher.transform(pair_stream)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
示例12: test_feature_hasher_pairs
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs():
    """Hash numeric (key, value) pairs and check the stored magnitudes."""
    samples = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    pair_stream = (iter(sample.items()) for sample in samples)

    hasher = FeatureHasher(n_features=16, input_type="pair")
    row_a, row_b = hasher.transform(pair_stream).toarray()

    # Compare sorted absolute values of the non-zero entries of each row.
    magnitudes_a = sorted(np.abs(row_a[row_a != 0]))
    magnitudes_b = sorted(np.abs(row_b[row_b != 0]))
    assert_equal([1, 2], magnitudes_a)
    assert_equal([1, 3, 4], magnitudes_b)
示例13: test_hash_empty_input
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hash_empty_input():
    """Check that empty samples hash to all-zero rows.

    Three kinds of empty iterable (list, tuple, exhausted iterator) are
    passed as samples; the result must be a zero matrix with one row per
    sample and ``n_features`` columns.
    """
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # toarray() instead of the ``.A`` shorthand: it is the explicit dense
    # conversion and does not depend on the matrix-only attribute.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))
示例14: test_hasher_set_params
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_set_params():
    """Check that an invalid ``n_features`` set via set_params fails in fit.

    Input validation is delayed until ``fit`` (useful for grid search), so
    ``set_params`` itself must accept the bad value and ``fit`` must raise
    ``TypeError``.
    """
    # set_params returns the estimator, so it can be chained on construction.
    misconfigured = FeatureHasher().set_params(n_features=np.inf)
    assert_raises(TypeError, misconfigured.fit)
示例15: test_hasher_zeros
# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_zeros():
    """Check that a zero-valued feature leaves no stored entry in the output."""
    zero_sample = [{'foo': 0}]
    hashed = FeatureHasher().transform(zero_sample)
    # No explicit zeros may be materialized in the sparse data array.
    assert_equal(hashed.data.shape, (0,))