当前位置: 首页>>代码示例>>Python>>正文


Python feature_extraction.FeatureHasher类代码示例

本文整理汇总了Python中sklearn.feature_extraction.FeatureHasher类的典型用法代码示例。如果您正苦于以下问题:Python feature_extraction.FeatureHasher类的具体用法?Python feature_extraction.FeatureHasher怎么用?Python feature_extraction.FeatureHasher使用的例子?那么, 这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sklearn.feature_extraction的用法示例。


在下文中一共展示了feature_extraction.FeatureHasher类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_feature_hasher_strings

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_strings():
    """Hash rows of mixed text/bytes tokens and check shape and totals.

    The hasher is built with ``alternate_sign=False``; the test expects
    each row to sum to its token count and the matrix to hold six
    non-zero entries, for several power-of-two feature counts.
    """
    # Row 0 contains "foo" twice: once as text and once as ASCII bytes.
    rows = [["foo", "bar", "baz", "foo".encode("ascii")],
            ["bar".encode("ascii"), "baz", "quux"]]

    for exponent in (7, 9, 11, 16, 22):
        n_features = 2 ** exponent

        # Feed a one-shot generator instead of the list itself.
        row_stream = (row for row in rows)

        hasher = FeatureHasher(n_features, input_type="string",
                               alternate_sign=False)
        X = hasher.transform(row_stream)

        n_rows, n_cols = X.shape
        assert_equal(n_rows, len(rows))
        assert_equal(n_cols, n_features)

        # Row totals equal the number of tokens in each input row.
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:23,代码来源:test_feature_hasher.py

示例2: test_feature_hasher_pairs_with_string_values

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs_with_string_values():
    """Check ``input_type="pair"`` hashing when some values are strings.

    Only magnitudes are compared (via ``np.abs``), so the assertions do
    not depend on the sign of the hashed entries.
    """
    def magnitudes(row):
        # Absolute values of the non-zero entries of one dense row.
        return np.abs(row[row != 0])

    mixed_dicts = [{"foo": 1, "bar": "a"},
                   {"baz": "abc", "quux": 4, "foo": -1}]
    pair_stream = (iter(d.items()) for d in mixed_dicts)
    hasher = FeatureHasher(n_features=16, input_type="pair")
    row_a, row_b = hasher.transform(pair_stream).toarray()
    assert_equal([1, 1], sorted(magnitudes(row_a)))
    assert_equal([1, 1, 4], sorted(magnitudes(row_b)))

    # Two identical single-pair samples must hash to identical rows.
    same_dicts = [{"bax": "abc"},
                  {"bax": "abc"}]
    pair_stream = (iter(d.items()) for d in same_dicts)
    row_a, row_b = hasher.transform(pair_stream).toarray()
    assert_equal([1], magnitudes(row_a))
    assert_equal([1], magnitudes(row_b))
    assert_array_equal(row_a, row_b)
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:20,代码来源:test_feature_hasher.py

示例3: imports_features

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def imports_features(self, lief_binary):
        """Return hashed import features of ``lief_binary`` as a flat dict.

        Two fixed-width groups of keys are produced:
        ``Imports_libraries_hash_<i>`` (256 buckets, hashed lower-cased
        library names) and ``Imports_entries_hash_<i>`` (1024 buckets,
        hashed ``"library:entry"`` strings).
        """
        from sklearn.feature_extraction import FeatureHasher

        # Map each library name to the list of entries imported from it.
        entries_by_library = {}
        for lib in lief_binary.imports:
            names = entries_by_library.setdefault(lib.name, [])
            for entry in lib.entries:
                if not entry.is_ordinal:
                    # Entry names are capped at 10000 characters.
                    names.append(entry.name[:10000])
                else:
                    names.append("ordinal" + str(entry.ordinal))

        hashed = {}

        # Distinct lower-cased library names, sorted before hashing.
        library_names = sorted({name.lower() for name in entries_by_library})
        library_row = FeatureHasher(256, input_type='string').transform([library_names]).toarray()[0]
        for idx, value in enumerate(library_row):
            hashed[f'Imports_libraries_hash_{idx}'] = value

        # Fully qualified "library:entry" strings, sorted before hashing.
        qualified = sorted(
            lib.lower() + ':' + e
            for lib, elist in entries_by_library.items()
            for e in elist
        )
        entry_row = FeatureHasher(1024, input_type='string').transform([qualified]).toarray()[0]
        for idx, value in enumerate(entry_row):
            hashed[f'Imports_entries_hash_{idx}'] = value

        return hashed
开发者ID:h2oai,项目名称:driverlessai-recipes,代码行数:24,代码来源:pe_imports_features.py

示例4: main

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def main(neg_rate, submission_num, n_iter, train_path):
    """Fit a hashing + SGD pipeline on ``train_path`` and write a submission.

    Args:
        neg_rate: forwarded to ``ml.PartialFitter`` and used afterwards to
            shift the fitted intercept by ``log(neg_rate)``.
        submission_num: number passed to ``pp.write_submission``.
        n_iter: forwarded to ``ml.PartialFitter``.
        train_path: location of the training data.
    """
    test_path = 'original_data/test'

    ids = list(pp.get_int_field('id', test_path))
    clicks = pp.get_int_field('click', train_path)

    # Row generators for the training and test sets.
    train = pp.data_generator(pp.clean_parse_row, train_path)
    test = pp.data_generator(pp.clean_parse_row, test_path)

    # Pipeline stages: hash (feature, value) pairs, then a logistic SGD model.
    stages = [
        FeatureHasher(n_features=2 ** 20, input_type='pair'),
        SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2'),
    ]
    pipeline = ml.PartialFitter(
        stages,
        batch_size=10000,
        logging=True,
        n_iter=n_iter,
        neg_rate=neg_rate,
    )
    pipeline.partial_fit(X=train, y=clicks)

    # Intercept correction on the final stage.
    # NOTE(review): presumably compensates for negative down-sampling - confirm.
    classifier = pipeline.steps[-1]
    classifier.intercept_[0] += np.log(neg_rate)

    # Keep only column 1 of the predicted probabilities.
    preds = pipeline.predict_proba(newX=test)[:, 1]
    pp.write_submission(number=submission_num, ids=ids, preds=preds)
开发者ID:mkneierV,项目名称:kaggle_avazu_benchmark,代码行数:25,代码来源:run_model.py

示例5: test_objectmapper

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_objectmapper(self):
        """Check that ``ModelFrame.feature_extraction`` exposes ``fe`` objects.

        Each dotted attribute path below is resolved on both the frame
        accessor and the ``fe`` module; the two results must be the very
        same object.
        """
        df = pdml.ModelFrame([])

        def lookup(root, dotted_path):
            # Follow a dotted attribute path starting from ``root``.
            target = root
            for attr in dotted_path.split('.'):
                target = getattr(target, attr)
            return target

        exposed_paths = (
            'DictVectorizer',
            'FeatureHasher',
            'image.img_to_graph',
            'image.grid_to_graph',
            'image.extract_patches_2d',
            'image.reconstruct_from_patches_2d',
            'image.PatchExtractor',
            'text.CountVectorizer',
            'text.HashingVectorizer',
            'text.TfidfTransformer',
            'text.TfidfVectorizer',
        )
        for dotted_path in exposed_paths:
            self.assertIs(lookup(df.feature_extraction, dotted_path),
                          lookup(fe, dotted_path))
开发者ID:pandas-ml,项目名称:pandas-ml,代码行数:18,代码来源:test_feature_extraction.py

示例6: test_feature_hasher_strings

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_strings():
    """Hash rows of mixed text/bytes tokens and check shape and totals.

    The hasher is configured with ``alternate_sign=False`` so that all
    stored values are positive. This replaces the former
    ``non_negative=True`` argument, which scikit-learn deprecated and
    later removed.
    """
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)                 # iterable

        h = FeatureHasher(n_features, input_type="string",
                          alternate_sign=False)
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        # Plain assert: no dependency on the removed ``assert_true`` helper.
        assert np.all(X.data > 0)
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:23,代码来源:test_feature_hasher.py

示例7: test_feature_hasher_pairs_with_string_values

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs_with_string_values():
    """Check ``input_type="pair"`` hashing when some values are strings.

    Assertions look at magnitudes only (``np.abs``), so they do not
    depend on the sign given to each hashed entry.
    """
    def magnitudes(row):
        # Absolute values of the non-zero entries of one dense row.
        return np.abs(row[row != 0])

    mixed_dicts = [{"foo": 1, "bar": "a"},
                   {"baz": u"abc", "quux": 4, "foo": -1}]
    pair_stream = (iter(d.items()) for d in mixed_dicts)
    hasher = FeatureHasher(n_features=16, input_type="pair")
    first, second = hasher.transform(pair_stream).toarray()
    assert_equal([1, 1], sorted(magnitudes(first)))
    assert_equal([1, 1, 4], sorted(magnitudes(second)))

    # Identical single-pair samples must produce identical rows.
    same_dicts = [{"bax": "abc"},
                  {"bax": "abc"}]
    pair_stream = (iter(d.items()) for d in same_dicts)
    first, second = hasher.transform(pair_stream).toarray()
    assert_equal([1], magnitudes(first))
    assert_equal([1], magnitudes(second))
    assert_array_equal(first, second)
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:20,代码来源:test_feature_hasher.py

示例8: test_hasher_alternate_sign

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_alternate_sign():
    """Exercise the ``alternate_sign`` / ``non_negative`` combinations.

    The sample is a single row made of the individual characters of
    "Thequickbrownfoxjumped".
    """
    sample = [list("Thequickbrownfoxjumped")]

    def hashed(alternate_sign, non_negative):
        # Hash ``sample`` with the given sign-handling flags.
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               input_type='string')
        return hasher.fit_transform(sample)

    # Alternating signs without clipping: both signs must appear.
    signed = hashed(True, False)
    assert signed.data.min() < 0 and signed.data.max() > 0

    # non_negative=True must leave only positive values.
    clipped = hashed(True, True)
    assert clipped.data.min() > 0

    unsigned_clipped = hashed(False, True)
    assert unsigned_clipped.data.min() > 0
    unsigned_raw = hashed(False, False)
    # With initially positive features, the non_negative option should
    # have no impact when alternate_sign=False
    assert_array_equal(unsigned_clipped.data, unsigned_raw.data)
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:21,代码来源:test_feature_hasher.py

示例9: test_hash_collisions

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hash_collisions():
    """Force every token into one bucket and inspect the accumulated value.

    With ``n_features=1`` all tokens collide, so the single stored value
    shows whether contributions cancelled or simply added up.
    """
    sample = [list("Thequickbrownfoxjumped")]
    n_tokens = len(sample[0])

    def single_bucket(alternate_sign, non_negative):
        # Hash ``sample`` into one feature with the given flags.
        hasher = FeatureHasher(alternate_sign=alternate_sign,
                               non_negative=non_negative,
                               n_features=1, input_type='string')
        return hasher.fit_transform(sample)

    # Some tokens get the opposite sign and cancel out, so the magnitude
    # stays below the token count.
    collided = single_bucket(True, False)
    assert abs(collided.data[0]) < n_tokens

    collided = single_bucket(True, True)
    assert abs(collided.data[0]) < n_tokens

    # Without alternating signs every token adds up.
    collided = single_bucket(False, True)
    assert collided.data[0] == n_tokens
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:18,代码来源:test_feature_hasher.py

示例10: __call__

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def __call__(self, binary):
        """Return one flat vector of hashed import information for ``binary``.

        The result concatenates a 256-bucket hash of the lower-cased
        library names with a 1024-bucket hash of the fully qualified
        imported function names, cast to ``self.dtype``.
        """
        library_names = [name.lower() for name in binary.libraries]

        # One string such as "kernel32.dll:CreateFileMappingA" per entry.
        qualified_imports = []
        for lib in binary.imports:
            for entry in lib.entries:
                qualified_imports.append(lib.name.lower() + ':' + entry.name)

        # Two separate groups: libraries alone, then qualified imports.
        library_hasher = FeatureHasher(256, input_type="string", dtype=self.dtype)
        hashed_libraries = library_hasher.transform([library_names]).toarray()
        import_hasher = FeatureHasher(1024, input_type="string", dtype=self.dtype)
        hashed_imports = import_hasher.transform([qualified_imports]).toarray()

        combined = np.concatenate([hashed_libraries, hashed_imports], axis=-1)
        return combined.flatten().astype(self.dtype)
开发者ID:endgameinc,项目名称:gym-malware,代码行数:15,代码来源:pefeatures.py

示例11: test_feature_hasher_dicts

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_dicts():
    """Check that dict input and equivalent pair input hash identically.

    Also checks that a hasher built without ``input_type`` reports
    ``"dict"``.
    """
    default_hasher = FeatureHasher(n_features=16)
    assert_equal("dict", default_hasher.input_type)

    samples = [{"foo": "bar", "dada": 42, "tzara": 37},
               {"foo": "baz", "gaga": "string1"}]

    from_dicts = FeatureHasher(n_features=16).transform(samples)

    # The same data presented as iterators over (key, value) pairs.
    pair_stream = (iter(d.items()) for d in samples)
    pair_hasher = FeatureHasher(n_features=16, input_type="pair")
    from_pairs = pair_hasher.transform(pair_stream)

    assert_array_equal(from_dicts.toarray(), from_pairs.toarray())
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:12,代码来源:test_feature_hasher.py

示例12: test_feature_hasher_pairs

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_feature_hasher_pairs():
    """Check ``input_type="pair"`` hashing of numeric (key, value) pairs.

    Only magnitudes are compared (``np.abs``), so the assertions do not
    depend on the sign given to each hashed entry.
    """
    samples = [{"foo": 1, "bar": 2},
               {"baz": 3, "quux": 4, "foo": -1}]
    pair_stream = (iter(d.items()) for d in samples)

    hasher = FeatureHasher(n_features=16, input_type="pair")
    first, second = hasher.transform(pair_stream).toarray()

    first_magnitudes = sorted(np.abs(first[first != 0]))
    second_magnitudes = sorted(np.abs(second[second != 0]))
    assert_equal([1, 2], first_magnitudes)
    assert_equal([1, 3, 4], second_magnitudes)
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:11,代码来源:test_feature_hasher.py

示例13: test_hash_empty_input

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hash_empty_input():
    """Check that empty samples hash to all-zero rows.

    The three samples are an empty list, an empty tuple and an exhausted
    iterator; each must yield a row of ``n_features`` zeros.
    """
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    # Use toarray() rather than the ``.A`` shorthand, which newer SciPy
    # releases no longer provide on sparse matrices.
    assert_array_equal(X.toarray(), np.zeros((len(raw_X), n_features)))
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:10,代码来源:test_feature_hasher.py

示例14: test_hasher_set_params

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_set_params():
    """Check that an invalid ``n_features`` set via ``set_params`` fails in fit.

    Validation is expected to be delayed until ``fit`` (useful for grid
    search), where it must raise ``TypeError``.
    """
    delayed = FeatureHasher()
    delayed.set_params(n_features=np.inf)
    with assert_raises(TypeError):
        delayed.fit()
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:7,代码来源:test_feature_hasher.py

示例15: test_hasher_zeros

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import FeatureHasher [as 别名]
def test_hasher_zeros():
    """Check that a zero-valued feature stores nothing in the output."""
    zero_sample = [{'foo': 0}]
    hashed = FeatureHasher().transform(zero_sample)
    # No explicit zeros may be materialized in the sparse data array.
    assert_equal(hashed.data.shape, (0,))
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:6,代码来源:test_feature_hasher.py


注:本文中的sklearn.feature_extraction.FeatureHasher方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。