当前位置: 首页>>代码示例>>Python>>正文


Python feature_extraction.DictVectorizer方法代码示例

本文整理汇总了Python中sklearn.feature_extraction.DictVectorizer方法的典型用法代码示例。如果您正苦于以下问题:Python feature_extraction.DictVectorizer方法的具体用法?Python feature_extraction.DictVectorizer怎么用?Python feature_extraction.DictVectorizer使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.feature_extraction的用法示例。


在下文中一共展示了feature_extraction.DictVectorizer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self, estimator, dtype=float, sparse=True):
        """
        Wrap a scikit-learn estimator together with the encoders it needs.

        :param estimator: scikit-learn classifier object; stored as-is.

        :param dtype: data type forwarded to the ``DictVectorizer`` that
            builds the feature array. scikit-learn estimators work on
            numeric data, so the default should suit almost every case.

        :param sparse: forwarded to the ``DictVectorizer``; whether sparse
            matrices are used internally. The wrapped estimator must
            support them (check its documentation for "sparse matrix").
            Defaults to True because most NLP feature sets are sparse;
            False may need a great amount of memory.
        :type sparse: boolean.
        """
        # Options handed straight to the feature vectorizer.
        vectorizer_options = {'dtype': dtype, 'sparse': sparse}

        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(**vectorizer_options)
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:21,代码来源:scikitlearn.py

示例2: __init__

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self):
        """
        Create the text and dict vectorizers and pin the dict vectorizer's
        feature layout to the one the trained model expects.
        """
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()

        # Normally these attributes are learned while training; they are
        # hard-coded here so the feature names line up with the already
        # trained model. Retraining the model may require updating them.
        quintile_features = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6',
        ]
        self.dict_vectorizer.feature_names_ = quintile_features
        # Map each feature name to its column index.
        self.dict_vectorizer.vocabulary_ = dict(
            zip(quintile_features, range(len(quintile_features))))

        self.drugbank = Drugbank()
开发者ID:ijmarshall,项目名称:robotreviewer,代码行数:21,代码来源:pico_robot.py

示例3: test_dictvectorizer

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_dictvectorizer(sparse, dtype, sort, iterable):
    """Check that DictVectorizer fits, transforms and inverts a small corpus.

    :param sparse: forwarded to ``DictVectorizer``; also the expected
        result of ``sp.issparse`` on the output.
    :param dtype: forwarded to ``DictVectorizer``.
    :param sort: forwarded to ``DictVectorizer``; when truthy the learned
        ``feature_names_`` must already be in sorted order.
    :param iterable: when truthy the samples are fed as a one-shot
        iterator instead of a list.

    NOTE(review): the arguments are presumably supplied by a test
    parametrization outside this block -- confirm against the caller.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    def samples():
        # An iterator is exhausted after one pass, so build a fresh one
        # for every call that consumes the data.
        return iter(D) if iterable else D

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(samples())

    assert_equal(sp.issparse(X), sparse)
    assert_equal(X.shape, (3, 5))
    assert_equal(X.sum(), 14)
    assert_equal(v.inverse_transform(X), D)

    # transform() on the fitted vectorizer must reproduce fit_transform().
    X_again = v.transform(samples())
    if sparse:
        # Sparse results can't be compared for equality directly, so
        # densify both sides. toarray() is used instead of the legacy
        # ``.A`` shorthand, which newer SciPy sparse containers dropped.
        assert_array_equal(X.toarray(), X_again.toarray())
    else:
        assert_array_equal(X, X_again)

    if sort:
        assert_equal(v.feature_names_,
                     sorted(v.feature_names_))
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:26,代码来源:test_dict_vectorizer.py

示例4: test_unseen_or_no_features

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_unseen_or_no_features():
    """Transforming unseen-only or empty samples must give an all-zero row.

    Runs once with sparse output and once with dense output. An empty
    *list* of samples is also tried: if that raises ``ValueError`` the
    message must mention "empty"; if nothing is raised the check is
    simply skipped.
    """
    training = [{"camelot": 0, "spamalot": 1}]
    # One sample with only an unknown feature, one with no features at all.
    probes = ({"push the pram a lot": 2}, {})

    for sparse in (True, False):
        v = DictVectorizer(sparse=sparse).fit(training)

        for probe in probes:
            row = v.transform(probe)
            if sparse:
                row = row.toarray()
            assert_array_equal(row, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as err:
            assert_in("empty", str(err))
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:21,代码来源:test_dict_vectorizer.py

示例5: transform

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def transform(self, documents):
        """
        Yield one dict of simple text statistics per document, suitable as
        input to a DictVectorizer.

        Each document is iterated as paragraphs -> sentences -> items, and
        the first element of every item is the value that gets counted.
        """
        for doc in documents:
            # Tally every counted value and the sentence total in one walk.
            tallies = Counter()
            sentence_total = 0
            for paragraph in doc:
                sentence_total += len(paragraph)
                for sentence in paragraph:
                    tallies.update(entry[0] for entry in sentence)

            # Emit the structured summary for this document.
            stats = {}
            stats['paragraphs'] = len(doc)
            stats['sentences'] = sentence_total
            stats['words'] = sum(tallies.values())
            stats['vocab'] = len(tallies)
            yield stats


##########################################################################
## Model Building Functions
########################################################################## 
开发者ID:DistrictDataLabs,项目名称:partisan-discourse,代码行数:24,代码来源:learn.py

示例6: _consolidate_pipeline

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
        """
        Fold the feature-selection step of a trained pipeline into its
        vectorizer, then rebuild the pipeline via ``_construct_pipeline``.

        :param transformation_pipeline: trained pipeline exposing
            ``named_steps``.
        :param final_model: passed through to ``_construct_pipeline``.
        :return: whatever ``_construct_pipeline`` returns for the trained
            pipeline.
        """
        # Restrict the DictVectorizer / DataFrameVectorizer ('dv' step) to
        # the features that passed the support mask. Per the original
        # authors this speeds up computation, reduces memory use, merges
        # several transforms into one step, and shrinks dv.vocabulary_,
        # which can get quite large.
        try:
            selector = transformation_pipeline.named_steps['feature_selection']
            support_mask = selector.support_mask
            vectorizer = transformation_pipeline.named_steps['dv']
            vectorizer.restrict(support_mask)
        except KeyError:
            # A KeyError here (e.g. a missing named step) leaves the
            # vectorizer unrestricted.
            pass

        # _construct_pipeline serves two purposes: building a new pipeline
        # at the start of training, and walking an already trained pipeline
        # in the same order to lift the dedicated FeatureSelection model
        # out of it. That keeps the step ordering in one central place.
        return self._construct_pipeline(
            trained_pipeline=transformation_pipeline,
            final_model=final_model,
        )
开发者ID:ClimbsRocks,项目名称:auto_ml,代码行数:20,代码来源:predictor.py

示例7: __init__

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Sets up the extractor state and then calls ``self.start()``.

            :param language: The language of the sentences that will be used;
             also selects the POS tagger and the stop-word list.
            :param window_width: how many tokens to look at before and after
             each token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
            :param target_size: when truthy, a ``TruncatedSVD(target_size)``
             reducer is created; otherwise no reducer is used.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size

        # Dimensionality reduction is optional.
        if target_size:
            self.reducer = TruncatedSVD(target_size)
        else:
            self.reducer = None

        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        # Stop words are stored lower-cased.
        self.stopwords = {word.lower() for word in StopWords().words(language)}
        self.start()
开发者ID:Wikidata,项目名称:StrepHit,代码行数:24,代码来源:feature_extractors.py

示例8: test_model_dict_vectorizer

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_model_dict_vectorizer(self):
        """Convert a fitted string-keyed DictVectorizer to ONNX and dump it."""
        samples = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
        vectorizer = DictVectorizer()
        vectorizer.fit_transform(samples)

        # Declared ONNX input: a dictionary from strings to floats.
        input_type = DictionaryType(StringTensorType([1]), FloatTensorType([1]))
        onnx_model = convert_sklearn(
            vectorizer, "dictionary vectorizer", [("input", input_type)])
        self.assertIsNotNone(onnx_model)

        # Version-check expression handed over as a string.
        # NOTE(review): presumably evaluated by dump_data_and_model to
        # decide when a failure is tolerated -- confirm in that helper.
        failure_condition = ("StrictVersion(onnxruntime.__version__)"
                             " <= StrictVersion('0.1.3') or "
                             "StrictVersion(onnx.__version__)"
                             " < StrictVersion('1.3.0')")
        dump_data_and_model(
            samples, vectorizer, onnx_model,
            basename="SklearnDictVectorizer-OneOff-SkipDim1",
            allow_failure=failure_condition)
开发者ID:onnx,项目名称:sklearn-onnx,代码行数:20,代码来源:test_sklearn_dict_vectorizer_converter.py

示例9: test_model_dict_vectorizer_sort_false

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_model_dict_vectorizer_sort_false(self):
        """Convert a dense, unsorted, int-keyed DictVectorizer to ONNX and dump it."""
        samples = [{1: 1.0, 2: 200.0}, {1: 3.0, 3: 1.0}]
        vectorizer = DictVectorizer(sparse=False, sort=False)
        vectorizer.fit_transform(samples)

        # Declared ONNX input: a dictionary from int64 keys to floats.
        input_type = DictionaryType(Int64TensorType([1]), FloatTensorType([1]))
        onnx_model = convert_sklearn(
            vectorizer,
            "dictionary vectorizer",
            [("input", input_type)],
        )
        self.assertIsNotNone(onnx_model)

        # Version-check expression handed over as a string.
        # NOTE(review): presumably evaluated by dump_data_and_model to
        # decide when a failure is tolerated -- confirm in that helper.
        failure_condition = ("StrictVersion(onnxruntime.__version__)"
                             " <= StrictVersion('0.1.3') or "
                             "StrictVersion(onnx.__version__)"
                             " < StrictVersion('1.3.0')")
        dump_data_and_model(
            samples,
            vectorizer,
            onnx_model,
            basename="SklearnDictVectorizerSortFalse-OneOff-SkipDim1",
            allow_failure=failure_condition,
        )
开发者ID:onnx,项目名称:sklearn-onnx,代码行数:25,代码来源:test_sklearn_dict_vectorizer_converter.py

示例10: test_objectmapper

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_objectmapper(self):
        """The ModelFrame accessor must expose the very same objects as ``fe``."""
        df = pdml.ModelFrame([])

        # Names living directly on the feature_extraction namespace.
        for name in ('DictVectorizer', 'FeatureHasher'):
            self.assertIs(getattr(df.feature_extraction, name),
                          getattr(fe, name))

        # Names under the ``image`` sub-namespace.
        image_names = (
            'img_to_graph',
            'grid_to_graph',
            'extract_patches_2d',
            'reconstruct_from_patches_2d',
            'PatchExtractor',
        )
        for name in image_names:
            self.assertIs(getattr(df.feature_extraction.image, name),
                          getattr(fe.image, name))

        # Names under the ``text`` sub-namespace.
        text_names = (
            'CountVectorizer',
            'HashingVectorizer',
            'TfidfTransformer',
            'TfidfVectorizer',
        )
        for name in text_names:
            self.assertIs(getattr(df.feature_extraction.text, name),
                          getattr(fe.text, name))
开发者ID:pandas-ml,项目名称:pandas-ml,代码行数:18,代码来源:test_feature_extraction.py

示例11: get_feature_transformer

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Build a FeatureUnion over a TFIDF pipeline and/or a grammar-count
    pipeline, for use as a step in a SKLearn Pipeline on a text series.

    inputs:
        parser: a Spacy pipeline object, handed to the tokenizer and the
            grammar transformer
        run_grammar: include the grammar-count pipeline
        run_tfidf: include the TFIDF pipeline
    returns:
        feature transformer: FeatureUnion

    Note: if run_grammar is false the result is TFIDF-only regardless of
    run_tfidf, so passing both flags as false still yields the TFIDF
    pipeline.
    '''
    tfidf_steps = [
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(max_features=3000,
                                 decode_error='replace')),
        ('clf', None),
    ]
    tfidf = Pipeline(tfidf_steps)

    grammar_steps = [
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None),
    ]
    grammar_counter = Pipeline(grammar_steps)

    # Single-pipeline cases first; the combined union is the fall-through.
    if not run_grammar:
        print('Running only TFIDF.')
        return FeatureUnion([("tfidf", tfidf)])
    if not run_tfidf:
        print('Running only PCFGs.')
        return FeatureUnion([('grammar_counter', grammar_counter)])

    print('Running both feature sets.')
    return FeatureUnion([("tfidf", tfidf),
                         ('grammar_counter', grammar_counter)])
开发者ID:aldengolab,项目名称:fake-news-detection,代码行数:35,代码来源:transform_features.py

示例12: test_feature_selection

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_feature_selection():
    """restrict() must keep only the two chi2-selected features.

    Exercised twice: once with the support given as an index array and
    once as a boolean mask.
    """
    # Two feature dicts sharing twenty constant (useless, in chi2 terms)
    # features and differing only in the two useful ones.
    useless = {"useless%d" % i: 10 for i in range(20)}
    d1 = dict(useless, useful1=1, useful2=20)
    d2 = dict(useless, useful1=20, useful2=1)
    samples = [d1, d2]

    for indices in (True, False):
        v = DictVectorizer().fit(samples)
        X = v.transform(samples)
        selector = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = selector.get_support(indices=indices)
        v.restrict(support, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:17,代码来源:test_dict_vectorizer.py

示例13: test_one_of_k

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_one_of_k():
    """String-valued features must be expanded into ``name=value`` columns."""
    records = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": 0.3},
        {"version=3": True, "spam": -1},
    ]
    vec = DictVectorizer()
    matrix = vec.fit_transform(records)
    assert_equal(matrix.shape, (3, 5))

    # Round-tripping the first record shows the expanded key.
    first_record = vec.inverse_transform(matrix)[0]
    assert_equal(first_record, {"version=1": 1, "ham": 2})

    # The raw key must not survive as a feature name.
    feature_names = vec.get_feature_names()
    assert "version=2" in feature_names
    assert "version" not in feature_names
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:16,代码来源:test_dict_vectorizer.py

示例14: test_deterministic_vocabulary

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def test_deterministic_vocabulary():
    """Dict insertion order must not influence the fitted vocabulary."""
    # Build two equal dictionaries whose items were inserted in different
    # orders (seeded shuffle, so the test is reproducible).
    pairs = [("%03d" % i, i) for i in range(1000)]
    in_order = dict(pairs)
    Random(42).shuffle(pairs)
    shuffled = dict(pairs)

    # Fit one vectorizer on each layout and compare the vocabularies.
    vocab_in_order = DictVectorizer().fit([in_order]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([shuffled]).vocabulary_

    assert_equal(vocab_in_order, vocab_shuffled)
开发者ID:PacktPublishing,项目名称:Mastering-Elasticsearch-7.0,代码行数:15,代码来源:test_dict_vectorizer.py

示例15: funcs_to_sparse

# 需要导入模块: from sklearn import feature_extraction [as 别名]
# 或者: from sklearn.feature_extraction import DictVectorizer [as 别名]
def funcs_to_sparse(func_list):
    """Fit a fresh DictVectorizer on ``func_list`` and vectorize it.

    Returns a ``(vectorizer, matrix)`` pair: the fitted vectorizer and the
    result of its ``fit_transform`` on ``func_list``.
    """
    vectorizer = DictVectorizer()
    return vectorizer, vectorizer.fit_transform(func_list)
开发者ID:ChrisTheCoolHut,项目名称:Firmware_Slap,代码行数:6,代码来源:function_clustering.py


注:本文中的sklearn.feature_extraction.DictVectorizer方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。