

Python feature_extraction.DictVectorizer Code Examples

This article collects typical usage examples of sklearn.feature_extraction.DictVectorizer in Python. If you are wondering what feature_extraction.DictVectorizer is for, how to use it, or what it looks like in real code, the curated examples below should help. You can also explore other members of its containing module, sklearn.feature_extraction.


The sections below present 15 code examples of feature_extraction.DictVectorizer, ordered by popularity.
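Before diving into the collected examples, here is a minimal, self-contained sketch of the typical DictVectorizer workflow (the sample data is invented for illustration; get_feature_names_out requires scikit-learn >= 1.0, older releases use get_feature_names instead):

from sklearn.feature_extraction import DictVectorizer

# Each sample is a dict mapping feature names to values. String values
# are one-hot encoded as "name=value" columns; numeric values pass through.
measurements = [
    {"city": "Dubai", "temperature": 33.0},
    {"city": "London", "temperature": 12.0},
]

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(measurements)
print(vec.get_feature_names_out())  # ['city=Dubai' 'city=London' 'temperature']
print(X)                            # [[ 1.  0. 33.]
                                    #  [ 0.  1. 12.]]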

Example 1: __init__

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse) 
Developer: rafasashi, Project: razzy-spinner, Lines: 21, Source: scikitlearn.py
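This __init__ is the constructor of NLTK's SklearnClassifier wrapper (the snippet above comes from a fork of NLTK). A plausible usage sketch, assuming the standard nltk.classify.scikitlearn API:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# NLTK-style training data: (feature dict, label) pairs. The wrapper runs
# the dicts through its internal DictVectorizer before fitting the estimator.
train_set = [
    ({"len": 5, "last_letter": "a"}, "female"),
    ({"len": 4, "last_letter": "k"}, "male"),
]

classif = SklearnClassifier(LinearSVC()).train(train_set)
print(classif.classify({"len": 6, "last_letter": "a"}))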

Example 2: __init__

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def __init__(self):
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()

        # These are set dynamically in training
        # but fixed here to match the end feature names
        # in the trained model. If the model is retrained then
        # these may have to change
        self.dict_vectorizer.feature_names_ = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6']
        self.dict_vectorizer.vocabulary_ = {k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}

        self.drugbank = Drugbank() 
Developer: ijmarshall, Project: robotreviewer, Lines: 21, Source: pico_robot.py
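Pre-populating feature_names_ and vocabulary_ in place of a fit call works because DictVectorizer.transform consults only those two fitted attributes. A minimal sketch of the trick, with invented feature names:

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
# Pin the feature space up front; transform() then needs no prior fit().
dv.feature_names_ = ['DocumentPositionQuintile0', 'DocumentPositionQuintile1']
dv.vocabulary_ = {k: i for i, k in enumerate(dv.feature_names_)}

X = dv.transform([{'DocumentPositionQuintile1': 1}])
print(X.toarray())  # [[0. 1.]]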

Example 3: test_dictvectorizer

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_dictvectorizer(sparse, dtype, sort, iterable):
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(iter(D) if iterable else D)

    assert_equal(sp.issparse(X), sparse)
    assert_equal(X.shape, (3, 5))
    assert_equal(X.sum(), 14)
    assert_equal(v.inverse_transform(X), D)

    if sparse:
        # CSR matrices can't be compared for equality
        assert_array_equal(X.A, v.transform(iter(D) if iterable
                                            else D).A)
    else:
        assert_array_equal(X, v.transform(iter(D) if iterable
                                          else D))

    if sort:
        assert_equal(v.feature_names_,
                     sorted(v.feature_names_)) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 26, Source: test_dict_vectorizer.py

Example 4: test_unseen_or_no_features

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e)) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 21, Source: test_dict_vectorizer.py

Example 5: transform

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def transform(self, documents):
        """
        Returns a dictionary of text features in advance of a DictVectorizer.
        """
        for document in documents:
            # Collect token and vocabulary counts
            counts = Counter(
                item[0] for para in document for sent in para for item in sent
            )

            # Yield structured information about the document
            yield {
                'paragraphs': len(document),
                'sentences': sum(len(para) for para in document),
                'words': sum(counts.values()),
                'vocab': len(counts),
            }


##########################################################################
## Model Building Functions
########################################################################## 
Developer: DistrictDataLabs, Project: partisan-discourse, Lines: 24, Source: learn.py
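The generator above emits one feature dict per document, which is exactly the shape DictVectorizer.fit_transform expects. A runnable sketch with the same nested structure (paragraphs -> sentences -> (token, tag) items), using made-up data:

from collections import Counter
from sklearn.feature_extraction import DictVectorizer

# One document: a list of paragraphs, each a list of sentences,
# each a list of (token, tag) pairs.
doc = [[[('the', 'DT'), ('cat', 'NN')], [('it', 'PRP'), ('sat', 'VBD')]]]

def stats(document):
    counts = Counter(item[0] for para in document for sent in para for item in sent)
    return {
        'paragraphs': len(document),
        'sentences': sum(len(para) for para in document),
        'words': sum(counts.values()),
        'vocab': len(counts),
    }

X = DictVectorizer().fit_transform(stats(d) for d in [doc])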

Example 6: _consolidate_pipeline

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
        # First, restrict our DictVectorizer or DataFrameVectorizer
        # This goes through and has DV only output the items that have passed our support mask
        # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
        # It also significantly reduces the size of dv.vocabulary_ which can get quite large

        try:
            feature_selection = transformation_pipeline.named_steps['feature_selection']
            feature_selection_mask = feature_selection.support_mask
            transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
        except KeyError:
            pass

        # _construct_pipeline is overloaded: it builds a new pipeline from scratch
        # at the start of training, and it can also walk a trained pipeline in
        # exactly the same order and steps to pull a dedicated FeatureSelection
        # model out of an already trained pipeline.
        # This way, a single centralized piece of logic defines the correct order
        # a pipeline should follow.
        trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

        return trained_pipeline_without_feature_selection 
Developer: ClimbsRocks, Project: auto_ml, Lines: 20, Source: predictor.py

Example 7: __init__

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

            :param language: The language of the sentences that will be used
            :param window_width: how many tokens to look before and after each
             token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start() 
Developer: Wikidata, Project: StrepHit, Lines: 24, Source: feature_extractors.py

Example 8: test_model_dict_vectorizer

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_model_dict_vectorizer(self):
        model = DictVectorizer()
        data = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
        model.fit_transform(data)
        model_onnx = convert_sklearn(
            model, "dictionary vectorizer",
            [(
                "input",
                DictionaryType(StringTensorType([1]), FloatTensorType([1])),
            )])
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data, model, model_onnx,
            basename="SklearnDictVectorizer-OneOff-SkipDim1",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.1.3') or "
                          "StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.3.0')") 
Developer: onnx, Project: sklearn-onnx, Lines: 20, Source: test_sklearn_dict_vectorizer_converter.py

Example 9: test_model_dict_vectorizer_sort_false

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_model_dict_vectorizer_sort_false(self):
        model = DictVectorizer(sparse=False, sort=False)
        data = [{1: 1.0, 2: 200.0}, {1: 3.0, 3: 1.0}]
        model.fit_transform(data)
        model_onnx = convert_sklearn(
            model,
            "dictionary vectorizer",
            [(
                "input",
                DictionaryType(Int64TensorType([1]), FloatTensorType([1])),
            )],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data,
            model,
            model_onnx,
            basename="SklearnDictVectorizerSortFalse-OneOff-SkipDim1",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.1.3') or "
                          "StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.3.0')",
        ) 
Developer: onnx, Project: sklearn-onnx, Lines: 25, Source: test_sklearn_dict_vectorizer_converter.py

Example 10: test_objectmapper

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.feature_extraction.DictVectorizer, fe.DictVectorizer)
        self.assertIs(df.feature_extraction.FeatureHasher, fe.FeatureHasher)

        self.assertIs(df.feature_extraction.image.img_to_graph, fe.image.img_to_graph)
        self.assertIs(df.feature_extraction.image.grid_to_graph, fe.image.grid_to_graph)
        self.assertIs(df.feature_extraction.image.extract_patches_2d, fe.image.extract_patches_2d)
        self.assertIs(df.feature_extraction.image.reconstruct_from_patches_2d,
                      fe.image.reconstruct_from_patches_2d)
        self.assertIs(df.feature_extraction.image.PatchExtractor, fe.image.PatchExtractor)

        self.assertIs(df.feature_extraction.text.CountVectorizer, fe.text.CountVectorizer)
        self.assertIs(df.feature_extraction.text.HashingVectorizer, fe.text.HashingVectorizer)
        self.assertIs(df.feature_extraction.text.TfidfTransformer, fe.text.TfidfTransformer)
        self.assertIs(df.feature_extraction.text.TfidfVectorizer, fe.text.TfidfVectorizer) 
Developer: pandas-ml, Project: pandas-ml, Lines: 18, Source: test_feature_extraction.py

Example 11: get_feature_transformer

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Creates a transformer object that will take a text series and generate TFIDF counts and frequency of syntactical structures.
    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a spaCy pipeline object
        run_grammar: whether to include the grammar-structure features
        run_tfidf: whether to include the TFIDF features
    returns:
        feature transformer: FeatureUnion
    '''
    tfidf = Pipeline([
            ('cln', CleanTextTransformer()),
            ('pre', PreTokenizer(parser=parser)),
            ('vect', TfidfVectorizer(
                         max_features=3000, decode_error='replace')),
            ('clf', None)
        ])
    grammar_counter = Pipeline([
            ('cln', CleanTextTransformer()),
            ('grm', GrammarTransformer(parser=parser)),
            ('to_dict', DictVectorizer()),
            ('clf', None)
        ])
    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        feature_transformer = FeatureUnion([("tfidf", tfidf), ('grammar_counter', grammar_counter)])
    elif not run_grammar:
        print('Running only TFIDF.')
        feature_transformer = FeatureUnion([("tfidf", tfidf)])
    elif not run_tfidf:
        print('Running only PCFGs.')
        feature_transformer = FeatureUnion([('grammar_counter', grammar_counter)])
    return feature_transformer 
Developer: aldengolab, Project: fake-news-detection, Lines: 35, Source: transform_features.py
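The design choice worth copying here is the pairing of a dict-producing transformer with DictVectorizer inside one FeatureUnion branch, so hand-crafted dict features concatenate cleanly with sparse TFIDF features. A self-contained sketch using only stock scikit-learn parts (TextStats is a hypothetical stand-in for the project's GrammarTransformer):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline

class TextStats(BaseEstimator, TransformerMixin):
    """Map each text to a small dict of surface statistics."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [{'length': len(t), 'num_tokens': len(t.split())} for t in X]

features = FeatureUnion([
    ('tfidf', TfidfVectorizer()),
    ('stats', Pipeline([('extract', TextStats()), ('vect', DictVectorizer())])),
])

X = features.fit_transform(['a short text', 'another slightly longer text'])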

Example 12: test_feature_selection

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"]) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 17, Source: test_dict_vectorizer.py
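After restrict(), the vectorizer maps new inputs onto just the retained columns; a short continuation of the snippet above (v and d1 as defined in the test):

X_sel = v.transform([d1])
print(X_sel.shape)      # (1, 2) -- only useful1 and useful2 remain
print(X_sel.toarray())  # [[ 1. 20.]]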

Example 13: test_one_of_k

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_one_of_k():
    D_in = [{"version": "1", "ham": 2},
            {"version": "2", "spam": .3},
            {"version=3": True, "spam": -1}]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert_equal(X.shape, (3, 5))

    D_out = v.inverse_transform(X)
    assert_equal(D_out[0], {"version=1": 1, "ham": 2})

    names = v.get_feature_names()
    assert "version=2" in names
    assert "version" not in names 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 16, Source: test_dict_vectorizer.py
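A version note on the last three lines: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. On recent releases the equivalent check reads:

names = v.get_feature_names_out()  # array(['ham', 'spam', 'version=1', ...], dtype=object)
assert 'version=2' in names
assert 'version' not in names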

Example 14: test_deterministic_vocabulary

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def test_deterministic_vocabulary():
    # Generate equal dictionaries with different memory layouts
    items = [("%03d" % i, i) for i in range(1000)]
    rng = Random(42)
    d_sorted = dict(items)
    rng.shuffle(items)
    d_shuffled = dict(items)

    # check that the memory layout does not impact the resulting vocabulary
    v_1 = DictVectorizer().fit([d_sorted])
    v_2 = DictVectorizer().fit([d_shuffled])

    assert_equal(v_1.vocabulary_, v_2.vocabulary_) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 15, Source: test_dict_vectorizer.py

Example 15: funcs_to_sparse

# Required import: from sklearn import feature_extraction [as alias]
# Or: from sklearn.feature_extraction import DictVectorizer [as alias]
def funcs_to_sparse(func_list):
    vectorizer = DictVectorizer()
    func_sparse = vectorizer.fit_transform(func_list)
    return vectorizer, func_sparse 
Developer: ChrisTheCoolHut, Project: Firmware_Slap, Lines: 6, Source: function_clustering.py
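Since the helper returns the fitted vectorizer alongside the matrix, the same feature space can be reused to vectorize functions seen later (feature names invented for illustration):

vec, X = funcs_to_sparse([{'calls_malloc': 1, 'cyclomatic': 4},
                          {'calls_free': 1, 'cyclomatic': 2}])
X_new = vec.transform([{'cyclomatic': 7}])  # same columns as X; unseen keys are dropped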


Note: the sklearn.feature_extraction.DictVectorizer examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from community-contributed open-source projects, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not republish without permission.