

Python text.HashingVectorizer Method Code Examples

This article collects typical usage examples of the Python method sklearn.feature_extraction.text.HashingVectorizer. If you are wondering what text.HashingVectorizer does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage examples from the containing module, sklearn.feature_extraction.text.


Fifteen code examples of text.HashingVectorizer are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
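
Before the individual examples, here is a minimal, self-contained sketch of the typical workflow. Because HashingVectorizer keeps no vocabulary, transform can be called directly without fitting; the sample documents and the n_features value are illustrative only.

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["the cat sat on the mat", "the dog ate my homework"]

# n_features fixes the width of the hashed output; no vocabulary is stored,
# so transform() works without a prior fit() on the corpus.
vectorizer = HashingVectorizer(n_features=2 ** 10, ngram_range=(1, 2))
X = vectorizer.transform(docs)

print(X.shape)  # (2, 1024), a sparse CSR matrix
print(X.nnz)    # number of non-zero hashed features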

Example 1: transform

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def transform(self, X_si, high=None, low=None, limit=None):
        """
        Same as HashingVectorizer.transform, except it accepts an
        interaction list: an iterable the same length as X filled
        with True/False. Docs labelled False are given an empty
        row in the output.
        """
        analyzer = self.build_analyzer()

        X = self._get_hasher().transform(
            analyzer(self._deal_with_input(doc)) for doc in X_si)
        
        X.data.fill(1)

        if self.norm is not None:
            X = normalize(X, norm=self.norm, copy=False)

        if low:
            X = self._limit_features(X, low=low)
        return X 
Developer: ijmarshall, Project: robotreviewer, Lines: 22, Source: vectorizer.py

Example 2: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()

        # These are set dynamically in training
        # but fixed here to match the end feature names
        # in the trained model. If the model is retrained then
        # these may have to change
        self.dict_vectorizer.feature_names_ = [
            'DocumentPositionQuintile0',
            'DocumentPositionQuintile1',
            'DocumentPositionQuintile2',
            'DocumentPositionQuintile3',
            'DocumentPositionQuintile4',
            'DocumentPositionQuintile5',
            'DocumentPositionQuintile6']
        self.dict_vectorizer.vocabulary_ = {k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}

        self.drugbank = Drugbank() 
Developer: ijmarshall, Project: robotreviewer, Lines: 21, Source: pico_robot.py
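
The trick in Example 2, pinning feature_names_ and vocabulary_ on a DictVectorizer so that transform can run without re-fitting, can be reproduced in isolation. The sketch below is a hedged illustration of that pattern only; the two-feature dictionary is hypothetical and not taken from robotreviewer.

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
# Manually pin the learned attributes instead of calling fit().
dv.feature_names_ = ['DocumentPositionQuintile0', 'DocumentPositionQuintile1']
dv.vocabulary_ = {k: i for i, k in enumerate(dv.feature_names_)}

# Each input dict maps known feature names to values; unknown keys are ignored.
X = dv.transform([{'DocumentPositionQuintile0': 1.0},
                  {'DocumentPositionQuintile1': 1.0}])
print(X.toarray())  # [[1. 0.]
                    #  [0. 1.]]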

Example 3: test_hashed_binary_occurrences

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(alternate_sign=False, analyzer='char', norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 24, Source: test_text.py

Example 4: test_vectorizer_unicode

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
        )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 12))

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data)) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 24, Source: test_text.py

Example 5: test_vectorizer_stop_words_inconsistent

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(),
                TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

    # Only one warning per stop list
    assert_no_warnings(vec.fit_transform, ['hello world'])
    assert _check_stop_words_consistency(vec) is None

    # Test caching of inconsistency assessment
    vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
    assert_warns_message(UserWarning, message, vec.fit_transform,
                         ['hello world']) 
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 24, Source: test_text.py

Example 6: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        # Wrap the specified tokenizer
        t = Tokenizer(tokenizer())

        if hash:
            vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
        else:
            vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

        args = [
            ('vectorizer', vectr),
            ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
            ('normalizer', Normalizer(copy=False))
        ]

        self.pipeline = Pipeline(args)
        self.trained = False 
Developer: frnsys, Project: broca, Lines: 27, Source: bow.py

Example 7: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self,
                 input_columns: Any,
                 output_column: str = None,
                 max_tokens: int = 2 ** 18,
                 tokens: str = 'chars',
                 ngram_range: tuple = None,
                 prefixed_concatenation: bool = True) -> None:

        if ngram_range is None:
            ngram_range = (1, 3) if tokens == 'words' else (1, 5)

        ColumnEncoder.__init__(self, input_columns, output_column, int(max_tokens))

        if tokens == 'words':
            self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range)
        elif tokens == 'chars':
            self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range,
                                                analyzer="char")
        else:
            logger.debug(
                "BowEncoder attribute tokens has to be 'words' or 'chars', defaulting to 'chars'")
            self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range,
                                                analyzer="char")

        self.prefixed_concatenation = prefixed_concatenation 
Developer: awslabs, Project: datawig, Lines: 27, Source: column_encoders.py
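
Example 7 switches between word-level and character-level hashed n-grams. The short sketch below is not part of datawig; it illustrates those two configurations directly with scikit-learn, and the sample sentence and n_features value are assumptions for the demo.

from sklearn.feature_extraction.text import HashingVectorizer

text = ["machine learning with hashed features"]

word_vec = HashingVectorizer(n_features=2 ** 18, ngram_range=(1, 3))
char_vec = HashingVectorizer(n_features=2 ** 18, ngram_range=(1, 5),
                             analyzer="char")

# Character n-grams typically yield many more non-zero features than
# word n-grams for the same input.
print(word_vec.transform(text).nnz)
print(char_vec.transform(text).nnz)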

Example 8: test_cv_pipeline

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_cv_pipeline(self):
        pipeline = SKL_Pipeline([
            ('vect', SKL_HashingVectorizer(n_features=20)),
            ('tfidf', SKL_TfidfTransformer(use_idf=False)),
            ('lasso', SKL_Lasso())
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }
        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha']) 
Developer: databricks, Project: spark-sklearn, Lines: 25, Source: test_search_2.py

Example 9: test_hashed_binary_occurrences

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64) 
Developer: alvarobartt, Project: twitter-stock-recommendation, Lines: 25, Source: test_text.py
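
Note that this older test relies on the non_negative parameter; later scikit-learn releases deprecated it in favour of alternate_sign=False, which is the form used in Example 3 above.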

Example 10: test_pickling_vectorizer

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray()) 
Developer: alvarobartt, Project: twitter-stock-recommendation, Lines: 26, Source: test_text.py

Example 11: get_kmeans_prototypes

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128,
                          ngram_range=(3, 3), sparse=False, sample_weight=None,
                          random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than ' +
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes]) 
Developer: dirty-cat, Project: dirty_cat, Lines: 28, Source: similarity_encoder.py
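
Assuming X is a one-dimensional numpy array of strings, as the character-level vectorizer and the final np.sort(X[indexes_prototypes]) suggest, a call to get_kmeans_prototypes might look like the sketch below; the category strings are invented for illustration.

import numpy as np

categories = np.array(['New York', 'new york city', 'Boston', 'boston MA',
                       'Chicago', 'chicago IL', 'San Francisco', 'SF bay area'])

# Ask for four representative strings out of the eight raw categories.
prototypes = get_kmeans_prototypes(categories, n_prototypes=4,
                                   hashing_dim=128, random_state=0)
print(prototypes)  # up to four prototype strings, sorted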

Example 12: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
        raw_data = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
        self.vec_ti = csr_matrix((raw_data['data'], raw_data['indices'], raw_data['indptr']), raw_data['shape'])
        self.pmid_ind = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))['pmid_ind']
        self.vectorizer = HashingVectorizer(binary=True, stop_words='english')
        # load database
        self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
        self.c = self.connection.cursor() 
Developer: ijmarshall, Project: robotreviewer, Lines: 10, Source: pubmed_robot.py

Example 13: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
        
        with open(robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck')), 'rb') as f:
            self.clf = pickle.load(f)

        self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english') 
Developer: ijmarshall, Project: robotreviewer, Lines: 8, Source: bias_ab_robot.py

Example 14: __init__

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
        from keras.preprocessing import sequence
        from keras.models import load_model
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        from keras.layers import Embedding
        from keras.layers import Convolution1D, MaxPooling1D
        from keras import backend as K
        from keras.models import Model
        from keras.regularizers import l2
        global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
        self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'), stop_words='english')
        with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

        self.calibration_lr = {}
        with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)

        with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn'] = pickle.load(f) 
Developer: ijmarshall, Project: robotreviewer, Lines: 29, Source: rct_robot.py

Example 15: test_hashing_vectorizer

# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.min(X.data) < 0
    assert np.max(X.data) > 0
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert ngrams_nnz > token_nnz
    assert ngrams_nnz < 2 * token_nnz

    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)
Developer: PacktPublishing, Project: Mastering-Elasticsearch-7.0, Lines: 38, Source: test_text.py


Note: The sklearn.feature_extraction.text.HashingVectorizer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish this article without permission.