This article collects typical usage examples of sklearn.feature_extraction.text.HashingVectorizer in Python. If you are wondering what text.HashingVectorizer does, how to call it, or want to see it used in practice, the curated examples below may help. You can also explore further usage examples from its parent module, sklearn.feature_extraction.text.
The following presents 15 code examples of text.HashingVectorizer, sorted by popularity by default.
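Before the examples, here is a minimal sketch of the vectorizer itself (toy documents, an illustrative n_features value): HashingVectorizer is stateless, so transform() works without a prior fit() and returns a sparse matrix with a fixed number of hashed feature columns.

from sklearn.feature_extraction.text import HashingVectorizer

docs = ['machine learning is fun', 'hashing needs no vocabulary']
vect = HashingVectorizer(n_features=2 ** 10)  # illustrative size; the default is 2 ** 20
X = vect.transform(docs)                      # stateless: no fit() required
print(X.shape)  # (2, 1024), a scipy sparse matrix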
Example 1: transform
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def transform(self, X_si, high=None, low=None, limit=None):
    """
    Same as HashingVectorizer transform, except allows for
    interaction list, which is an iterable the same length as X
    filled with True/False. This method adds an empty row to
    docs labelled as False.
    """
    analyzer = self.build_analyzer()
    X = self._get_hasher().transform(
        analyzer(self._deal_with_input(doc)) for doc in X_si)
    X.data.fill(1)

    if self.norm is not None:
        X = normalize(X, norm=self.norm, copy=False)
    if low:
        X = self._limit_features(X, low=low)
    return X
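The override above depends on helpers defined elsewhere in its class (_deal_with_input, _limit_features), so it is not runnable on its own. Below is a self-contained, hypothetical sketch of the behaviour the docstring describes: documents whose interaction flag is False end up as empty rows. The name hash_with_interactions is illustrative, not taken from the source.

from sklearn.feature_extraction.text import HashingVectorizer

def hash_with_interactions(docs, interactions, n_features=2 ** 10):
    # Hypothetical helper: blank out documents flagged False so they hash to an empty row.
    vect = HashingVectorizer(n_features=n_features, norm=None)
    masked = [doc if keep else '' for doc, keep in zip(docs, interactions)]
    return vect.transform(masked)

X = hash_with_interactions(['spam and eggs', 'ham'], [True, False])
print(X[1].nnz)  # 0 -- the second document contributes an empty row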
Example 2: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
    self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
    self.dict_vectorizer = DictVectorizer()
    # These are set dynamically in training
    # but fixed here to match the end feature names
    # in the trained model. If the model is retrained then
    # these may have to change
    self.dict_vectorizer.feature_names_ = [
        'DocumentPositionQuintile0',
        'DocumentPositionQuintile1',
        'DocumentPositionQuintile2',
        'DocumentPositionQuintile3',
        'DocumentPositionQuintile4',
        'DocumentPositionQuintile5',
        'DocumentPositionQuintile6']
    self.dict_vectorizer.vocabulary_ = {k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}
    self.drugbank = Drugbank()
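The reason for hard-coding feature_names_ and vocabulary_ is that a DictVectorizer configured this way can transform feature dicts without ever calling fit(). A minimal sketch with the same quintile feature names and a made-up input dict:

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
dv.feature_names_ = ['DocumentPositionQuintile%d' % i for i in range(7)]
dv.vocabulary_ = {k: i for i, k in enumerate(dv.feature_names_)}
X = dv.transform([{'DocumentPositionQuintile2': 1.0}])  # no fit() needed
print(X.shape)  # (1, 7)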
Example 3: test_hashed_binary_occurrences
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(alternate_sign=False, analyzer='char', norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
Example 4: test_vectorizer_unicode
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
    )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 12))

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example 5: test_vectorizer_stop_words_inconsistent
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(),
                TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

        # Only one warning per stop list
        assert_no_warnings(vec.fit_transform, ['hello world'])
        assert _check_stop_words_consistency(vec) is None

        # Test caching of inconsistency assessment
        vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
Example 6: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    # Wrap the specified tokenizer
    t = Tokenizer(tokenizer())

    if hash:
        vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
    else:
        vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

    args = [
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False))
    ]
    self.pipeline = Pipeline(args)
    self.trained = False
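For context, this is roughly how such a pipeline could be used once built. The sketch below keeps the hash=True branch but drops the custom Tokenizer/LemmaTokenizer wrapper so it stays self-contained (an assumption, not part of the original project):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

pipeline = Pipeline([
    ('vectorizer', HashingVectorizer(input='content', stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
    ('normalizer', Normalizer(copy=False)),
])
docs = ['the cat sat on the mat', 'dogs chase cats around the yard']
X = pipeline.fit_transform(docs)  # hashed terms -> idf weighting -> l2 normalization
print(X.shape)                    # (2, 1048576); HashingVectorizer defaults to 2 ** 20 features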
Example 7: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self,
             input_columns: Any,
             output_column: str = None,
             max_tokens: int = 2 ** 18,
             tokens: str = 'chars',
             ngram_range: tuple = None,
             prefixed_concatenation: bool = True) -> None:

    if ngram_range is None:
        ngram_range = (1, 3) if tokens == 'words' else (1, 5)

    ColumnEncoder.__init__(self, input_columns, output_column, int(max_tokens))

    if tokens == 'words':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range)
    elif tokens == 'chars':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range,
                                            analyzer="char")
    else:
        logger.debug(
            "BowEncoder attribute tokens has to be 'words' or 'chars', defaulting to 'chars'")
        self.vectorizer = HashingVectorizer(n_features=self.output_dim, ngram_range=ngram_range,
                                            analyzer="char")

    self.prefixed_concatenation = prefixed_concatenation
Example 8: test_cv_pipeline
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_cv_pipeline(self):
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso())
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example 9: test_hashed_binary_occurrences
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
Example 10: test_pickling_vectorizer
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray())
Example 11: get_kmeans_prototypes
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128,
                          ngram_range=(3, 3), sparse=False, sample_weight=None,
                          random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than ' +
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes])
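A toy usage sketch of the function above, with made-up strings; it assumes the module-level imports the function relies on (numpy, KMeans, NearestNeighbors, warnings) are in place, and the exact prototypes returned depend on the data and random_state:

import numpy as np

X = np.array(['london', 'londun', 'paris', 'parris', 'berlin', 'berlln'])
prototypes = get_kmeans_prototypes(X, n_prototypes=3, random_state=0)
print(prototypes)  # e.g. ['berlin' 'london' 'paris'] -- one representative string per cluster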
Example 12: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
    raw_data = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
    self.vec_ti = csr_matrix((raw_data['data'], raw_data['indices'], raw_data['indptr']), raw_data['shape'])
    self.pmid_ind = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))['pmid_ind']
    self.vectorizer = HashingVectorizer(binary=True, stop_words='english')
    # load database
    self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
    self.c = self.connection.cursor()
Example 13: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
    with open(robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck')), 'rb') as f:
        self.clf = pickle.load(f)
    self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
Example 14: __init__
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def __init__(self):
    from keras.preprocessing import sequence
    from keras.models import load_model
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    from keras.layers import Embedding
    from keras.layers import Convolution1D, MaxPooling1D
    from keras import backend as K
    from keras.models import Model
    from keras.regularizers import l2
    global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    global Embedding, Convolution1D, MaxPooling1D, K, Model, l2

    self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
    cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
    self.cnn_clfs = [load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files]
    self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
    self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'), stop_words='english')

    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_model_calibration.json'), 'r') as f:
        self.constants = json.load(f)

    self.calibration_lr = {}
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn'] = pickle.load(f)
Example 15: test_hashing_vectorizer
# Required import: from sklearn.feature_extraction import text [as alias]
# Or: from sklearn.feature_extraction.text import HashingVectorizer [as alias]
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.min(X.data) < 0
    assert np.max(X.data) > 0
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert ngrams_nnz > token_nnz
    assert ngrams_nnz < 2 * token_nnz

    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)