This article collects typical usage examples of the Python class sklearn.feature_extraction.text.Vectorizer. If you have been wondering what exactly the Vectorizer class does, how to use it, or where to find usage examples, the curated class code examples below may help.
The following shows 15 code examples of the Vectorizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
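Note that the Vectorizer class used throughout these examples comes from old scikit-learn releases (the snippets also use Python 2 print statements); in modern scikit-learn the same role is played by TfidfVectorizer, which chains CountVectorizer and TfidfTransformer. As a rough point of reference, a minimal sketch of the equivalent modern call, using a made-up toy corpus, looks like this:

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical toy corpus, for illustration only.
docs = [
    "the pizza was great",
    "the salad and the water were fine",
    "tomato pizza with a beer",
]

# TfidfVectorizer plays the role of the old Vectorizer: it tokenizes,
# counts terms and applies tf-idf weighting in one fit_transform call.
vectorizer = TfidfVectorizer(norm="l2")
X = vectorizer.fit_transform(docs)           # sparse matrix, shape (n_docs, n_terms)
print(X.shape)
print(sorted(vectorizer.vocabulary_)[:5])    # first few terms of the learned vocabulary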
Example 1: _word_tfidf_dist
def _word_tfidf_dist(documents):
    words_tfidf = {}
    if len(documents) > 0:
        if _check_is_sentence(documents):  # if the documents contain only 1-, 2- or 3-char tokens --> acronyms
            try:
                text_analyzer = Vectorizer(ngram_range=(1, 2), max_features=50)
                matrix = text_analyzer.fit_transform(documents).todense()
                for vocabulary in text_analyzer.vocabulary_.items():
                    word = vocabulary[0]
                    indice = vocabulary[1]
                    words_tfidf[word] = score_tfidf_freq(matrix[:, indice])
            except ValueError:
                return {}
        else:
            return _freqdist(documents)
    return words_tfidf
Example 2: vectorize_videos
def vectorize_videos(fpath, use_idf=False):
    '''
    Converts a YouTube tag file to a weighted sparse matrix. Weights
    can be based on IDF if specified.

    Arguments
    ---------
    fpath: a path to a file
        Each line is a video; tags are separated by spaces
    use_idf: bool (optional, defaults to False)
        Indicates whether to use IDF.

    Note: clean_up's bottom_filter (the minimum probability for tags to be
    considered useful, normally 0.005, half of one percent) is fixed to 0
    in the call below, so no tags are filtered out.
    '''
    # Vectorizes to TF-IDF
    vectorizer = Vectorizer(analyzer=NoopAnalyzer(), use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(clean_up(fpath, bottom_filter=0))
    vocabulary = vectorizer.vocabulary
    return sparse_matrix, vocabulary
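For comparison, here is a minimal sketch of the same idea with the modern TfidfVectorizer; the names clean_up, NoopAnalyzer and extract in the snippet above belong to the example's own codebase, so this sketch simply assumes the tag lines are already available as a list of strings:

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_tag_lines(lines, use_idf=False):
    # Each line is one document of whitespace-separated tags, so a plain
    # str.split is enough as the analyzer (no lowercasing or n-grams).
    vectorizer = TfidfVectorizer(analyzer=str.split, use_idf=use_idf)
    sparse_matrix = vectorizer.fit_transform(lines)
    return sparse_matrix, vectorizer.vocabulary_

# Usage example: vectorize_tag_lines(["rock guitar live", "cat funny cute"], use_idf=True)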
Example 3: get_20newsgroups_data_info_for_categories
def get_20newsgroups_data_info_for_categories(categories):
    data = fetch_20newsgroups(subset='all', categories=categories, shuffle=False)
    vectorizer = Vectorizer()
    t0 = time()
    tfidf = vectorizer.fit_transform(data.data)
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    print "done in %fs" % (time() - t0)
    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]
    # Similarity ranges from zero to one, so (1 - s) gives a distance from 0 to 1.
    distances = [[(1 - s) for s in row[:col_to + 1]] for (col_to, row) in enumerate(pairwise_similarity)]
    # Fix the very slight off-ness introduced by precision conversion
    for row in distances:
        row[-1] = 0
    pcd_tuples = zip(payloads, labels, distances)
    di = DataInfo.deserialize_pcd_tuples(pcd_tuples)
    return di
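Because the tf-idf rows produced above are L2-normalized, the product tfidf * tfidf.T is the matrix of cosine similarities, and 1 - similarity is a distance between 0 and 1 (the diagonal can be off from zero by a tiny floating-point error, which is what the loop at the end corrects). A minimal sketch of the same computation with the current API, on a made-up document list:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["free pizza on friday", "pizza and salad", "quarterly budget review"]

tfidf = TfidfVectorizer().fit_transform(docs)   # rows are L2-normalized
similarity = cosine_similarity(tfidf)           # dense (n_docs, n_docs) matrix
distance = 1.0 - similarity                     # ~0 on the diagonal, up to 1 elsewhere
print(distance.round(3))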
Example 4: test_vectorizer
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding;
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
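Two behaviours exercised by this test still hold in current scikit-learn: a float max_df drops terms that occur in more than that fraction of documents, and norm='l1' makes each row of term frequencies sum to one. A small sketch of both with the modern classes, on a made-up corpus:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = [
    "copyright pizza with extra cheese",
    "copyright salad and water",
    "copyright beer and burger",
]

# 'copyright' appears in every document, so max_df=0.5 removes it from the vocabulary.
counts = CountVectorizer(max_df=0.5).fit(docs)
assert "copyright" not in counts.vocabulary_

# With norm='l1' and use_idf=False, each row of the result sums to one.
tf = TfidfTransformer(norm='l1', use_idf=False).fit_transform(counts.transform(docs))
np.testing.assert_allclose(np.asarray(tf.sum(axis=1)).ravel(), 1.0)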
Example 5: ShuffleSplit
    # 'svc': [{'probability': True}],
}

# split a training set and a test set
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15, indices=False)
for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' % (iter_no + 1)
    y_train = np.array([x for (x, y) in zip(all_data['target'], train_index) if y])
    y_test = np.array([x for (x, y) in zip(all_data['target'], test_index) if y])
    print 'Sampled %d training and %d test posts' % (len(y_train), len(y_test))
    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8',
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
        )
    )
    title_train = title_vectorizer.fit_transform([x for (x, y) in zip(all_data['title'], train_index) if y])
    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform([x for (x, y) in zip(all_data['domain'], train_index) if y])
    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print
    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform([x for (x, y) in zip(all_data['title'], test_index) if y])
    domain_test = domain_vectorizer.transform([x for (x, y) in zip(all_data['domain'], test_index) if y])
Example 6: time
print 'Data loaded!'
print
# Split datasets
y_L1 = data_train.target
y_L2_ca = ca_train.target
y_L2_collect = collect_train.target
y_L2_cookies = cookies_train.target
y_L2_share = share_train.target
# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X_L1 = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "L1: n_samples: %d, n_features: %d" % X_L1.shape
print
print "Extracting features from Layer 2 training sets using the same vectorizer..."
t0 = time()
X_L2_ca = vectorizer.transform(ca_train.data)
X_L2_collect = vectorizer.transform(collect_train.data)
X_L2_cookies = vectorizer.transform(cookies_train.data)
X_L2_share = vectorizer.transform(share_train.data)
print "Done in %0.3fs" % (time() - t0)
print "CA: n_samples: %d, n_features: %d" % X_L2_ca.shape
print "Collect: n_samples: %d, n_features: %d" % X_L2_collect.shape
print "Cookies: n_samples: %d, n_features: %d" % X_L2_cookies.shape
Example 7: set
shuffle=True, random_state=42)
filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)
print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print
# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print
###############################################################################
# Now sparse MiniBatchKmeans
print "_" * 80
mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
Example 8: documents
shuffle=True, random_state=42)
print 'data loaded'
categories = data_train.target_names # for case categories == None
print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print
print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print
if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
Example 9: time
sys.exit(1)
input_data = csv.reader(open('descriptions_100.csv','rb'))
dataset_data = []
dataset_target = []
for row in input_data:
    dataset_data.append(row[1])
    dataset_target.append(row[0])
labels = dataset_target
true_k = np.unique(labels).shape[0]
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset_data)
print X
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
###############################################################################
# Do the actual clustering
km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=1)
print "Clustering with %s" % km
t0 = time()
km.fit(X)
Example 10: load_files
print categories if categories else "all"
data_set = load_files('Privacypolicy/raw', categories = categories,
shuffle = True, random_state = 42)
print 'data loaded'
# print "%d documents" % len(data_set.data)
# print "%d categories" % len(data_set.target_names)
print
# load unlabeled data
data_set_unlabel = load_files('Privacypolicy/unlabeled', shuffle = True, random_state = 30)
# Extract features
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(data_set.data)
X = Normalizer(norm="l2", copy=False).transform(X)
X = X.toarray()
X_unlabel = vectorizer.transform(data_set_unlabel.data)
X_unlabel = X_unlabel.toarray()
y = data_set.target
n_samples, n_features = X.shape
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % (n_samples, n_features)
print
Example 11: range
print i, data_train.target_names[i]
# A primary thought on implementing multi-label classifier
# Aborted later due to functions provided by most classifiers
# Method: Transform y to one-else and use loops to learn binary classifiers
# y_0 = y.copy()
# for i in range(len(y_0)):
#     if y_0[i] == 1:
#         y_0[i] = 2
# Extract features
print "Extracting features from Layer 1 training set using a sparse vectorizer..."
t0 = time()
vectorizer = Vectorizer()
X = vectorizer.fit_transform(data_train.data)
print "Done in %0.3fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print
# to dense array for logistic regression which does not work on sparse
X_den = X.toarray()
# # Feature selection
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k = select_chi2)
# X = ch2.fit_transform(X, y)
# print "Done in %fs" % (time() - t0)
Example 12: main
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in xrange(n_runs):
        for k in k_vals:
            # Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))

            # Run K-Means
            mbkm = MiniBatchKMeans(k, init='random')
            mbkm.fit(doc_sparse_matrix)
            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter distance. We use min because the idea is to maximize this;
            # min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            # Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' % (i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' % (cluster, population))
            print()

    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
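The nested loop above scores each k by the ratio between the smallest distance separating two cluster centers (inter) and the mean distance of each document to its own center (intra). The same measurement can be sketched roughly as follows with the current MiniBatchKMeans constructor (n_clusters instead of a positional k), assuming a document matrix X is already available:

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances

def inter_intra_ratio(X, k, random_state=0):
    # Fit the clustering; n_clusters replaces the old positional k argument.
    mbkm = MiniBatchKMeans(n_clusters=k, init='random', random_state=random_state)
    labels = mbkm.fit_predict(X)

    # Inter-cluster score: the smallest non-zero distance between centers.
    center_dists = euclidean_distances(mbkm.cluster_centers_)
    inter = np.min(center_dists[center_dists > 0])

    # Intra-cluster score: mean distance of each sample to its own center.
    all_dists = mbkm.transform(X)                       # (n_samples, k) distances
    intra = np.mean(all_dists[np.arange(X.shape[0]), labels])
    return inter / intra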
Example 13: set
print 'data loaded'
documents = data_train.data + data_test.data
target_names = set(data_train.target_names + data_test.target_names)
print "%d documents" % len(documents)
print "%d categories" % len(target_names)
print
# split a training set and a test set
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform(documents)
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print
###############################################################################
# Now sparse MiniBatchKmeans
mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000)
print "Clustering sparse data with %s" % str(mbkm)
t0 = time()
Example 14: main
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)

    # We only want accepted data
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    # Get invalid lines
    invalids = invalid(data)

    # Transforms descriptions to base strings
    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Run K-Means
    num_clusters = 7
    mbkm = MiniBatchKMeans(num_clusters, init='random')
    mbkm.fit(doc_sparse_matrix)

    # New labels column, replaces both Descricao columns
    labels_column = mbkm.labels_

    # Old columns to keep
    chave_column = data['ChavePregao']
    uasg_column = data['UASG']
    pregoeiro_column = data['PregoeiroOficial']
    aceito_column = data['AceitoPara_CNPJ']
    lance_column = data['PeloMenorLance']
    ref_column = data['ValordeReferencia']
    ganho_column = data['GanhoPregao']

    # And a new column Superfaturamento (overbilling)
    super_faturamento = np.ndarray(shape=len(labels_column), dtype='S12')
    for i, ganho in enumerate(ganho_column):
        if ganho >= -50:  # up to 50% over the price is accepted
            super_faturamento[i] = 'OK'
        elif ganho < -50 and ganho > -500:  # more than that is overbilled
            super_faturamento[i] = 'Super'
        elif ganho < -500:  # more than 5x the value is outrageous
            super_faturamento[i] = 'SuperPlus'

    for i in xrange(len(labels_column)):
        if i not in invalids:
            print(labels_column[i], end=',')
            print(chave_column[i], end=',')
            print(uasg_column[i], end=',')
            print(pregoeiro_column[i], end=',')
            print(aceito_column[i], end=',')
            print(lance_column[i], end=',')
            print(ref_column[i], end=',')
            print(ganho_column[i], end=',')
            print(super_faturamento[i])
Example 15: load_mlcomp
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)
# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)
print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target
print "Loading 20 newsgroups test set... "
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print "done in %fs" % (time() - t0)
print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)