This page collects typical usage examples of the Python method sklearn.cluster.MiniBatchKMeans.transform. If you have been wondering what MiniBatchKMeans.transform does, how it is called, or where to find examples of it, the curated code samples below should help. You can also look at the containing class, sklearn.cluster.MiniBatchKMeans, for further usage examples.
The sections below show 15 code examples of MiniBatchKMeans.transform, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
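Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what MiniBatchKMeans.transform returns: a matrix of distances from every sample to every cluster center, whose per-row argmin agrees with predict.

from sklearn.cluster import MiniBatchKMeans
import numpy as np

X = np.random.RandomState(0).rand(100, 4)         # toy data: 100 samples, 4 features
km = MiniBatchKMeans(n_clusters=3, random_state=0).fit(X)
D = km.transform(X)                                # shape (100, 3): distance of each sample to each center
labels = km.predict(X)                             # index of the nearest center for each sample
assert (D.argmin(axis=1) == labels).all()          # transform and predict are consistent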
Example 1: KMeansFeatureTransformer
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
class KMeansFeatureTransformer(object):
    def __init__(self, patches, k=1500, model_path=None):
        self.mean_distances = None
        if model_path is None:
            self.k_means = MiniBatchKMeans(n_clusters=k, compute_labels=False,
                                           reassignment_ratio=0, max_no_improvement=10,
                                           batch_size=10000, verbose=2)
            self.k_means.fit(patches)
            # update mean distances
            self.compute_mean_distances(patches)
        else:
            self.load(model_path)

    def transform(self, patches):
        return self.k_means.transform(patches)

    def predict(self, patches):
        return self.k_means.predict(patches)

    def compute_mean_distances(self, patches):
        self.mean_distances = np.mean(self.k_means.transform(patches), axis=0)

    def save(self, file_path='model/k_means_model'):
        joblib.dump(self.k_means, file_path)

    def load(self, file_path):
        self.k_means = joblib.load(file_path)
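A minimal usage sketch for the class above. The patch matrix below is a hypothetical placeholder, and numpy (as np), joblib and MiniBatchKMeans are assumed to be imported as in the original module.

patches = np.random.rand(5000, 64)           # hypothetical flattened 8x8 image patches
fe = KMeansFeatureTransformer(patches, k=50)
codes = fe.transform(patches)                # (5000, 50): distance of each patch to each of the 50 centroids
print(codes.shape, fe.mean_distances.shape)  # (5000, 50) (50,)
fe.save('model/k_means_model')               # persist; reload later with KMeansFeatureTransformer(None, model_path='model/k_means_model')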
Example 2: Embedder
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
class Embedder(object):
    """Transform a set of sparse high dimensional vectors to a set of low
    dimensional dense vectors.

    Under the hood, sparse random projection and simplex volume maximization
    factorization are used.
    """

    def __init__(self, complexity=10, n_kmeans=None, random_state=1):
        self.complexity = complexity
        self.n_kmeans = n_kmeans
        self.transformer = None
        self.matrix_factorizer = None
        self.kmeans = None
        self.random_state = random_state

    def fit(self, data_matrix):
        n_rows, n_cols = data_matrix.shape
        if n_rows <= n_cols:
            n_components = n_rows
        elif n_cols < 5000:
            n_components = n_cols
        else:
            n_components = 'auto'
        self.transformer = random_projection.SparseRandomProjection(
            n_components=n_components,
            dense_output=True,
            random_state=self.random_state)
        data_matrix_new = self.transformer.fit_transform(data_matrix)
        self.matrix_factorizer = pymf.SIVM(data_matrix_new.T, num_bases=self.complexity)
        self.matrix_factorizer.factorize()
        if self.n_kmeans:
            self.kmeans = MiniBatchKMeans(n_clusters=self.n_kmeans)
            self.kmeans.fit(self.matrix_factorizer.H.T)

    def fit_transform(self, data_matrix):
        self.fit(data_matrix)
        if self.n_kmeans:
            return self.kmeans.transform(self.matrix_factorizer.H.T)
        else:
            return self.matrix_factorizer.H.T

    def transform(self, data_matrix):
        basis_data_matrix = self.matrix_factorizer.W
        data_matrix_new = self.transformer.transform(data_matrix)
        self.matrix_factorizer = pymf.SIVM(data_matrix_new.T, num_bases=self.complexity)
        self.matrix_factorizer.W = basis_data_matrix
        self.matrix_factorizer.factorize(compute_w=False)
        if self.n_kmeans:
            return self.kmeans.transform(self.matrix_factorizer.H.T)
        else:
            return self.matrix_factorizer.H.T
Example 3: make_cluster
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def make_cluster(datasets):
    num_clusters = 5
    lsa_dim = 500
    max_df = 0.8
    max_features = 10000
    minibatch = True
    print("datasets are %(datasets)s" % locals())
    km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                         batch_size=1000, n_init=10, max_no_improvement=10, verbose=True)
    km.fit(datasets)
    labels = km.labels_
    transformed = km.transform(datasets)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
    clusters = []
    for i in range(num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(datasets[int(j)])
        clusters.append(cluster)
    return clusters
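The per-row loop above, which recurs in several of the following examples, can also be written without an explicit Python loop by fancy-indexing the distance matrix that transform returns; a small equivalent sketch, assuming numpy as np and the fitted estimator km from above:

transformed = km.transform(datasets)                          # (n_samples, n_clusters) distances
dists = transformed[np.arange(len(km.labels_)), km.labels_]   # distance of every sample to its own center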
Example 4: clustering
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
    '''
    Partition the data into classes with k-means.
    '''
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS,
                             init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
    km.fit(X)
    transformed = km.transform(X)  # distance of each item to every cluster center
    labels = km.labels_
    dists = []
    for i in range(len(labels)):
        dists.append(transformed[i, labels[i]])  # distance of each item to its own cluster center
    labels = DataFrame(labels)
    dists = DataFrame(dists)
    labels.columns = ['label']
    dists.columns = ['dists']
    self.data = pd.concat([labels, dists, self.data], axis=1)  # append the labels to the original data
    return km
Example 5: make_cluster
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def make_cluster(self):
    texts = self._read_from_file()
    # print "texts are %(texts)s" %locals()
    # build the TF-IDF vectors
    vectorizer = TfidfVectorizer(
        max_df=self.max_df,
        max_features=self.max_features,
        stop_words='english'
    )
    X = vectorizer.fit_transform(texts)
    # the values here were identical on every run
    # print "X values are %(X)s" %locals()
    # create a KMeans instance and run the clustering;
    # choose parameters appropriate for the size and nature of the data
    if self.minibatch:
        km = MiniBatchKMeans(
            n_clusters=self.num_clusters,
            init='k-means++',
            batch_size=1000,
            n_init=10,
            max_no_improvement=10,
            verbose=True
        )
    else:
        km = KMeans(
            n_clusters=self.num_clusters,
            init='k-means++',
            n_init=1,
            verbose=True
        )
    km.fit(X)
    labels = km.labels_
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
    clusters = []
    for i in range(self.num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(texts[int(j)])
        clusters.append(cluster)
    return clusters
Example 6: applyKmeans
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def applyKmeans():
    rng = RandomState(0)
    components = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # components = [10]
    '''
    Let's work on stepName only.
    '''
    sIdMtx = LoadSparseMatrix(ROOTDIR + "train_sId.txt")
    sectionMtx = LoadSparseMatrix(ROOTDIR + "train_section.txt")
    problemMtx = LoadSparseMatrix(ROOTDIR + "train_problem.txt")
    stepMtx = LoadSparseMatrix(ROOTDIR + "train_step.txt")
    label = np.load(ROOTDIR + "label_train.npy")
    rdata = hstack((sIdMtx, sectionMtx), format='csr')
    rdata = hstack((rdata, problemMtx), format='csr')
    rdata = hstack((rdata, stepMtx), format='csr')
    kcMtx = LoadSparseMatrix(ROOTDIR + "train_kc.txt")
    print 'starting to run kmeans++..'
    for i in components:
        km = MiniBatchKMeans(n_clusters=i, tol=1e-3, batch_size=20, max_iter=60, random_state=rng)
        km.fit(kcMtx)
        objscore = km.score(kcMtx)
        print 'With ' + str(i) + ' components, the object score is ' + str(objscore)
        nkcMtx = km.transform(kcMtx)
        # io.mmwrite(ROOTDIR+"train_step_kmeans_"+str(i)+".txt", nkcMtx)
        data = hstack((rdata, nkcMtx), format='csr')
        # io.mmwrite(ROOTDIR+"TRAIN_KMEANS_"+str(i)+".txt", data)
        # now train it!
        data = scale(data, with_mean=False)
        lrmodel = linear_model.LogisticRegression(max_iter=1000, penalty='l2', multi_class='ovr', verbose=0)
        lrmodel.fit(data, label)
        print 'Training Done!'
        scr = lrmodel.score(data, label)
        print 'accuracy on the training set is:' + str(scr)
        predLabel = lrmodel.predict(data)
        calcualteRMSE(label, predLabel)
        print '************************'
Example 7: process_vec_info
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph
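The 1 / (1 + vecs) step above maps every center distance into a similarity in (0, 1]; a tiny worked example with made-up numbers:

d = np.array([[0.0, 1.0, 3.0]])   # distances of one node to three centers
print(1 / (1 + d))                # [[1.   0.5  0.25]] -- the closest center gets the value nearest to 1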
Example 8: make_cluster
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def make_cluster(self):
    texts = self.texts
    print("texts are %(texts)s" % locals())
    vectorizer = TfidfVectorizer(
        analyzer=mecab_util.extractNoun,
        max_df=self.max_df,
        max_features=self.max_features
    )
    X = vectorizer.fit_transform(texts)
    print("X values are %(X)s" % locals())
    if self.minibatch:
        km = MiniBatchKMeans(
            n_clusters=self.num_clusters,
            init='k-means++', batch_size=1000,
            n_init=10, max_no_improvement=10,
            verbose=True
        )
    else:
        km = KMeans(
            n_clusters=self.num_clusters,
            init='k-means++',
            n_init=10,
            verbose=True
        )
    km.fit(X)
    labels = km.labels_
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
    clusters = []
    for i in range(self.num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(texts[int(j)])
        clusters.append(cluster)
    return clusters
Example 9: auto_label
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def auto_label(graphs, n_clusters=16, **opts):
    """Label nodes with cluster id.

    Cluster nodes using as features the output of vertex_vectorize.
    """
    data_list = Vectorizer(**opts).vertex_transform(graphs)
    data_matrix = vstack(data_list)
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    sizes = [m.shape[0] for m in data_list]
    label_list = []
    vecs_list = []
    pointer = 0
    for size in sizes:
        label_list.append(preds[pointer: pointer + size])
        vecs_list.append(vecs[pointer: pointer + size])
        pointer += size
    return label_list, vecs_list
Example 10: main
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)
    # print tweets
    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)
    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)
    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000,
                             n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)
    return clusters
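The TF-IDF, TruncatedSVD, Normalizer and k-means chain used above can also be expressed as a scikit-learn Pipeline. A hedged sketch with made-up constants; tweets is a list of strings as in the example, and the custom analyzer is omitted:

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import MiniBatchKMeans

lsa = make_pipeline(TfidfVectorizer(max_df=0.8),
                    TruncatedSVD(n_components=100),
                    Normalizer(copy=False))
X = lsa.fit_transform(tweets)
km = MiniBatchKMeans(n_clusters=8, init='k-means++', batch_size=1000, n_init=10).fit(X)
nearest = km.transform(X).argmin(axis=1)   # index of the nearest center per tweet, same as km.labels_ here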
Example 11: new_clustered_sortind
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters. Signature should be `scorefunc(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters. Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k
    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance from the centers
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k

    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row. But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
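A minimal usage sketch of the function above, with a toy matrix and the axhline plotting that the docstring suggests (matplotlib is an assumption of this sketch, not of the original code):

import numpy as np
import matplotlib.pyplot as plt

x = np.random.rand(200, 50)                          # toy matrix: 200 rows to cluster
ind, breaks = new_clustered_sortind(x, k=[4, 6, 8])  # the best k is chosen from the list
plt.imshow(x[ind], aspect='auto', interpolation='nearest')
for b in breaks:
    plt.axhline(b, color='w')                        # cluster boundaries
plt.show()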
Example 12: clustered_sortind
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters. Must
        accept a single argument of a NumPy array.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k
    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance from the centers
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k

    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for k, g in groupby(labels[ind]):
        pos += len(list(g))
        breaks.append(pos)

    return ind, breaks
Example 13: main
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(args):
    logger.debug("Arguments: %r", args)
    tfidf_vect = default_vectorizer()
    tfidf_vect.set_params(
        ngram_range=(args.min_ngrams, args.max_ngrams),
        max_df=args.max_df,
        max_features=args.max_features,
        sublinear_tf=args.sublinear_tf,
        norm=args.norm,
    )
    with LogRuntime("Loaded input data in {elapsed} seconds", logger):
        data = get_data(args)
    if data:
        logger.debug("Corpus size: {0}".format(len(data)))
    else:
        logger.error("Empty data")
        return
    with LogRuntime("Fitted in {0.elapsed} seconds", logger):
        X = tfidf_vect.fit_transform(data)
    logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_)))
    logger.debug("Max DF stop words size: {}".format(len(tfidf_vect.stop_words_)))
    logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words)))
    if args.clusters:
        true_k = args.clusters
    else:
        # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases
        m_docs, n_terms = X.shape
        t_nonzeros = len(X.nonzero()[0])
        true_k = (m_docs * n_terms) / t_nonzeros
        logger.debug("Calculated number of clusters: {}".format(true_k))
    if args.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=10,
                             init_size=1000, batch_size=1000, verbose=-1)
    else:
        km = KMeans(n_clusters=args.clusters, init='random', max_iter=100,
                    n_init=10, verbose=1, n_jobs=-1)
    with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger):
        km.fit(X)
    if args.sample_random and args.sample_size:
        sample = [
            data[i] for i in np.random.random_integers(0, len(data), args.sample_size)
        ]
    elif args.sample_size:
        sample = data[args.sample_skip:args.sample_size]
    else:
        sample = data
    Y = tfidf_vect.transform(sample)
    sample_terms = tfidf_vect.inverse_transform(Y)
    labels = km.predict(Y)
    distances = km.transform(Y)
    center_terms = tfidf_vect.inverse_transform(km.cluster_centers_)
    clusters = defaultdict(list)
    vocabulary = tfidf_vect.vocabulary_
    for i, doc in enumerate(sample):
        clusters[labels[i]].append((i, doc))
    truncate = lambda t: t[:100] + '...' if len(t) > 100 else t
    for label, result in sorted(clusters.iteritems()):
        # skip single results
        if len(result) < args.cluster_minsize:
            continue
        terms_joined = ', '.join(sorted(
            center_terms[label], reverse=True,
            key=lambda t: km.cluster_centers_[label, vocabulary[t]]
        ))
        print '='*79
        print '='*79
        print '='*79
        print '-> ' + truncate(terms_joined) + '\n\n'
        result = sorted(
            result,
            key=lambda (i, _): distances[i, label],
        )
        j = 0
        for i, doc in result:
            j += 1
            doc_terms = ', '.join(sorted(
                sample_terms[i], reverse=True,
                key=lambda t: Y[i, vocabulary[t]],
            ))
            print doc['headline']
            print get_corpus_key(doc)
            print doc['url']
            print 'distance:', distances[i, label]
            print truncate(doc_terms)
            print
            if j > 10:
#......... the remainder of this code is omitted here .........
Example 14: getClustersOfpapers
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def getClustersOfpapers(self, papers_list, cluster_num=10):
    corpus = list()
    for paperID in papers_list:
        corpus.append(self.lda_corpus[paperID])
    # print len(corpus)
    clusterModel = MiniBatchKMeans(init='k-means++', n_clusters=cluster_num, n_init=100)
    clusterModel.fit(corpus)
    clusterTopics = dict()
    clusterPapers = dict()
    clusterYear = dict()
    clusterSize = dict()
    for paperID in papers_list:
        cluster = int(clusterModel.predict(self.lda_corpus[paperID])[0])
        clusterSize[cluster] = clusterSize.get(cluster, 0)
        clusterSize[cluster] += 1
        if cluster in clusterTopics:
            clusterTopics[cluster] += np.array(self.lda_corpus[paperID])
        else:
            clusterTopics[cluster] = np.array(self.lda_corpus[paperID])
        clusterPapers[cluster] = clusterPapers.get(cluster, {})
        clusterPapers[cluster][paperID] = self.lda_corpus[paperID]
        clusterYear[cluster] = clusterYear.get(cluster, dict())
        year = self.abstracts[paperID]['year']
        if year:
            clusterYear[cluster][year] = clusterYear[cluster].get(year, 0)
            clusterYear[cluster][year] += 1
    clusterCords = getPCAonDict(clusterTopics)
    cluster_result = list()
    paper_result = list()
    for cluster in clusterPapers:
        topPapers = list()
        clusterPapers[cluster] = getPCAonDict(clusterPapers[cluster])
        for paperID in clusterPapers[cluster]:
            x = clusterPapers[cluster][paperID][0]
            y = clusterPapers[cluster][paperID][1]
            distance = clusterModel.transform(self.lda_corpus[paperID])[0][cluster]
            # print 'papreID', cluster, distance, clusterModel.transform(self.lda_corpus[paperID])
            topPapers.append([distance, paperID])
            paper_result.append('%d,%d,%.3f,%.3f,%.3f,%s,%d,%s,%s,%s' % (cluster,
                                                                         paperID,
                                                                         x,
                                                                         y,
                                                                         distance,
                                                                         str(self.abstracts[paperID]['year']),
                                                                         self.abstracts[paperID]['pages'],
                                                                         self.abstracts[paperID]['path'],
                                                                         self.abstracts[paperID]['header'],
                                                                         self.abstracts[paperID]['abstract']
                                                                         ))
        topPapers.sort()
        topNames = [self.abstracts[paperID[1]]['header'].replace(',', '') for paperID in topPapers[:5]]
        topNames_str = '"%s"' % (','.join(topNames))
        topics = list(clusterTopics[cluster])
        # print topics, topics.index(max(topics)), topics[topics.index(max(topics))]
        # print len(self.topics_names), self.topics_names[topics.index(max(topics))]
        name = '"' + ','.join(self.topics_names[topics.index(max(topics))]) + '"'
        x = clusterCords[cluster][0]
        y = clusterCords[cluster][1]
        years = [[clusterYear[cluster][year], year] for year in clusterYear[cluster]]
        if len(years) == 0:
            year = 'None'
        else:
            years.sort(reverse=True)
            year = years[0][1]
        cluster_result.append('%s,%d,%.3f,%.3f,%d,%d,%s,%s' % (str(name),
                                                               cluster,
                                                               x,
                                                               y,
                                                               clusterSize[cluster],
                                                               cluster,
                                                               year,
                                                               topNames_str))
    return paper_result, cluster_result
Example 15: main
# Required import: from sklearn.cluster import MiniBatchKMeans [as alias]
# Or: from sklearn.cluster.MiniBatchKMeans import transform [as alias]
def main(tcu_fpath):
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings
    as_docs = []
    for as_text in zip(desc_column, des_cmp_column, unidade_column, qtd_column):
        doc = " ".join(as_text)
        as_docs.append(doc)

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters
    inter = {}
    intra = {}
    n_runs = 20
    k_vals = range(2, 16)
    for i in xrange(n_runs):
        for k in k_vals:
            # Each K has n_runs clusterings
            inter_array = inter.setdefault(k, np.zeros(n_runs))
            intra_array = intra.setdefault(k, np.zeros(n_runs))

            # Run K-Means
            mbkm = MiniBatchKMeans(k, init='random')
            mbkm.fit(doc_sparse_matrix)
            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter distance. We use min because the idea is to maximize this.
            # Min serves as a penalty for the worst case.
            dist_centers = pairwise.euclidean_distances(centers)
            min_dist_between_centers = \
                np.min(dist_centers[dist_centers > 0])
            inter_array[i] = min_dist_between_centers

            # Intra distance
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            intra_dists = []
            for doc_id, cluster in enumerate(labels):
                dist = dist_all_centers[doc_id, cluster]
                intra_dists.append(dist)
            intra_array[i] = np.mean(intra_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' % (i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' % (cluster, population))
            print()

    x = inter.keys()
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()
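The inter- and intra-cluster quantities computed inside the loop above can be obtained for a single clustering in a few lines; a sketch with toy data (helpers from the original, such as half_confidence_interval_size, are not reproduced):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise

X = np.random.rand(500, 20)                               # toy documents
mbkm = MiniBatchKMeans(n_clusters=5, init='random').fit(X)
dist_centers = pairwise.euclidean_distances(mbkm.cluster_centers_)
inter = np.min(dist_centers[dist_centers > 0])            # smallest distance between two distinct centers
dist_all = mbkm.transform(X)                              # (500, 5) distances to every center
intra = dist_all[np.arange(len(X)), mbkm.labels_].mean()  # mean distance to the assigned center
print(inter / intra)                                      # larger ratio = better separated clusters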