本文整理汇总了 Python 中 sklearn.cluster.AgglomerativeClustering.fit_predict 方法的典型用法代码示例。如果您正苦于以下问题：Python AgglomerativeClustering.fit_predict 方法的具体用法？Python AgglomerativeClustering.fit_predict 怎么用？Python AgglomerativeClustering.fit_predict 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 sklearn.cluster.AgglomerativeClustering 的用法示例。
在下文中一共展示了 AgglomerativeClustering.fit_predict 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: hierarchical
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def hierarchical(similarity, concepts=2, euclid=False):
    """Cluster items agglomeratively and return one flat label per item.

    Args:
        similarity: with ``euclid=True`` it is handed to the clusterer
            unchanged; otherwise ``1 - similarity`` is handed over as a
            precomputed distance matrix.
        concepts: number of clusters to produce.
        euclid: True selects the estimator's default settings, False
            selects complete linkage over precomputed distances.

    Returns:
        The result of ``AgglomerativeClustering.fit_predict``.
    """
    if not euclid:
        clusterer = AgglomerativeClustering(
            n_clusters=concepts,
            affinity='precomputed',
            linkage='complete',
        )
        # Convert similarities into dissimilarities for the precomputed metric.
        return clusterer.fit_predict(1 - similarity)

    clusterer = AgglomerativeClustering(n_clusters=concepts)
    return clusterer.fit_predict(similarity)
示例2: HierarchicalTopics
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
class HierarchicalTopics(object):
    """Agglomerative clustering of the 'news' documents of a corpus.

    Documents are encoded as binary vectors over the vocabulary that is
    collected from the corpus when the object is constructed.
    """

    def __init__(self, corpus):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object
        """
        self.model = None
        # Populated by cluster(); declared here so the attributes always exist.
        self.labels = None
        self.children = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
        )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        Returns an array with one entry per vocabulary token: 1 when the
        token occurs in the normalized document, 0 otherwise.
        """
        features = set(normalize(document))
        return np.array(
            [token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the AgglomerativeClustering model to the 'news' documents of
        the given corpus and stores the resulting labels and merge tree
        on ``self.labels`` and ``self.children``.
        """
        vectors = [
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ]
        self.model = AgglomerativeClustering()
        self.labels = self.model.fit_predict(vectors)
        self.children = self.model.children_

    def plot_dendrogram(self, **kwargs):
        """
        Draws a dendrogram of the fitted merge tree; ``cluster`` must have
        been called first. Keyword arguments are forwarded to
        ``dendrogram``.
        """
        # Evenly spaced placeholder values fill the third and fourth
        # columns of the linkage matrix; only the merge order comes from
        # the fitted model.
        n_merges = self.children.shape[0]
        distance = np.arange(n_merges)
        position = np.arange(n_merges)

        # Create linkage matrix and then plot the dendrogram
        linkage_matrix = np.column_stack(
            [self.children, distance, position]
        ).astype(float)

        plt.subplots(figsize=(15, 7))  # set size
        dendrogram(linkage_matrix, **kwargs)
        # Booleans, not the strings 'off': non-empty strings are truthy and
        # would leave the ticks and labels switched on.
        plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
        plt.tight_layout()
        plt.show()
示例3: buckshot
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def buckshot(k, mat):
    """Pick ``k`` seed centroids by clustering a random sample of rows.

    A sample of ``int(sqrt(k * n_rows))`` rows is drawn from ``mat``
    (indices come from ``np.random.randint``, so a row may be drawn more
    than once), clustered with average-linkage agglomerative clustering,
    and the mean of each resulting cluster is returned.

    Args:
        k: number of clusters / centroids.
        mat: 2-D array-like exposing ``shape`` and integer-array row indexing.

    Returns:
        A list of ``k`` centroid vectors, ordered by cluster label.
    """
    size = int((k * mat.shape[0]) ** .5)
    print(size)
    inds = np.random.randint(0, mat.shape[0], size)
    print(inds)
    # Copy the sampled rows in one step; float dtype matches the
    # zero-initialised buffer the rows were previously copied into.
    samp = np.array(mat[inds], dtype=float)

    # agglomerative clustering on sample
    # NOTE(review): recent scikit-learn releases rename `affinity` to
    # `metric` - confirm against the installed version.
    hier = AgglomerativeClustering(n_clusters=k, linkage='average',
                                   affinity='euclidean',
                                   compute_full_tree=True)
    flat = hier.fit_predict(samp)

    # find centroids
    centroids = []
    for j in range(k):
        members = samp[flat == j]
        print(len(members))
        centroids.append(members.mean(axis=0))
    return centroids
示例4: sp_connectivity
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def sp_connectivity(self, X, connectivity, n_clusters, n):
    """Run Ward agglomerative clustering constrained by ``connectivity``.

    Args:
        X: samples to cluster.
        connectivity: connectivity constraint forwarded to the estimator.
        n_clusters: number of clusters to produce.
        n: unused; kept so existing callers keep working.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    # The labels come straight from fit_predict, so no result buffer is
    # pre-allocated. The timing and plotting code that used to sit around
    # this call in commented-out form has been dropped.
    model = AgglomerativeClustering(linkage="ward",
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    return model.fit_predict(X)
示例5: clustering_approach
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def clustering_approach(self):
    """
    Cluster user data using various clustering algos
    IN: self.df_full and self.labels
    OUT: results to stdout

    The features are standardised, clustered with KMeans and with
    AgglomerativeClustering, and each labelling is passed to
    self.analyse_preds together with the reference labels.
    """
    # print(...) with a single argument emits the same text on Python 2
    # and Python 3.
    print('Fitting clustering model')
    y = self.labels

    # scale data
    X = StandardScaler().fit_transform(self.df_full.values)

    # KMeans
    # NOTE(review): newer scikit-learn releases no longer accept `n_jobs`
    # on KMeans - confirm against the installed version.
    km_clf = KMeans(n_clusters=2, n_jobs=6)
    km_clf.fit(X)
    # swap labels as super-users are in cluster 0 (messy!!)
    temp = y.apply(lambda x: 0 if x == 1 else 1)
    print('\nKMeans clustering: ')
    self.analyse_preds(temp, km_clf.labels_)

    # Agglomerative clustering
    print('\nAgglomerative clustering approach: ')
    ac_labels = AgglomerativeClustering().fit_predict(X)
    self.analyse_preds(y, ac_labels)
    return None
示例6: calculateNumberOfIdealClusters
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def calculateNumberOfIdealClusters(maxAmount, corpus):
    """Choose the cluster count with the highest average silhouette score.

    Ward agglomerative clustering is run for every cluster count in
    ``range(2, maxAmount)`` and scored with ``silhouette_score``.

    Args:
        maxAmount: exclusive upper bound on the cluster counts tried.
        corpus: feature matrix to cluster.

    Returns:
        The cluster count with the highest average silhouette score, or 2
        when no candidate was evaluated.
    """
    print("Initializing silhouette analysis")
    # Silhouette scores lie in [-1, 1]. Starting from the lower bound lets
    # a run whose averages are all negative still report its true best
    # candidate instead of silently falling back to 2.
    silhouette_high = -1.0
    silhouette_high_n_clusters = 2
    # max amount of clusters equal to amount of jobs
    for n_clusters in range(2, maxAmount):
        # Initialize the clusterer with n_clusters value
        cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                          linkage="ward",
                                          affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f"
              % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

    print("Highest score = %f for n_clusters = %d"
          % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
示例7: agglom
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def agglom(data, n_clusters):
    """Ward clustering of ``data`` constrained to a 30-nearest-neighbour graph.

    Args:
        data: samples to cluster.
        n_clusters: number of clusters to produce.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    # use ward / average / complete for different results
    cluster = AgglomerativeClustering(n_clusters=n_clusters,
                                      connectivity=knn_graph,
                                      linkage='ward')
    # fit_predict already fits the model, so a separate fit() beforehand
    # would only repeat the whole clustering.
    return cluster.fit_predict(data)
示例8: openfaceExp
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def openfaceExp(lfwAligned, net, cls):
    """Time and score agglomerative clustering on network representations.

    For every ``nPpl`` in the module-level ``nPplVals`` the data is loaded
    with ``getData``, split ten times by ``ShuffleSplit``, and each split
    is timed and scored with ``accuracy_score``.

    Args:
        lfwAligned: forwarded to ``getData``.
        net: object whose ``forward(img)`` returns an image representation.
        cls: overwritten inside the loop by a fresh AgglomerativeClustering;
            kept in the signature for interface compatibility.

    Returns:
        A DataFrame with one row per ``nPpl`` holding the mean and standard
        deviation of training time, per-sample prediction time and accuracy.
    """
    df = pd.DataFrame(columns=('nPpl', 'nImgs',
                               'trainTimeSecMean', 'trainTimeSecStd',
                               'predictTimeSecMean', 'predictTimeSecStd',
                               'accsMean', 'accsStd'))

    # Representations of training images, keyed by a hash of the image data.
    repCache = {}

    df_i = 0
    for nPpl in nPplVals:
        print(" + nPpl: {}".format(nPpl))
        cls = AgglomerativeClustering(n_clusters=nPpl)
        (X, y) = getData(lfwAligned, nPpl, nImgs, size=96, mode='rgb')
        nSampled = X.shape[0]
        ss = ShuffleSplit(nSampled, n_iter=10, test_size=0.1, random_state=0)

        allTrainTimeSec = []
        allPredictTimeSec = []
        accs = []

        for train, test in ss:
            X_train = []
            for img in X[train]:
                h = hash(str(img.data))
                if h in repCache:
                    rep = repCache[h]
                else:
                    rep = net.forward(img)
                    repCache[h] = rep
                X_train.append(rep)

            start = time.time()
            X_train = np.array(X_train)
            cls.fit(X_train, y[train])
            trainTimeSec = time.time() - start
            allTrainTimeSec.append(trainTimeSec)

            # The prediction timing deliberately includes the forward
            # passes for the test images, as in the original measurement.
            start = time.time()
            X_test = [net.forward(img) for img in X[test]]
            # NOTE(review): fit_predict re-fits on the test split, and the
            # cluster ids are then compared directly with y[test] - confirm
            # that accuracy is meaningful without matching ids to labels.
            y_predict = cls.fit_predict(X_test)
            predictTimeSec = time.time() - start
            allPredictTimeSec.append(predictTimeSec / len(test))
            y_predict = np.array(y_predict)

            # Single-argument print(...) keeps the output identical on
            # Python 2 and Python 3.
            print("{} {}".format(y[test], y_predict))
            acc = accuracy_score(y[test], y_predict)
            print(acc)
            accs.append(acc)

        df.loc[df_i] = [nPpl, nImgs,
                        np.mean(allTrainTimeSec), np.std(allTrainTimeSec),
                        np.mean(allPredictTimeSec), np.std(allPredictTimeSec),
                        np.mean(accs), np.std(accs)]
        df_i += 1

    return df
示例9: get_topics
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def get_topics(X_lsi, text_names, nk=1):
    """Group papers into ``nk`` topics by clustering their vectors.

    Average-linkage agglomerative clustering with cosine affinity is
    applied to ``X_lsi``; each label is paired with the name at the same
    position in ``text_names``.

    Returns:
        A tuple ``(paper_to_topic, topic_to_papers)``: a ``defaultdict(int)``
        mapping each name to its label, and a ``defaultdict(list)`` mapping
        each label to the names assigned to it.
    """
    clusterer = AgglomerativeClustering(
        n_clusters=nk, affinity='cosine', linkage='average')
    assignments = clusterer.fit_predict(X_lsi)

    paper_to_topic = defaultdict(int)
    topic_to_papers = defaultdict(list)
    for name, label in zip(text_names, assignments):
        topic_to_papers[label].append(name)
        paper_to_topic[name] = label
    return paper_to_topic, topic_to_papers
示例10: hierarchicalCluster
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def hierarchicalCluster(corr_matrix_df, n_clusters):
    """Cluster the rows of a correlation matrix with the Ward method.

    Returns a single-column DataFrame named 'Cluster' that shares the index
    of ``corr_matrix_df`` and holds the label assigned to each row.
    """
    model = AgglomerativeClustering(
        linkage='ward', affinity='euclidean', n_clusters=n_clusters)
    labels = model.fit_predict(corr_matrix_df)
    return pd.DataFrame(
        labels, index=corr_matrix_df.index, columns=['Cluster'])
示例11: agglomerative_clustering
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def agglomerative_clustering(crime_rows, column_names, num_clusters):
    """Cluster crime rows by their first two fields and format the result.

    The first two entries of every row are used as the clustering input
    and the remaining entries are carried along as per-row information.
    Clustering is constrained by a 2-nearest-neighbour connectivity graph
    built from the same two entries.

    Returns:
        Whatever ``_format_clustering`` returns for the labels, the
        two-field rows, the remaining fields and ``column_names``.
    """
    coordinates = [row[0:2] for row in crime_rows]
    attributes = [row[2:] for row in crime_rows]

    print("Running Agglomerative Clustering")
    knn_connectivity = neighbors.kneighbors_graph(coordinates, n_neighbors=2)
    model = AgglomerativeClustering(n_clusters=num_clusters,
                                    connectivity=knn_connectivity)
    labels = model.fit_predict(coordinates)

    print("formatting....")
    return _format_clustering(labels, coordinates, attributes, column_names)
示例12: test_agglomerative_clustering_with_distance_threshold_edge_case
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def test_agglomerative_clustering_with_distance_threshold_edge_case(
        linkage, threshold, y_true):
    """Check the labelling when the threshold sits at the merge distance.

    NOTE(review): ``linkage``, ``threshold`` and ``y_true`` are presumably
    supplied by a parametrisation that is not visible here - confirm.
    """
    # Two one-dimensional samples, one unit apart.
    points = [[0], [1]]
    model = AgglomerativeClustering(
        linkage=linkage,
        distance_threshold=threshold,
        n_clusters=None)
    predicted = model.fit_predict(points)
    assert adjusted_rand_score(y_true, predicted) == 1
示例13: agglomorative_clustering
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def agglomorative_clustering(df_in):
    """Label the rows of ``df_in`` with one of eight Ward clusters.

    The labels are written to a 'cluster' column of ``df_in`` itself;
    nothing is returned.
    """
    model = AgglomerativeClustering(n_clusters=8, linkage='ward')
    labels = model.fit_predict(df_in)
    # attach cluster-label to dataframe
    df_in['cluster'] = labels
示例14: clusterize
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def clusterize(matrices):
    """Split ``matrices`` into two clusters using precomputed distances.

    The pairwise distances come from ``distance_matrix(matrices)`` and are
    clustered with complete linkage. A one-line summary (mean distance and
    standard deviation of the infinity norms) is printed first.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    # Disabled alternative: DBSCAN(metric="precomputed", eps=25, min_samples=50)
    model = AgglomerativeClustering(
        linkage="complete", affinity="precomputed", n_clusters=2)
    distances = distance_matrix(matrices)

    mean_distance = numpy.mean(distances)
    inf_norms = [numpy.linalg.norm(m, numpy.inf) for m in matrices]
    summary = "mean of distances is {} and std of norms is {}".format(
        mean_distance, numpy.std(inf_norms))
    print(summary)
    # Disabled diagnostics: pyplot.plot(<norms>, 'ro') and
    # pyplot.hist(distances.flatten(), bins=20), each followed by pyplot.show()

    return model.fit_predict(distances)
示例15: find_steady_coalition
# 需要导入模块: from sklearn.cluster import AgglomerativeClustering [as 别名]
# 或者: from sklearn.cluster.AgglomerativeClustering import fit_predict [as 别名]
def find_steady_coalition(
        working_directory=r"C:\Users\ORI\Documents\IDC-non-sync\ML_Course\Election\Data\\"):
    """Search for a cluster of voters that forms a majority coalition.

    The election data is loaded with ``prepare_the_data`` and the training
    samples are clustered with Ward linkage for 5, 4 and 3 clusters in
    turn. For each clustering the largest cluster is inspected: a party
    counts as committed when more than half of its voters fall inside that
    cluster. The search stops at the first clustering in which the
    committed parties' voters inside the cluster exceed half of all
    training voters. Progress and results are printed.

    Args:
        working_directory: folder containing 'ElectionsData.csv'; also
            passed to ``prepare_the_data``. Defaults to the location that
            was previously hard-coded.
    """
    file_name = os.path.join(working_directory, r'ElectionsData.csv')
    (train, validation, test, feature_categorical_dictionary,
     train_idx, test_idx, number_to_party_dictionary) = prepare_the_data(
        file_name, working_directory)

    for n_clusters in [5, 4, 3]:
        print("---------------")
        clusters = AgglomerativeClustering(linkage='ward',
                                           n_clusters=n_clusters)
        cluster_labels = clusters.fit_predict(train.data)

        cluster_sizes = np.bincount(cluster_labels)
        cluster_shares = cluster_sizes / np.sum(cluster_sizes).astype('float32')
        # is there any cluster with more than 50% of the votes?
        coalition_exists = np.any(cluster_shares > 0.5)
        print("number_of_clustes {0}".format(n_clusters))
        print("coalition_exists: {0} ".format(coalition_exists))

        # find all the parties belong to the cluster
        biggest_cluster = np.argmax(cluster_shares)
        party_sizes = np.bincount(train.labels.astype('int64'))
        # minlength keeps both counts aligned even when the highest-numbered
        # parties have no voter inside the biggest cluster.
        biggest_cluster_voters = np.bincount(
            train.labels[cluster_labels == biggest_cluster].astype('int64'),
            minlength=len(party_sizes))

        # normalize the votes by the size of their parties:
        votes_out_of_party = biggest_cluster_voters / party_sizes.astype('float32')
        # committed parties = parties with the majority of their votes in
        # the cluster
        committed = votes_out_of_party > 0.5
        percentage_of_voters_in_commited_coalition = (
            np.sum(biggest_cluster_voters[committed]) * 1.0 / len(train.labels))
        print(percentage_of_voters_in_commited_coalition)

        if percentage_of_voters_in_commited_coalition > 0.5:
            print("coalition found")
            # Positions in `committed` are the party numbers themselves, so
            # the lookup does not depend on dictionary key order.
            print("parties in coalition:{0}".format(
                [number_to_party_dictionary[party]
                 for party, is_committed in enumerate(committed)
                 if is_committed]))
            break
    # NOTE(review): the original indentation was lost; this closing
    # separator is assumed to follow the loop - confirm.
    print("---------------")