当前位置: 首页>>代码示例>>Python>>正文


Python cluster.DBSCAN类代码示例

本文整理汇总了Python中sklearn.cluster.DBSCAN的典型用法代码示例。如果您正苦于以下问题:Python DBSCAN类的具体用法?Python DBSCAN怎么用?Python DBSCAN使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了DBSCAN类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: current_datapoints_dbscan

	def current_datapoints_dbscan(self):
		"""
		Cluster the current outlier points into slice-clusters using DBSCAN.

		The 'ids', 'array' (coordinates) and 'weights' entries of every network in
		self.current_datapoints are concatenated and clustered with
		DBSCAN(eps=self.eps, min_samples=5). Points labelled as noise (label < 0)
		are dropped from the result.

		Returns a dict mapping cluster label -> list of point dicts with the keys
		'id', 'lng', 'lat', 'weight' and 'is_core'. Returns an empty dict when
		there are no networks or no points at all.
		"""
		nets = list(self.current_datapoints)
		# concatenate() raises ValueError on an empty sequence, so bail out
		# before touching it when no network is present.
		if not nets:
			return {}
		ids = concatenate([self.current_datapoints[x]['ids'] for x in nets])
		coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
		weights = concatenate([self.current_datapoints[x]['weights'] for x in nets])
		if len(ids) == 0:
			return {}
		clustering = DBSCAN(eps=self.eps, min_samples=5)
		labels = clustering.fit_predict(coords)
		# Build the core-id lookup once: set membership is O(1), whereas testing
		# against the numpy array rescans it for every single point.
		core_ids = set(ids[clustering.core_sample_indices_].tolist())
		ret_tab = {}
		for point_id, coord, weight, label in zip(ids, coords, weights, labels):
			if label < 0:
				# noise point, belongs to no cluster
				continue
			ret_tab.setdefault(label, []).append({
				'id': point_id,
				'lng': coord[0],
				'lat': coord[1],
				'weight': weight,
				'is_core': point_id in core_ids,
			})
		return ret_tab
开发者ID:city-pulse,项目名称:mskpulse.backend,代码行数:26,代码来源:detector.py

示例2: cluster_dbscan

def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Run DBSCAN (min_samples=2) on a distance matrix or a data matrix.

    Parameters
    ----------
    matrix: np.matrix
        When ``distance_measure`` is "sts" this is handed to DBSCAN as a
        precomputed distance matrix. For any other measure it is handed to
        DBSCAN as raw data (per the original notes: the time-series matrix
        of size ngenes x nsamples).
    distance_measure: str
        "sts" (short time-series distance, the default) or a metric name
        that DBSCAN accepts, e.g. 'cityblock', 'cosine', 'euclidean', 'l1',
        'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev',
        'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
        'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
        'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
        'sqeuclidean', 'yule'.
        Note: multiple time-series is NOT supported for distances other
        than "sts".
    eps: number
        Passed straight through as DBSCAN's ``eps`` (neighbourhood radius).

    Returns
    -------
    cluster_labels: array-like of int
        The result of ``DBSCAN.fit_predict(matrix)``: one label per row.
    """
    # "sts" input is already a distance matrix; every other measure names
    # the metric DBSCAN should compute on the raw rows itself.
    metric = 'precomputed' if distance_measure == "sts" else distance_measure
    estimator = DBSCAN(eps=eps, metric=metric, min_samples=2)
    return estimator.fit_predict(matrix)
开发者ID:beiko-lab,项目名称:ananke,代码行数:31,代码来源:_cluster.py

示例3: _fit_dbscan

    def _fit_dbscan(self, x):
        """Fit DBSCAN ``self.repeats`` times on ``x`` and score each run.

        For every repeat ``r`` this stores the DBSCAN labels in
        ``self._labels[r]``, the core sample indices in
        ``self._parameters[r]``, the summed GMM score in ``self._ll[r]`` and,
        depending on ``self.gof_type`` ('aic' or 'bic'), the matching
        criterion in ``self._gof[r]``.

        :param x: data handed to ``DBSCAN.fit_predict`` and to the GMM scoring
            calls.
        """
        # clustering
        for r in xrange(self.repeats):
            # fit and evaluate model
            model = DBSCAN(eps=1.0, min_samples=100)
            model.fit_predict(x)
            # number of clusters found, not counting the noise label -1
            k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
            self._labels[r] = model.labels_
            self._parameters[r] = model.core_sample_indices_

            # build equivalent gmm
            model_gmm = GMM(n_components=k, covariance_type="full")
            # NOTE(review): means_ is assigned the core sample *indices*, not
            # the core sample coordinates - confirm this is intended.
            model_gmm.means_ = model.core_sample_indices_
            model_gmm.covars_ = sp.ones(
                (k, self.input_dim)) * self.sigma_factor
            model_gmm.weights_ = sp.array(
                [(self._labels[r] == i).sum() for i in xrange(k)])

            # evaluate goodness of fit
            self._ll[r] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[r] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model_gmm.bic(x)

            # debug info: emitted only once k is known. The former header
            # print ran before k was assigned and raised UnboundLocalError
            # on the first repeat.
            if self.debug is True:
                print('\t[%s][c:%d][r:%d] %s' % (
                    self.clus_type, k, r + 1, self._gof[r]))
开发者ID:pmeier82,项目名称:BOTMpy,代码行数:32,代码来源:cluster.py

示例4: cluster_mappings

def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.5, min_s=20):
	"""
	Load mapping vectors, optionally reduce them with PCA, and cluster them with DBSCAN.

	vector_inpath: path handed to load_mappings_from_model.
	do_pca: when true, project the vectors onto target_dim PCA components first.
	target_dim: number of PCA components (only used when do_pca is true).
	indices_inpath: when truthy, resolve_indices is called with the indices, labels, this path and the model.
	epsilon: DBSCAN eps.
	min_s: DBSCAN min_samples.

	Prints progress, the cluster sizes and - when it is defined - the silhouette coefficient. Returns None.
	"""
	# TODO: Clustering parameters
	# TODO: Metric cosine similarity or euclidean distance
	print(alt("Load mappings..."))
	indices, model = load_mappings_from_model(vector_inpath)
	X = numpy.array([model[key] for key in indices])
	if do_pca:
		print(alt("Truncate vectors with PCA to %i dimensions..." %(target_dim)))
		X = PCA(n_components=target_dim).fit_transform(X)
	print(alt("Cluster points..."))
	# alternative setting: DBSCAN(eps=0.1, min_samples=20, metric='cosine', algorithm='brute')
	dbscan = DBSCAN(eps=epsilon, min_samples=min_s)
	dbscan.fit(X)
	labels = dbscan.labels_
	print(get_cluster_size(labels))
	print(alt("Finished clustering!"))
	# silhouette_score only accepts 2 <= n_labels <= n_samples - 1. DBSCAN can
	# easily return a single label (all noise / one cluster), which used to
	# abort the run before the indices were resolved.
	n_labels = len(set(labels))
	if 2 <= n_labels <= len(labels) - 1:
		sscore = silhouette_score(X, labels)
		print("Silhouette Coefficient: %0.3f" %(sscore))
	else:
		print("Silhouette Coefficient: undefined for %i distinct label(s)" % n_labels)
	if indices_inpath:
		resolve_indices(indices, labels, indices_inpath, model)
开发者ID:dboth,项目名称:thesis_ba,代码行数:25,代码来源:cluster_mappings.py

示例5: dbscan_outliers

def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    """Yield genes whose per-sample values split into a small DBSCAN group.

    For each gene the row ``data.loc[g, :]`` is clustered as a 1-D point set.
    The sizes of the two lowest labels are compared and the smaller one is
    taken as the "support". A gene is reported when
    ``min_samples < support <= max_samples``.

    Parameters
    ----------
    data : object indexable with ``.loc[gene, :]`` (one value per sample)
    genes : iterable of gene ids
    eps, min_samples : passed to ``DBSCAN``
    max_samples : upper bound (inclusive) for the support
    as_json : when true yield ``json.dumps(info) + ","``, otherwise the dict

    Yields
    ------
    str or dict
        Keys: "id", "name", "type", "samples", "distance".
    """
    db = DBSCAN(eps=eps, min_samples=min_samples)
    res = dr.get_dataset_ensembl_info()
    for g in genes:
        # Column vector of whatever length the row has (was hard-coded to 196).
        values = np.asarray(data.loc[g, :]).reshape(-1, 1)
        fit = db.fit(values)

        # Counts per label, ordered by label value.
        _, counts = np.unique(fit.labels_, return_counts=True)
        if len(counts) < 2:
            # Only one label present: nothing to compare.
            continue

        support = min(counts[0], counts[1])
        if not (min_samples < support <= max_samples):
            continue

        info = next((gene for gene in res if gene.ensemblgeneid == g), None)
        if info is None:
            # No annotation for this gene: skip it. A bare ``except`` used to
            # hide this case and would also have swallowed GeneratorExit.
            continue

        formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                          "distance": "NA"}
        print("outlier found :" + g)
        if as_json:
            yield json.dumps(formatted_info) + ","
        else:
            yield formatted_info
开发者ID:armell,项目名称:RNASEqTool,代码行数:31,代码来源:outliers.py

示例6: cluster

def cluster():
    """Grid-search DBSCAN's eps / min_samples and plot the ARI heat map.

    Reads the module-level ``x`` (data) and ``y`` (reference labels). For
    every (eps, min_samples) pair it fits DBSCAN, prints the adjusted Rand
    index and appends ``(ari, n_clusters, n_noise)`` to the module-level list
    ``res`` (which is reset on every call). Finally prints the best score
    with its parameters and shows the score grid with matplotlib.
    """
    global res
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            n_noise = int(np.sum(est.labels_ == -1))
            res.append((ari, np.max(est.labels_) + 1, n_noise))
            print(ari)
    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    # scores is filled row-major (eps outer, min_samples inner). divmod keeps
    # both indices integral regardless of the division semantics in effect.
    eps_idx, npt_idx = divmod(max_idx, len(npt_set))
    max_eps = eps_set[eps_idx]
    max_npt = npt_set[npt_idx]
    print('%s %s %s' % (max_score, max_eps, max_npt))
    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    # NOTE(review): newer matplotlib releases expose this colormap as
    # `nipy_spectral` - confirm against the installed version.
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
开发者ID:harrylclc,项目名称:ist557,代码行数:28,代码来源:dbscan.py

示例7: clusterMalwareNames

def clusterMalwareNames(malwareNames):
    """Cluster malware names lexically and count the members of each cluster.

    A symmetric pairwise matrix is filled with ``computeSimilarity`` and fed
    to DBSCAN (eps=0.1, min_samples=5, metric="precomputed").

    :param malwareNames: sequence of names
    :return: dict mapping ``", ".join(uniqueList(members))`` to the number of
        members of that cluster; noise points are ignored. Empty input gives
        an empty dict.
    """
    if not malwareNames:
        # nothing to compare; DBSCAN cannot be fitted on an empty matrix
        return {}

    n_names = len(malwareNames)
    # Fill the upper triangle and mirror it. This computes every pair exactly
    # once; the former "== 0.0" sentinel recomputed all zero-valued pairs.
    matrix = np.zeros((n_names, n_names))
    for i in range(n_names):
        for j in range(i, n_names):
            value = computeSimilarity(malwareNames[i], malwareNames[j])
            matrix[i, j] = value
            matrix[j, i] = value

    # NOTE(review): metric="precomputed" makes DBSCAN read the matrix as
    # distances - confirm computeSimilarity really returns a distance.
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)
    preds = clust.labels_

    # create Word-Count Map
    wordCount = {}
    for label in np.unique(preds):
        if label < 0:
            # negative label = noise
            continue
        cmem_ids = np.where(preds == label)[0]
        cmembers = [malwareNames[cmem_id] for cmem_id in cmem_ids]
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
开发者ID:M0nk2y,项目名称:malware-crawler,代码行数:31,代码来源:vtTool.py

示例8: find_tracks

def find_tracks(data, eps=20, min_samples=20):
    """Group data points into tracks with scikit-learn's DBSCAN.

    Only the first three columns of ``data`` are used for clustering; the
    returned tracks keep every column.

    Parameters
    ----------
    data : array-like
        An array of (x, y, z, hits) data points
    eps : number, optional
        Passed to DBSCAN as ``eps``, i.e. the neighbourhood radius: the
        largest distance at which two points still count as neighbours
    min_samples : number, optional
        Passed to DBSCAN as ``min_samples``

    Returns
    -------
    tracks : list
        One ndarray of points per cluster label, in ascending label order.
        Points labelled -1 (noise) are left out.

    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    clusterer.fit(data[:, 0:3])
    labels = clusterer.labels_

    return [
        data[np.where(labels == label)[0]]
        for label in np.unique(labels)
        if label != -1
    ]
开发者ID:tarvos14,项目名称:pytpc,代码行数:27,代码来源:tracking.py

示例9: classify_core

    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
        """Cluster the x-velocity traces of a time window with the chosen algorithm.

        The window is ``[begin_time, end_time)`` scaled by
        ``self.griddy.TIME_GRID_SPACING``; only the ``self.griddy.VEL_X``
        channel of ``data_for_trial_type`` is used.

        :param N_CLUSTERS: requested cluster count (used by 'kmeans' and
            'AgglomerativeClustering'; overwritten with the detected count for
            'affinity_propagation' and 'DBSCAN').
        :param clusterType: 'kmeans', 'affinity_propagation', 'DBSCAN' or
            'AgglomerativeClustering'.
        :param data_for_trial_type: array indexed as [trial, time, channel].
        :param begin_time: window start, in units that are multiplied by the
            time grid spacing.
        :param end_time: window end, same units.
        :return: ``(labels, N_CLUSTERS)``; ``labels`` is None when
            ``clusterType`` is not recognised (an error line is printed).
        """
        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            # Use the labels just computed; this branch used to read
            # self.labels, which is never set in this method.
            N_CLUSTERS = np.max(labels)+1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print('N_CLUSTERS=' + str(N_CLUSTERS))
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print('ERROR: clusterType: ' + clusterType + ' is not recognized')

        return (labels, N_CLUSTERS)
开发者ID:SashaRayshubskiy,项目名称:osmotropotaxis_analysis_python,代码行数:31,代码来源:fly_trajectory_classifier.py

示例10: cluster_tweets

def cluster_tweets(tweets):
    """Cluster tweets by the TF-IDF vectors of their NER tags using DBSCAN.

    Reads ``tweets['tweet']``, derives one tag list per tweet with
    ``get_ner_tags``, vectorises the tag lists and clusters the dense TF-IDF
    matrix with ``DBSCAN(min_samples=2, eps=0.5)``. Timing and the number of
    clusters (noise label -1 excluded) are printed.

    Returns the same ``tweets`` object with two entries added:
    ``tweets['cluster']`` (DBSCAN labels) and ``tweets['ner']`` (tag lists).
    """
    # TODO get TFIDF vector
    tag_lists = [get_ner_tags(text).tolist() for text in tweets['tweet']]

    # The tag lists are already tokenised, hence the pass-through tokenizer.
    tfidf_builder = TfidfVectorizer(
        preprocessor=_dummy_preprocess,
        tokenizer=lambda x:x,
        binary=True,
        min_df=0,
        use_idf=True,
        smooth_idf=True,
    )
    features = tfidf_builder.fit_transform(tag_lists)

    print("clustering started")
    started = time()
    model = DBSCAN(min_samples=2, eps=0.5)
    model.fit(features.todense())
    labels = model.labels_

    distinct = set(labels)
    n_clusters_ = len(distinct) - (1 if -1 in distinct else 0)
    print("clustering finished in %.3f seconds"%(time()-started))
    print("%d clusters detected"%n_clusters_)

    tweets['cluster'] = labels
    tweets['ner'] = tag_lists
    return tweets
开发者ID:Kaushalya,项目名称:tweet_summary,代码行数:28,代码来源:summarizer.py

示例11: cluster_with_dbscan

def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"):
    """Cluster ``vectors`` with DBSCAN on a precomputed distance matrix.

    ``distances`` is used as-is when given; otherwise it is computed from
    ``vectors`` with ``pairwise_distances`` and the requested ``metric``.
    Returns the labels produced by ``DBSCAN.fit_predict``.
    """
    # Building the matrix up front lets pairwise_distances spread the work
    # over all cores (n_jobs=-1).
    precomputed = (
        pairwise_distances(vectors, n_jobs=-1, metric=metric)
        if distances is None
        else distances
    )
    model = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed")
    return model.fit_predict(precomputed)
开发者ID:CylanceSPEAR,项目名称:NMAP-Cluster,代码行数:7,代码来源:clustering.py

示例12: search_charges

    def search_charges(self, data, z=0, threshold = 30):
        """Locate charges as local extrema of ``data`` and store them on self.

        ``deriv(data, z)`` supplies six arrays ``A[0]..A[5]``. They are used
        here as the coefficients of a local quadratic model (presumably value,
        first derivatives and second derivatives - confirm against ``deriv``).
        The stationary point offset ``(dx, dy)`` of that model is computed per
        pixel. Pixels are kept when the offset lies within one pixel, the
        model value exceeds ``threshold`` in magnitude and the determinant is
        positive. Kept candidates are merged with DBSCAN(min_samples=1, eps=1)
        and each cluster is averaged into one charge.

        :param data: input field, passed to ``deriv``.
        :param z: passed to ``deriv``.
        :param threshold: minimum absolute model value for a candidate.
        :return: ``self`` (after calling ``self.set_charges(qi, xi)``).
        """
        A = deriv(data,z)

        print('Searching charges...')
        time0 = time.time()

        # determinant of the 2x2 matrix [[A[3], A[4]], [A[4], A[5]]]
        det = A[3]*A[5]-A[4]**2

        # offset of the stationary point of the quadratic model
        dx = -(A[1]*A[5]-A[2]*A[4])/det
        # was "Aa[4]", an undefined name that raised NameError
        dy = -(A[2]*A[3]-A[1]*A[4])/det

        # model value at the stationary point
        datamax = A[0]+A[1]*dx+A[2]*dy+A[3]*dx**2/2+A[4]*dx*dy+A[5]*dy**2/2
        t = np.where((np.abs(dx) < 1)*(np.abs(dy) < 1)*(np.abs(datamax) > threshold)*(det > 0))

        # candidate positions: column index + dx, row index + dy
        x = np.array([t[1]+dx[t], t[0]+dy[t]]).T

        db = DBSCAN(min_samples = 1, eps = 1)
        db.fit_predict(x)

        n_charges = np.max(db.labels_)+1
        qi = np.zeros(n_charges)
        # three rows per charge; only the first two are filled here
        xi = np.zeros((3,n_charges))

        for i in range(0, n_charges):
            xi[0:2,i] = np.mean(x[db.labels_ == i,:], axis=0)
            qi[i] = np.mean(datamax[t][db.labels_ == i])

        self.set_charges(qi,xi)
        print('Done! Elapsed time: '+str(time.time()-time0))
        return self
开发者ID:temik42,项目名称:lib,代码行数:31,代码来源:pyfield.py

示例13: on_squaremsg_received

    def on_squaremsg_received(self, msg):
        """Cluster recently detected squares and publish the tracked ones.

        The squares of ``msg`` are converted with ``TrackedSquare.from_msg``
        and appended to ``self._prev_squares``. All squares currently held
        there are clustered with DBSCAN(eps=64, min_samples=3) on
        ``center + [hue]``. Every square that falls into a cluster gets a
        tracking colour chosen by its cluster label, is flagged as detected
        and is added to the published ``TrackedSquares`` message; noise
        squares are left out.

        :param msg: incoming message exposing ``squares``.
        """
        detected_squares = [TrackedSquare.from_msg(square_msg) for square_msg in msg.squares]
        self._prev_squares.append(detected_squares)

        all_squares = list(itertools.chain.from_iterable(self._prev_squares))
        ts_msg = TrackedSquares()
        if not all_squares:
            # DBSCAN cannot be fitted on an empty array: publish an empty
            # result instead of raising inside the callback.
            self._squares_pub.publish(ts_msg)
            return

        data = np.array([list(s.center) + [s.hue] for s in all_squares])

        clusterer = DBSCAN(eps=64, min_samples=3)
        clusterer.fit(data)

        colours = TrackedSquare.TRACKING_COLOURS
        for s, raw_label in zip(all_squares, clusterer.labels_):
            # plain int instead of np.int0, which newer NumPy no longer has
            label = int(raw_label)
            if label < 0:
                # noise: not part of any tracked cluster
                continue

            s.tracking_colour = colours[label % len(colours)]
            s.tracking_detected = True

            ts_msg.squares.append(s.to_msg())

        self._squares_pub.publish(ts_msg)
开发者ID:Knifa,项目名称:Glasgow-Baxter,代码行数:27,代码来源:understanding.py

示例14: plot_dbscan

def plot_dbscan():
    """Plot DBSCAN cluster assignments for a grid of min_samples / eps values.

    Twelve blob samples are clustered once per (min_samples, eps) pair. Each
    result is printed and drawn in its own subplot: all points with small
    markers, core samples again with larger markers.
    """
    X, y = make_blobs(random_state=0, n_samples=12)

    min_samples_grid = [2, 3, 5]
    eps_grid = [1, 1.5, 2, 3]

    fig, axes = plt.subplots(len(min_samples_grid), len(eps_grid), figsize=(11, 8),
                             subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']

    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate(min_samples_grid):
        for j, eps in enumerate(eps_grid):
            ax = axes[i, j]
            # instantiate DBSCAN with a particular setting
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            # get cluster assignments
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f  cluster: %s" % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                # noise sorts first, so its colour/marker goes first
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=ax, c=c, s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # visualize core samples and clusters.
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=ax, s=15, c=colors,
                                 markers=markers)
            ax.set_title("min_samples: %d eps: %.1f" % (min_samples, eps))
    fig.tight_layout()
开发者ID:ABcDexter,项目名称:introduction_to_ml_with_python,代码行数:35,代码来源:plot_dbscan.py

示例15: cluster_DBSCAN

def cluster_DBSCAN(args):
	"""
	Clustering with DBSCAN on an SVD-reduced graph vectorization.

	Loads graphs from args.input_file, vectorizes them (args.radius,
	args.distance, args.nbits, args.n_jobs), projects the result to
	args.n_components dimensions with TruncatedSVD and clusters it with
	DBSCAN(eps=args.eps). The vectorizer is dumped as "vectorizer" and the
	predicted labels are stored as text under "labels", both in
	args.output_dir_path.
	"""
	#load data
	g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file")
	vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits)
	# Arguments are handed to the logger so that formatting only happens
	# when the record is actually emitted.
	logger.info('Vectorizer: %s', vec)

	X = vec.transform(g_it, n_jobs = args.n_jobs)
	logger.info('Instances: %d Features: %d with an avg of %d features per instance', X.shape[0], X.shape[1], X.getnnz() / X.shape[0])

	#project to lower dimensional space to use clustering algorithms
	transformer = TruncatedSVD(n_components=args.n_components)
	X_dense=transformer.fit_transform(X)

	#log statistics on data
	# NOTE(review): the average below is still computed from the sparse X,
	# not from X_dense - confirm that is the intended statistic.
	logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance', X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0])

	#clustering
	clustering_algo = DBSCAN(eps = args.eps)
	y = clustering_algo.fit_predict(X_dense)
	msg = 'Predictions statistics: '
	msg += util.report_base_statistics(y)
	logger.info(msg)

	#save model for vectorizer
	out_file_name = "vectorizer"
	eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name)
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)

	#save result
	out_file_name = "labels"
	eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text")
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
开发者ID:nickgentoo,项目名称:pyEDeN,代码行数:35,代码来源:cluster_DBSCAN.py


注:本文中的sklearn.cluster.DBSCAN类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。