

Python DBSCAN.predict Method Code Examples

This page collects typical usage examples of the Python method sklearn.cluster.DBSCAN.predict. If you are wondering exactly how DBSCAN.predict is used, what it does, or what calling it looks like in practice, the curated examples below may help. You can also explore further usage examples of the containing class, sklearn.cluster.DBSCAN.


The following presents 2 code examples of the DBSCAN.predict method, sorted by popularity by default.
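One caveat before the examples: in released versions of scikit-learn, DBSCAN is a transductive clusterer and does not actually define a predict method; labels come from fit_predict, or from the labels_ attribute after fit (Example 2 below guards for exactly this with hasattr). A minimal sketch of the usual pattern, using made-up data:

import numpy as np
from sklearn.cluster import DBSCAN

X = np.array([[1.0, 2.0], [1.1, 2.1], [0.9, 1.9],
              [8.0, 8.0], [8.1, 8.2], [25.0, 80.0]])

db = DBSCAN(eps=0.5, min_samples=2)
labels = db.fit_predict(X)  # one label per row; -1 marks noise
print(labels)               # e.g. [ 0  0  0  1  1 -1]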

Example 1: main

# Required import: from sklearn.cluster import DBSCAN [as alias]
# Or alternatively: from sklearn.cluster.DBSCAN import predict [as alias]
# (the full script also relies on getopt, sys, os, numpy as np, matplotlib's
#  PdfPages, sklearn.preprocessing.StandardScaler, and sklearn.mixture)
def main(argv):
    dbscan_heuristic_mode = False
    dpgmm_mode = False
    do_plot_clusters = False
    do_dump_clusters = False
    try:
        opts, args = getopt.getopt(argv,"hegdp")
    except getopt.GetoptError:
        print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
            print('  -h = help, -e = run dbscan' +
                  ' epsilon heuristic plot generation code')
            print('  -g = use a DPGMM for clustering')
            print('  -p = plot the clusters to a PDF file')
            print('  -d = dump the clusters to a text file')
            sys.exit()
        elif opt == '-e':
            dbscan_heuristic_mode = True
        elif opt == '-g':
            dpgmm_mode = True
        elif opt == '-p':
            do_plot_clusters = True
        elif opt == '-d':
            do_dump_clusters = True

    [elviz_data, combined_df] = read_pickle_or_CSVs(DATA_PICKLE, RAW_DATA_DIR)

    # Setup plotting limits
    print("determining plotting limits")
    limits = {"x": [combined_df['Average fold'].min(), MAX_AVG_FOLD],
              "y": [combined_df['Reference GC'].min(), combined_df['Reference GC'].max()]}
    # the x upper limit was previously combined_df['Average fold'].max();
    # it was changed in favor of the fixed MAX_AVG_FOLD used above

    print("normalizing data prior to clustering")
    # normalize the combined data to retrieve the normalization parameters
    scaler = StandardScaler().fit(combined_df[CLUSTER_COLUMNS])
    # the fitted scaler is reused below to transform each taxonomy subset

    if dbscan_heuristic_mode:
        print("making DBSCAN heuristic plots")
        dbscan_heuristic(elviz_data, scaler)
        sys.exit()

    print("serially processing files")
    for filename in elviz_data.keys():
        pdf_filename = filename.replace("csv", "pdf")
        # skip if the PDF already exists
        if os.path.isfile(RESULTS_DIR + pdf_filename):
            print("skiping file %s" % filename)
            continue
        print("processing file %s" % filename)

        df = elviz_data[filename]

        # create a multipage PDF for storing the plots
        with PdfPages(RESULTS_DIR + pdf_filename) as pdf:
            # find unique values of taxonomy columns
            dfgb = df.groupby(['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'])
            for key in dfgb.indices.keys():
                idx = dfgb.indices[key]
                tax_rows = df.iloc[idx]
                if len(tax_rows) < MIN_ROWS:
                    continue
                # normalize all dimensions to be used in clustering, e.g. GC, coverage, rpk
                # reuse the scaler we created from all of the data for the transform
                tax_rows_cluster_columns = scaler.transform(tax_rows[CLUSTER_COLUMNS])

                if not dpgmm_mode:
                    db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
                    db.fit(tax_rows_cluster_columns)

                    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                    core_samples_mask[db.core_sample_indices_] = True
                    labels = db.labels_
                else:
                    # note: mixture.DPGMM is the legacy scikit-learn API
                    # (deprecated in 0.18, removed in 0.20 in favor of
                    # BayesianGaussianMixture)
                    db = mixture.DPGMM(n_components=DPGMM_N_COMPONENTS, n_iter=100,
                                       covariance_type='full', alpha=100, verbose=0)
                    db.fit(tax_rows_cluster_columns)
                    Y_ = db.predict(tax_rows_cluster_columns)
                    for i, (mean, covar) in enumerate(zip(
                        db.means_, db._get_covars())):
                        if not np.any(Y_ == i):
                            continue
                        #plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
                    labels = Y_
                    # DPGMM has no core-sample concept; treat every point as core
                    core_samples_mask = np.zeros_like(labels, dtype=bool)
                    core_samples_mask[:] = True
                            
                #print(labels)
                #print(type(labels))

                # number of clusters in labels, ignoring noise if present.
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

                if n_clusters_ < 1:
#.........rest of the code omitted here.........
Author: JanetMatsen | Project: elvizAnalysis | Lines: 103 | Source: elviz_cluster.py
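Note that the DBSCAN branch of Example 1 never calls predict: it fits the estimator, then reads labels_ and core_sample_indices_. A condensed, self-contained sketch of that pattern (EPS and MIN_SAMPLES are placeholders standing in for the module-level constants of the original project):

import numpy as np
from sklearn.cluster import DBSCAN

EPS, MIN_SAMPLES = 0.3, 10                  # placeholder values
X = np.random.RandomState(0).randn(200, 3)  # stands in for the scaled cluster columns

db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES).fit(X)
labels = db.labels_

core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# number of clusters found, ignoring the noise label (-1)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)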

Example 2: get_hits

# Required import: from sklearn.cluster import DBSCAN [as alias]
# Or alternatively: from sklearn.cluster.DBSCAN import predict [as alias]
# (this excerpt additionally assumes: import numpy as np; import root_numpy as rnp)
def get_hits(file_path):

    branch_list = [
        #'spill',
        #'tdc_time_stamp',
        #'trigger_counter',
        'number_hits',
        'tdc_number',
        'hit_channel',
        'hit_time_bin',
        ]

    arr = rnp.root2array(file_path, 'DataQuality/mwpc', branch_list)

    #spill = arr['spill'].astype(np.int64)
    #tdc_time_stamp = arr['tdc_time_stamp'].astype(np.int64) #/ 106.208  # microseconds
    #trigger_counter = arr['trigger_counter']
    number_hits = arr['number_hits']
    tdc_number = arr['tdc_number']
    hit_channel = arr['hit_channel']
    hit_time_bin = arr['hit_time_bin']

    number_entries = arr.size
    number_tdcs = 16

    time_bin_scaling = 1.0 / 1280.0
    channel_scaling = 1.0 / 64.0
    dbscan = DBSCAN(eps=4.0/64.0, min_samples=1)  # eps = 4 channel widths in scaled units
    #colors = np.array([ x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk' ])
    #colors = np.hstack([colors] * 20)

    good_hit_array = [ [] for i in range(number_tdcs) ]
    bad_hit_array = [ [] for i in range(number_tdcs) ]

    for entry in range(number_entries):

        hit_time_buffer = [ [] for i in range(number_tdcs) ]
        hit_channel_buffer = [ [] for i in range(number_tdcs) ]

        for tdc_index in range(number_tdcs):
            flag = (tdc_number[entry] == tdc_index + 1)
            hit_time_buffer[tdc_index].extend(hit_time_bin[entry][flag])
            hit_channel_buffer[tdc_index].extend(hit_channel[entry][flag])

        for tdc_index in range(number_tdcs):

            data = np.array([
                    np.array(hit_channel_buffer[tdc_index]).astype(np.int64),
                    np.array(hit_time_buffer[tdc_index]).astype(np.int64)
                ]).T

            scaled_data = np.array([
                    np.array(hit_channel_buffer[tdc_index]).astype(np.int64) \
                    * channel_scaling,
                    np.array(hit_time_buffer[tdc_index]).astype(np.int64) \
                    * time_bin_scaling
                ]).T

            if len(scaled_data) != 0:
                dbscan.fit(scaled_data)
                if hasattr(dbscan, 'labels_'):
                    y_pred = dbscan.labels_.astype(int)
                else:
                    # scikit-learn's DBSCAN does not implement predict(); this
                    # fallback would only run for an estimator that does
                    y_pred = dbscan.predict(scaled_data)

                #print '///////////////////////////////////////'
                cluster_indices = np.unique(y_pred)
                #print cluster_indices
                #print data.shape
                #print y_pred.shape
                #print y_pred
                for cluster_index in cluster_indices:
                    cluster = data[y_pred == cluster_index]
                    if len(cluster) > 10:
                        bad_hit_array[tdc_index].extend(cluster)
                        continue
                    # z holds the hit(s) with the earliest time bin in the cluster
                    z = cluster[np.where(cluster[:, 1] == cluster[:, 1].min())]
                    if len(z) == 1:
                        good_hit_array[tdc_index].append(z[0])
                    else:
                        bad_hit_array[tdc_index].extend(cluster)
                        continue
                    #else:
                    #    mean_channel = np.mean(z[:, 0])

                    # view each row as one opaque (void) scalar so np.in1d
                    # can test whole-row membership of cluster rows in z
                    dtype = np.dtype((np.void, (cluster.shape[1] *
                                                cluster.dtype.itemsize)))
                    mask = np.in1d(cluster.view(dtype), z.view(dtype))
                    bad_hits = cluster[~mask]
                    bad_hit_array[tdc_index].extend(bad_hits)

                #print '///////////////////////////////////////'

    good_hit_array = np.array(
            [ np.array(hits) for hits in good_hit_array ]
        )
    bad_hit_array = np.array(
            [ np.array(hits) for hits in bad_hit_array ]
        )
#.........rest of the code omitted here.........
Author: lariat | Project: dqm-v2 | Lines: 103 | Source: cluster.py
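The whole-row matching idiom near the end of Example 2 is worth isolating: viewing each 2-D row as a single opaque (void) scalar lets np.in1d test row membership directly. A minimal sketch with toy data:

import numpy as np

cluster = np.array([[3, 10], [5, 7], [6, 7]])
z = cluster[cluster[:, 1] == cluster[:, 1].min()]   # rows with the earliest time bin

dtype = np.dtype((np.void, cluster.shape[1] * cluster.dtype.itemsize))
mask = np.in1d(cluster.view(dtype), z.view(dtype))  # True where a cluster row appears in z
print(cluster[~mask])                               # -> [[ 3 10]]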


Note: the sklearn.cluster.DBSCAN.predict method examples on this page were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs; the snippets are drawn from open-source projects contributed by their respective developers. Copyright in the source code remains with the original authors; consult each project's License before distributing or reusing it, and do not reproduce this page without permission.