

Python utils.resample Function Code Examples

This article collects typical usage examples of the Python function sklearn.utils.resample. If you are struggling with questions such as: What exactly does resample do? How is it called? What does real usage look like? then the hand-picked code examples below may help.


Fifteen code examples of the resample function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
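
Before diving into the examples, here is a minimal sketch of the basic API (the data below is illustrative): resample draws samples, by default with replacement, from one or more array-likes of the same length, keeping their rows aligned.

import numpy as np
from sklearn.utils import resample

X = np.arange(10).reshape(5, 2)
y = np.array([0, 0, 1, 1, 1])

# Bootstrap sample: with replacement (the default), rows of X and y stay aligned
X_boot, y_boot = resample(X, y, n_samples=5, random_state=0)

# Plain shuffle: draw every row exactly once by disabling replacement
X_shuf, y_shuf = resample(X, y, replace=False, n_samples=len(X), random_state=0)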

Example 1: initialize

    def initialize(self, X, k, random_seed, method='naive'):
        if method == 'naive':
            # Randomly pick k data points to be the centroids of the k clusters
            centroids = resample(X, n_samples=k, random_state=random_seed, replace=False)
        elif method == 'kmeans++': # https://en.wikipedia.org/wiki/K-means%2B%2B
            # Step 1: Choose one center uniformly at random from among the data points
            centroids = resample(X, n_samples=1, random_state=random_seed, replace=False)
            N = len(X)
            # Sample the remaining k - 1 centroids
            for i in range(1, k):
                distances = [ -1 ] * N
                # Step 2: For each data point x, compute D(x)
                for j in range(N):
                    # The distance between x and the nearest center that has already been chosen
                    distances[j] = min(np.linalg.norm(X[j] - centroid) for centroid in centroids)

                # Step 3: Choose one new data point at random as a new center,
                # using a weighted probability distribution where a point x is chosen with probability proportional to D(x)^2
                square_distances = [ distance ** 2 for distance in distances ]
                total_square_distance = sum(square_distances)
                # Already-selected points are naturally excluded, since their probability is 0
                probabilities = [ square_distance / total_square_distance for square_distance in square_distances ]

                new_centroid_index = np.random.choice(range(N), size=1, replace=False, p=probabilities)[0]

                centroids = np.append(centroids, [ X[new_centroid_index] ], axis=0)

        return centroids
Author: bluesilence | Project: python | Lines: 28 | Source: KMeans.py
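
The D(x)^2 weighting in Step 3 deserves a closer look; a tiny illustration (the distances below are made up) shows why already-chosen points can never be re-drawn:

import numpy as np

distances = np.array([0.0, 1.0, 2.0, 3.0])  # 0.0 marks an already-chosen center
probabilities = distances ** 2 / np.sum(distances ** 2)
# A point at distance 0 gets probability 0; distant points are strongly favored
new_index = np.random.choice(len(distances), p=probabilities)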

Example 2: test_resample

def test_resample():
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue #6581: n_samples can exceed the input length when replace is True (the default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5)
Author: allefpablo | Project: scikit-learn | Lines: 11 | Source: test_utils.py
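
The Issue #6581 behavior asserted above is easy to verify interactively; a quick sketch:

from sklearn.utils import resample

# With replacement (the default), n_samples may exceed the input length
print(resample([1, 2], n_samples=5, random_state=0))  # five values drawn from [1, 2]

# Without replacement the same request raises ValueError, as the test asserts:
# resample([1, 2], replace=False, n_samples=5)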

Example 3: run_scikit_digits

def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """ Run Handwritten Digits dataset from Scikit-Learn.  Learning set is split
    into 70% for training, 15% for testing, and 15% for validation.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop over the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    Attributes
    ----------
    target_values : list
        The possible values for each training vector

    """

    # Imported from linear_neuron
    temp_digits = datasets.load_digits()
    digits = utils.resample(temp_digits.data, random_state=3)
    temp_answers = utils.resample(temp_digits.target, random_state=3)
    # images = utils.resample(temp_digits.images, random_state=0)
    num_of_training_vectors = 1250
    answers, answers_to_test, validation_answers = (
        temp_answers[:num_of_training_vectors],
        temp_answers[num_of_training_vectors : num_of_training_vectors + 260],
        temp_answers[num_of_training_vectors + 260 :],
    )
    training_set, testing_set, validation_set = (
        digits[:num_of_training_vectors],
        digits[num_of_training_vectors : num_of_training_vectors + 260],
        digits[num_of_training_vectors + 260 :],
    )

    ###########
    # network.visualization(training_set[10], answers[10])
    # network.visualization(training_set[11], answers[11])
    # network.visualization(training_set[12], answers[12])

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)
    f = open("my_net.pickle", "wb")
    # fr = open('my_net.pickle', 'rb')
    dill.dump(network, f)
    # network = pickle.load(fr)
    # fr.close()
    f.close()
    # guess_list = network.run_unseen(testing_set)
    return network.run_unseen(testing_set)
Author: totalgood | Project: capstone | Lines: 54 | Source: net_launch.py
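
The two separate resample calls above stay aligned only because they share random_state=3, and with the default replace=True they are bootstrap samples (containing duplicates) rather than pure shuffles. A sketch of a more robust equivalent, passing both arrays in a single call:

from sklearn import datasets, utils

temp_digits = datasets.load_digits()
# A single call keeps rows and labels aligned without relying on a shared seed;
# pass replace=False (or use sklearn.utils.shuffle) for a duplicate-free shuffle
digits, answers = utils.resample(temp_digits.data, temp_digits.target, random_state=3)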

Example 4: resample_training_dataset

    def resample_training_dataset(self, labels, feature_array, sizes=(5000, 500)):
        """
        Inputs:
            - labels
            - features
            - sizes: tuple; for each class (0, 1, etc.), the number of training chunks you want,
              e.g. for 5000 baseline and 500 seizure chunks, sizes = (5000, 500), as 0 is baseline and 1 is seizure.
        Takes labels and features and returns resampled versions of both.

        WARNING: Up-sampling target class prevents random forest oob from being accurate.
        """
        if len(labels.shape) == 1:
            labels = labels[:, None]

        resampled_labels = []
        resampled_features = []
        for i, label in enumerate(np.unique(labels.astype('int'))):
            class_inds = np.where(labels == label)[0]

            class_labels = labels[class_inds]
            class_features = feature_array[class_inds,:]

            if class_features.shape[0] < sizes[i]:  # need to oversample
                # Stack whole copies of the class, then top up with a no-replacement resample
                n_copies = int(sizes[i] / class_features.shape[0])
                class_features_duplicated = np.vstack([class_features for _ in range(n_copies)])
                class_labels_duplicated = np.vstack([class_labels for _ in range(n_copies)])
                n_extra_needed = sizes[i] - class_labels_duplicated.shape[0]
                extra_features = resample(class_features, n_samples=n_extra_needed, random_state=7, replace=False)
                extra_labels = resample(class_labels, n_samples=n_extra_needed, random_state=7, replace=False)

                boot_array = np.vstack([class_features_duplicated, extra_features])
                boot_labels = np.vstack([class_labels_duplicated, extra_labels])

            elif class_features.shape[0] > sizes[i]:  # need to undersample
                boot_array = resample(class_features, n_samples=sizes[i], random_state=7, replace=False)
                boot_labels = resample(class_labels, n_samples=sizes[i], random_state=7, replace=False)

            elif class_features.shape[0] == sizes[i]:
                logging.debug('label ' + str(label) + ' already has exactly n samples, doing nothing!')
                boot_array = class_features
                boot_labels = class_labels
            else:
                print(class_features.shape[0], sizes[i])
                print('unexpected class size, this branch should be unreachable')
            resampled_features.append(boot_array)
            resampled_labels.append(boot_labels)
        # stack both up...
        resampled_labels = np.vstack(resampled_labels)
        resampled_features = np.vstack(resampled_features)

        logging.debug('Original label counts: '+str(pd.Series(labels[:,0]).value_counts()))
        logging.debug('Resampled label counts: '+str(pd.Series(resampled_labels[:,0]).value_counts()))

        return resampled_labels, resampled_features
Author: jcornford | Project: pyecog | Lines: 53 | Source: classifier.py

Example 5: run_mnist

def run_mnist(epochs, layers, neuron_count):
    """ Run Mnist dataset and output a guess list on the Kaggle test_set

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop over the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    Attributes
    ----------
    target_values : list
        The possible values for each training vector

    """

    with open('train.csv', 'r') as f:
        reader = csv.reader(f)
        t = list(reader)
        train = [[int(x) for x in y] for y in t[1:]]

    with open('test.csv', 'r') as f:
        reader = csv.reader(f)
        raw_nums = list(reader)
        test_set = [[int(x) for x in y] for y in raw_nums[1:]]

    ans_train = [x[0] for x in train]
    train_set = [x[1:] for x in train]
    ans_train.pop(0)
    train_set.pop(0)

    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    # For validation purposes
    # guess_list = network.run_unseen(train_set[4000:4500])
    # network.report_results(guess_list, ans_train[4000:4500])
    # guess_list = network.run_unseen(train_set[4500:5000])
    # network.report_results(guess_list, ans_train[4500:5000])

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as d:
        for elem in guess_list:
            d.write(str(elem)+'\n')
Author: uglyboxer | Project: finnegan | Lines: 52 | Source: net_launch.py

Example 6: test_resample_stratified

def test_resample_stratified():
    # Make sure resample can stratify
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    _, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
                                   stratify=None)
    assert np.all(y_not_stratified == 1)

    _, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s, one 0
Author: daniel-perry | Project: scikit-learn | Lines: 15 | Source: test_utils.py

Example 7: eval_prox_random

    def eval_prox_random(self, n_sample_node=5, sample_nodes=[]):
        cs = self.cs
        measurements = {}
        nodes = cs.nodes()

        test_nodes = []
        if len(sample_nodes):
            if type(sample_nodes[0]) is str:
                test_nodes = sample_nodes
            elif type(sample_nodes[0]) is int:
                test_nodes = [nodes[i] for i in sample_nodes]
        else:
            test_nodes = resample(nodes, n_samples=n_sample_node)

        # nae of coordinate-based proximity vs ground-proximity
        coor_test = self.coor_all[test_nodes]

        ground_prox = (
            cs.proximity_to(sources=test_nodes, dests=cs.nodes()).as_matrix().transpose()
        )  # shape: test_nodes x all_nodes
        coor_prox = np.dot(coor_test.as_matrix().transpose(), self.coor_all.as_matrix())

        nae = pd.Series.combine(
            pd.Series(coor_prox.flatten()), pd.Series(ground_prox.flatten()), lambda c, g: abs(c - g) / g
        )
        nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=nae.order())
        measurements["nae"] = nae
        measurements["nae_plot"] = nae_plot

        return measurements
Author: blublud | Project: coordinate_learning | Lines: 30 | Source: path_accum_coorsys.py

Example 8: bootstrap_auc

def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col when predicting pred_col.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    list : AUCs for each sampling
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    df.dropna(subset=[col], inplace=True)
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows" % (col, old_len, new_len))
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(df[col], preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
Author: hammerlab | Project: cohorts | Lines: 31 | Source: model.py
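
A 95% confidence interval can then be read off the returned scores with percentiles. A minimal sketch, where df and the column names are hypothetical:

import numpy as np

scores = bootstrap_auc(df, col="biomarker_count", pred_col="is_responder")
lower, upper = np.percentile(scores, [2.5, 97.5])
print("AUC 95%% CI: [%.3f, %.3f]" % (lower, upper))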

Example 9: fit

    def fit(self, dataSet):
        # Train each tree on a bootstrap resample of the full dataset (bagging)
        for clt in self.forest:
            randSet = resample(dataSet)
            # print "randSet size = %d" % len(randSet)
            target = [x[0] for x in randSet]  # first column is the class label
            train = [x[1:] for x in randSet]  # remaining columns are features
            clt.fit(train, target)
Author: agag4510118 | Project: CS412-Introduction-to-Data-Mining | Lines: 7 | Source: RandomForest.py

Example 10: boot_estimates

def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples
    '''
    def fit_params(iX, iy):
        # Fit on one bootstrap sample and collect intercept plus coefficients
        fitted = model.fit(iX, iy)
        return np.hstack([fitted.intercept_, fitted.coef_.ravel()])

    coefs = [fit_params(iX, iy) for iX, iy in (resample(X, y) for _ in range(nboot))]
    return np.vstack(coefs)
Author: thomasbrawner | Project: python_tools | Lines: 7 | Source: marginal_effects_example.py
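
A usage sketch with scikit-learn's LinearRegression on synthetic data (the data-generating process below is made up for illustration):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
y = X.dot(np.array([1.5, -2.0])) + rng.normal(size=100)

coefs = boot_estimates(LinearRegression(), X, y, nboot=200)
print(coefs.mean(axis=0))  # bootstrap means of intercept and slopes
print(coefs.std(axis=0))   # bootstrap standard errors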

Example 11: downsample

def downsample(y, sizes=[30000, 3000]):
    # classes = Counter(y)
    res = []
    for class_i, sz in enumerate(sizes):
        # Row indices belonging to class class_i
        indices = np.where(y == class_i)[0]
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
Author: vadimnazarov | Project: llama | Lines: 7 | Source: llama.py
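
A usage sketch (the label array is made up): the function returns, for each class, a resampled list of row indices that can then be used to slice a feature matrix:

import numpy as np

y = np.array([0] * 100 + [1] * 10)
idx_majority, idx_minority = downsample(y, sizes=[30, 5])
# idx_majority holds 30 row indices of class 0, idx_minority 5 of class 1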

Example 12: run_method_usage

def run_method_usage(methods,cases):
    methods = [m[0] for m in methods]
    # Bootstrap the percentage error bars:
    percents = []
    for i in range(10000):
        nc = resample(cases)
        percents.append(100 * np.sum(nc, axis=0) / len(nc))
    percents = np.array(percents)
    mean_percents = np.mean(percents, axis=0)
    std_percents = np.std(percents, axis=0) * 1.96  # 95% normal-approximation half-width
    inds = np.argsort(mean_percents).tolist()
    inds.reverse()
    avg_usage = np.mean(mean_percents)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.arange(len(methods))
    ax.plot(x, [avg_usage] * len(methods), '-', color='0.25', lw=1, alpha=0.2)
    ax.bar(x, mean_percents[inds], 0.6, color=paired[0], linewidth=0,
           yerr=std_percents[inds], ecolor=paired[1])
    #ax.set_title('Method Occurrence')
    ax.set_ylabel('Occurrence %',fontsize=30)
    ax.set_xlabel('Method',fontsize=30)
    ax.set_xticks(np.arange(len(methods)))
    ax.set_xticklabels(np.array(methods)[inds],fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds,mean_percents[inds]
Author: IDEALLab | Project: design_method_recommendation_JMD_2014 | Lines: 30 | Source: paper_experiments.py

Example 13: balanced_resample

def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them
    See the test routine at the bottom for an example of behavior
    """
    most_common, num_required = mstats.mode(labels)
    possible_labels = np.unique(labels)

    data_resampled = []
    labels_resampled = []

    for possible_label in possible_labels:
        in_this_label = labels == possible_label

        data_buffered = np.array([])
        data_buffered = np.reshape(data_buffered, (0, data.shape[1]))
        labels_buffered = np.array([])

        while len(data_buffered) < num_required:
            data_buffered = np.vstack([data_buffered, data[in_this_label]])
            labels_buffered = np.hstack([labels_buffered, labels[in_this_label]])

        single_data_resampled, single_labels_resampled = utils.resample(
            data_buffered,
            labels_buffered,
            n_samples=int(num_required),
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return np.vstack(data_resampled).astype(data.dtype), np.hstack(labels_resampled).astype(labels.dtype)
Author: DSsoto | Project: Sub8 | Lines: 31 | Source: utils.py
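
The test routine mentioned in the docstring is not part of this excerpt; a minimal stand-in on an imbalanced toy set (assuming the snippet's own imports, i.e. numpy as np, scipy.stats.mstats, and sklearn's utils):

import numpy as np

data = np.vstack([np.zeros((10, 3)), np.ones((3, 3))])
labels = np.array([0] * 10 + [1] * 3)

data_rs, labels_rs = balanced_resample(data, labels)
print(np.bincount(labels_rs.astype(int)))  # both classes now at the majority count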

Example 14: fit

    def fit(self, X, Y):
        num_examples = len(X)
        data_indices = np.arange(num_examples)
        self.data = X
        Y = np.array(Y, dtype=float)

        sample = resample(data_indices, replace=False, n_samples=min(20, num_examples), random_state=0)
        for i in sample:
            y = Y[i]
            self.S.add(i)
            self.y[i] = y
            self.alpha[i] = 0.0
            self.g[i] = y

        for epoch in xrange(5):
            min_delta = 999999999
            for i in data_indices:
                self.process(i, Y[i])
                delta = self.reprocess()
                min_delta = min(min_delta, delta)
            if min_delta < self.tau: break

            data_indices = shuffle(data_indices)

        while True:
            delta = self.reprocess()
            if delta < self.tau: break
Author: woohp | Project: ai_tidbits | Lines: 28 | Source: lasvm.py

Example 15: test_mnist

    def test_mnist(self):
        mnist = fetch_mldata('MNIST original')
        X, Y = resample(mnist.data, mnist.target, replace=False, n_samples=1000, random_state=0)
        X = X.astype(float)
        Y = [1 if y == 0 else -1 for y in Y]

        svm = LASVM(C=10, tau=0.001)
        svm.fit(X, Y)

        X_test, Y_test = resample(mnist.data, mnist.target, replace=False, n_samples=300, random_state=2)
        X_test = X_test.astype(float)
        Y_test = [1 if y == 0 else -1 for y in Y_test]
        Y_predict = svm.predict(X_test)
        percent_correct = np.sum(Y_predict == Y_test) / 300.0

        self.assertGreater(percent_correct, 0.95)
Author: woohp | Project: ai_tidbits | Lines: 16 | Source: lasvm.py


Note: The sklearn.utils.resample examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors, and distribution and use are subject to each project's License. Do not repost without permission.