当前位置: 首页>>代码示例>>Python>>正文


Python RandomOverSampler.fit_sample方法代码示例

本文整理汇总了Python中imblearn.over_sampling.RandomOverSampler.fit_sample方法的典型用法代码示例。如果您正苦于以下问题:Python RandomOverSampler.fit_sample方法的具体用法?Python RandomOverSampler.fit_sample怎么用?Python RandomOverSampler.fit_sample使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在imblearn.over_sampling.RandomOverSampler的用法示例。


在下文中一共展示了RandomOverSampler.fit_sample方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: resample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def resample(X, y, sample_fraction=0.1, test_size=0.3):
    """Split into train/test sets, randomly oversample both halves, and
    optionally subsample each half down to ``sample_fraction``.

    X, y: pandas DataFrames (y must have a ``converted`` column).
    Returns ((X_train, y_train), (X_test, y_test)) as DataFrames with the
    original column names restored.
    """
    feature_cols = X.columns
    target_cols = y.columns
    n_features = len(feature_cols)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(ytr)
    print('@@2 -  y_test')
    show_balance(yte)
    assert Xtr.shape[1] == n_features and Xte.shape[1] == n_features

    # NOTE(review): the test half is oversampled too, mirroring the original
    # behaviour of this function.
    sampler = RandomOverSampler(random_state=42)
    Xtr, ytr = sampler.fit_sample(Xtr, ytr)
    Xte, yte = sampler.fit_sample(Xte, yte)
    print('@@3 - Oversampled y_train')
    show_balance(ytr)
    print('@@3 - Oversampled y_test')
    show_balance(yte)
    assert Xtr.shape[1] == n_features and Xte.shape[1] == n_features

    if sample_fraction < 1.0:
        # Keep only a `sample_fraction`-sized slice of each half.
        _, Xtr, _, ytr = train_test_split(Xtr, ytr, test_size=sample_fraction, random_state=43)
        _, Xte, _, yte = train_test_split(Xte, yte, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(ytr)
        print('@@2 - Downsampled y_test')
        show_balance(yte)
        assert len(Xtr.shape) == 2 and len(Xte.shape) == 2, (Xtr.shape, Xte.shape)
        assert Xtr.shape[1] == n_features and Xte.shape[1] == n_features, (Xtr.shape, Xte.shape)

    print('X_columns=%d %s' % (len(feature_cols), feature_cols))
    print('y_columns=%d %s' % (len(target_cols), target_cols))
    print('X_train=%-10s y_train=%s' % (list(Xtr.shape), list(ytr.shape)))
    print('X_test =%-10s y_test =%s' % (list(Xte.shape), list(yte.shape)))
    assert Xtr.shape[1] == n_features and Xte.shape[1] == n_features

    # fit_sample returned plain arrays; rebuild DataFrames with the original columns.
    Xtr = pd.DataFrame(Xtr, columns=feature_cols)
    ytr = pd.DataFrame(ytr, columns=target_cols, index=Xtr.index)
    Xte = pd.DataFrame(Xte, columns=feature_cols)
    yte = pd.DataFrame(yte, columns=target_cols, index=Xte.index)
    print('@@+ y_train\n', ytr.converted.value_counts(), flush=True)
    print('@@+ y_test\n', yte.converted.value_counts(), flush=True)

    return (Xtr, ytr), (Xte, yte)
开发者ID:peterwilliams97,项目名称:Butt-Head-Astronomer,代码行数:52,代码来源:feature_select.py

示例2: transform

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
    def transform(self, X, y=None):
        """Balance the classes of ``self.predicted_column`` by random
        oversampling and return a rebuilt dataframe with the same columns.
        """
        # TODO: validate that this runs before the train/test split — or decide
        # TODO  whether that ordering should be left to advanced users.

        # The target values live inside X itself, in the predicted column.
        target = np.squeeze(X[[self.predicted_column]])

        # Everything except the predicted column becomes the feature matrix.
        features = X.drop([self.predicted_column], axis=1)

        # Oversample with the transformer's configured seed.
        sampler = RandomOverSampler(random_state=self.random_seed)
        sampled_features, sampled_target = sampler.fit_sample(features, target)

        # Rebuild the dataframe: restore column names, then re-attach the target.
        balanced = pd.DataFrame(sampled_features)
        balanced.columns = features.columns
        balanced[self.predicted_column] = pd.Series(sampled_target)

        return balanced
开发者ID:xtaraim,项目名称:healthcareai-py,代码行数:27,代码来源:transformers.py

示例3: oversample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
 def oversample(self):
     """Snapshot the current X/y, then replace them with a randomly
     oversampled (class-balanced) version; seeded for reproducibility."""
     self._X_original = self._X
     self._y_original = self._y
     sampler = RandomOverSampler(random_state=0)
     self._X, self._y = sampler.fit_sample(self._X, self._y)
开发者ID:dermatologist,项目名称:nlp-qrmine,代码行数:9,代码来源:mlqrmine.py

示例4: test_ros_fit_sample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def test_ros_fit_sample():
    """fit_sample must reproduce the ground-truth arrays stored on disk."""
    sampler = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    # Ground truth lives next to this test file, under data/.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    assert_array_equal(X_res, np.load(os.path.join(data_dir, 'ros_x.npy')))
    assert_array_equal(y_res, np.load(os.path.join(data_dir, 'ros_y.npy')))
开发者ID:vivounicorn,项目名称:imbalanced-learn,代码行数:14,代码来源:test_random_over_sampler.py

示例5: oversample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
    def oversample(self):
        """Randomly oversample (self.X, self.y) so the outcome classes balance."""
        print('Current outcome sampling {}'.format(Counter(self.y)))

        # Unseeded sampler: a different random draw on every call.
        # Alternatives tried here previously: SMOTE(), ADASYN().
        sampler = RandomOverSampler()

        self.X, self.y = sampler.fit_sample(self.X, self.y)

        # Keep a view limited to the first n_features columns.
        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
开发者ID:kellyhennigan,项目名称:cueexp_scripts,代码行数:15,代码来源:sgdrfe_GMmask.py

示例6: oversample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
 def oversample(self):
     """Balance the outcome classes of (self.X, self.y) by random oversampling."""
     print('Current outcome sampling {}'.format(Counter(self.y)))

     # Deliberately unseeded, so each call draws a fresh random sample.
     # (Pass random_state=2 to RandomOverSampler to pin the seed instead.)
     ros = RandomOverSampler()

     self.X, self.y = ros.fit_sample(self.X, self.y)

     # Expose only the first n_features columns via a view.
     self.Xview = self.X.view()[:, :self.n_features]
     print('Resampled dataset shape {}'.format(Counter(self.y)))
开发者ID:kellyhennigan,项目名称:cueexp_scripts,代码行数:16,代码来源:sgdrfe_oversample.py

示例7: test_ros_fit_sample_half

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def test_ros_fit_sample_half():
    """With ratio=0.5 the resampled arrays must match the known values."""
    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.04352327, -0.20515826], [0.20792588, 1.49407907],
                           [0.22950086, 0.33367433], [0.15490546, 0.3130677],
                           [0.09125309, -0.85409574], [0.12372842, 0.6536186],
                           [0.094035, -2.55298982], [0.92923648, 0.76103773],
                           [0.47104475, 0.44386323], [0.13347175, 0.12167502]])
    expected_y = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
开发者ID:kellyhennigan,项目名称:cueexp_scripts,代码行数:18,代码来源:test_random_over_sampler.py

示例8: test_multiclass_fit_sample

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def test_multiclass_fit_sample():
    """Every class of a 3-class target must be oversampled to 3600 samples."""
    # Relabel the first 1000 samples to create a third class.
    y_multi = Y.copy()
    y_multi[0:1000] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, y_multi)

    # All three classes should now be equally represented.
    counts = Counter(y_res)
    for label in (0, 1, 2):
        assert_equal(counts[label], 3600)
开发者ID:integrallyclosed,项目名称:imbalanced-learn,代码行数:18,代码来源:test_random_over_sampler.py

示例9: runns

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
	"""Train and evaluate an RBF-kernel SVM classifier on a CSV dataset.

	Pipeline: drop rows with a null response -> handle remaining nulls ->
	one-hot encode predictors -> random oversampling to balance the response
	classes -> random-forest feature ranking -> train/test split -> SVC fit ->
	return a JSON payload of classification metrics via jsonify.

	NOTE(review): the `dataset` argument is immediately overwritten by
	pd.read_csv('raw_data.csv') below (marked "for testing purposes"), so the
	parameter is currently ignored — confirm before production use.
	"""
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls (strategy chosen by `dealing_with_nulls`)
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var (seeded for reproducibility)
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_sample(predictors, resp)
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	# list(predictors.iloc[:,0:]) yields the predictor column names, so the
	# Series maps each feature name to its importance, sorted descending.
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	# NOTE(review): `important_predictor_names` is computed but never used below.
	important_predictor_names = feature_imp.index[0:important_features]
	#-------subset the data to get only the important predictors and the response
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)
    
	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets (fixed seed 402)
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate the model: an RBF-kernel SVM with probability estimates
	# (NOTE(review): despite the parameter name, n_estimators is not used here).
	clf = SVC(kernel='rbf',probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
	# evaluation on the held-out test split
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)
    
	accuracy = accuracy_score(test_labels, predicted)
	# confusion matrix (NOTE(review): computed but not included in the output dict)
	cnf = (confusion_matrix(test_labels,predicted))
	# precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	# average precision (uses the probability of the second class column)
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	# recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	# f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	# fbeta score (beta=0.5 weights precision higher than recall)
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	# hamming loss
	hamming = hamming_loss(test_labels,predicted)
	# jaccard similarity score
	jaccard = jaccard_similarity_score(test_labels,predicted)
	# log loss (NOTE(review): computed from hard predictions, not probabilities)
	logloss = log_loss(test_labels,predicted)
	# zero-one loss
	zero_one = zero_one_loss(test_labels,predicted)
	# area under the ROC curve
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	# Cohen's kappa
	cohen = cohen_kappa_score(test_labels,predicted)
	# Matthews correlation coefficient
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	# NOTE(review): zips the resampled data `prds` with `feature_imp` and is
	# never used afterwards — presumably feature names were intended; verify.
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	output=json.dumps(output)
	return jsonify({"Predictions": output})
开发者ID:ghollah,项目名称:ServingMLAPIs,代码行数:89,代码来源:svm.py

示例10: train_test_split

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
# Split train_val data into training set and validation set
# (X_train_val / y_train_val are assumed to be defined earlier in this file.)
X_train, X_val, y_train, y_val \
    = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# ==========================================================================================

# Over-sampled data

# Generate new training datasets using several over-sampling methods
# (old imblearn API: `ratio` and `verbose` kwargs; newer versions use
# `sampling_strategy` and drop `verbose`).
verbose = False
ratio = 'auto'

# 'Random over-sampling'
OS = RandomOverSampler(ratio=ratio, verbose=verbose)
X_train_os, y_train_os = OS.fit_sample(X_train, y_train)

# 'SMOTE' (regular variant: synthetic minority samples)
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
X_train_smo, y_train_smo = smote.fit_sample(X_train, y_train)

# 'SMOTE borderline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
X_train_bs1, y_train_bs1 = bsmote1.fit_sample(X_train, y_train)

# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
X_train_bs2, y_train_bs2 = bsmote2.fit_sample(X_train, y_train)

# 'SMOTE SVM'
# NOTE(review): presumably consumed by an SVM-variant SMOTE call further down
# in the original file — this chunk ends here, so that usage is not visible.
svm_args={'class_weight': 'auto'}
开发者ID:apyeh,项目名称:BaseballHOF,代码行数:32,代码来源:Model_Building.py

示例11: print

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
# summarize the number of rows and columns in the dataset after listwise drop
# (`dataset` and the keras names Sequential/Dense are assumed to be defined/
# imported earlier in this file.)
(sample, vnum) = dataset.shape
print(sample, vnum)

# Get the number of variables (last column is the dependent variable)
vnum = vnum - 1

# splice into IVs and DV: first vnum columns are features, final column is the target
values = dataset.values
X = values[:, 0:vnum]
y = values[:, vnum]

# Oversampling: balance the classes before training (seeded for reproducibility)
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_sample(X, y)

# create model: a small fully-connected binary classifier
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model on the oversampled data
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)

# calculate predictions
# NOTE(review): predictions are made on the ORIGINAL (un-resampled) X,
# which includes the training samples — not a held-out evaluation.
predictions = model.predict(X)
# round sigmoid outputs to hard 0/1 labels
rounded = [round(x[0]) for x in predictions]
开发者ID:dermatologist,项目名称:nlp-qrmine,代码行数:32,代码来源:nnet.py

示例12: balance_data

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
def balance_data(X, y):
    """Return (X_resampled, y_resampled) with minority classes randomly
    duplicated until all classes are equally represented (fixed seed 0)."""
    return RandomOverSampler(random_state=0).fit_sample(X, y)
开发者ID:JDwangmo,项目名称:nlp_util,代码行数:7,代码来源:data_util.py

示例13: open

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
import sys, os, csv
from imblearn.over_sampling import RandomOverSampler

# Read <name>.csv (given as argv[1]), randomly oversample so every class is
# equally represented, and write the balanced rows to <name>-ro-.csv.
# The input's header row is dropped, and the last column is the class label;
# all other columns must be integers.
input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]
with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-ro-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        X = []
        y = []
        ros = RandomOverSampler()
        next(reader, None)  # skip the header row
        for row in reader:
            y.append(row[-1])
            X.append([int(v) for v in row[:-1]])
        X_res, y_res = ros.fit_sample(X, y)
        print (len(X_res))
        print (len(y_res))
        for features, label in zip(X_res, y_res):
            # BUG FIX: append the label as a single field. The original used
            # list(label), which split a multi-character class label into one
            # column per character (only accidentally correct for 1-char labels).
            writer.writerow(list(features) + [label])
开发者ID:punkie,项目名称:master-thesis,代码行数:28,代码来源:random_oversampling.py

示例14: list

# 需要导入模块: from imblearn.over_sampling import RandomOverSampler [as 别名]
# 或者: from imblearn.over_sampling.RandomOverSampler import fit_sample [as 别名]
# Python 2 script: per-feature loop that one-hot encodes a single column,
# oversamples the minority class, and builds a train/test split for it.
# (`data` and `y` are assumed to be defined earlier in this file.)
for feature in list(data.columns):

	# onehot encode the feature (a one-column dataframe slice)
	feature_data = data[[feature]]
	encoded_feature_data = pd.get_dummies(feature_data)

	print '\n'
	print feature
	print feature_data.shape
	print encoded_feature_data.shape
	print y.shape

	# upsample minority class
	# NOTE(review): importing inside the loop re-runs every iteration; harmless
	# (module caching) but conventionally belongs at the top of the file.
	from imblearn.over_sampling import RandomOverSampler
	# ratio=0.5: oversample until the minority class is half the majority size
	# (old imblearn API; newer versions use sampling_strategy).
	ros = RandomOverSampler(ratio=0.5)
	X_resampled, y_resampled = ros.fit_sample(encoded_feature_data, y)

	print '\n'
	print X_resampled.shape
	print y_resampled.shape

	# create train and test split (80/20, fixed seed) on the resampled data
	X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0, test_size=0.2)

	print '\n'
	print 'Training data'
	print X_train.shape
	print y_train.shape

	print 'Testing data'
	print X_test.shape


注:本文中的imblearn.over_sampling.RandomOverSampler.fit_sample方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。