本文整理汇总了Python中pandas.qcut方法的典型用法代码示例。如果您正苦于以下问题:Python pandas.qcut方法的具体用法?Python pandas.qcut怎么用?Python pandas.qcut使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pandas的用法示例。
在下文中一共展示了pandas.qcut方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _compute_stats
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def _compute_stats(self, pred, expo, loss, prem):
    """Rank predictions into quantile groups and compute a relative loss ratio per group.

    Parameters
    ----------
    pred : array-like
        Predictions; quantile-binned into ``self.n_groups`` groups with ``pd.qcut``.
    expo, loss, prem : array-like
        Per-sample values stored next to ``pred`` in the returned table.

    Returns
    -------
    tab : pandas.DataFrame
        One row per sample with columns ``rank``, ``pred``, ``prem``, ``loss``, ``expo``.
    agg_rlr : pandas.Series
        For every rank, ``sum(loss) / sum(prem)`` divided by the overall
        ``sum(loss) / sum(prem)``.
    n_groups : integer
        Highest rank label produced by the binning, plus one.
    """
    # Overall ratio used to normalise the per-group ratios.
    loss_to_returns = np.sum(loss) / np.sum(prem)

    rank = pd.qcut(pd.Series(pred), self.n_groups, labels=False)
    # Report the number of groups derived from the labels actually assigned.
    n_groups = np.amax(rank) + 1

    tab = pd.DataFrame({
        'rank': rank,
        'pred': pred,
        'prem': prem,
        'loss': loss,
        'expo': expo,
    })

    grouped = tab.groupby('rank')
    # .sum() is what agg(np.sum) resolved to; calling it directly avoids the
    # FutureWarning newer pandas emits for NumPy callables passed to agg.
    agg_rlr = (grouped['loss'].sum() / grouped['prem'].sum()) / loss_to_returns
    return tab, agg_rlr, n_groups
示例2: create_features
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def create_features(self):
    """Replace ``Age`` on ``self.train`` and ``self.test`` with quintile codes.

    Missing ages are filled with one random integer drawn from
    ``[mean - std, mean + std)``, where mean and std are computed over the
    combined train and test ``Age`` values. A separate draw is made for the
    train and the test frame, and each frame is binned on its own.

    NOTE(review): the inputs are read from the module-level ``train`` / ``test``
    while the results are written to ``self.train`` / ``self.test`` --
    presumably the same frames; confirm against the enclosing class.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    data = pd.concat([train, test])
    age_mean = data['Age'].mean()
    age_std = data['Age'].std()
    low, high = age_mean - age_std, age_mean + age_std

    self.train['Age'] = pd.qcut(
        train['Age'].fillna(np.random.randint(low, high)),
        5,
        labels=False,
    )
    self.test['Age'] = pd.qcut(
        test['Age'].fillna(np.random.randint(low, high)),
        5,
        labels=False,
    )
示例3: upload
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def upload():
    """Load candles from CSV, add rate-of-change quartiles and hand the JSON to ``put_s3_public``.

    Reads ``bitmex_candles1.csv``, adds the columns ``change`` (difference of
    consecutive ``close`` values), ``roc`` (``change / close``) and
    ``Quantile_rank`` (quartile code of ``roc``), prints the frame, plots
    ``roc`` and serialises the rows as JSON lines.
    """
    candles = pd.read_csv("bitmex_candles1.csv")
    candles['change'] = candles['close'].diff()
    candles['roc'] = candles['change'] / candles['close']
    candles['Quantile_rank'] = pd.qcut(candles['roc'], 4, labels=False)
    print(candles)
    candles['roc'].plot()

    payload = candles.to_json(orient='records', lines=True)
    # put_s3_public and bucket_name are defined elsewhere in the module.
    put_s3_public(bucket_name, "bitmex_history_0404", payload)
示例4: var_bins
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def var_bins(quality):
    """Split the index labels of ``quality`` into groups ordered by descending ``iv``.

    ``quality`` is sorted in place by its ``iv`` column (largest first). With
    fewer than ten rows every label gets its own group; otherwise the row
    positions are cut into ten quantile bins and each bin becomes one group.

    Returns a list of lists of index labels.
    """
    quality.sort_values(by='iv', ascending=False, inplace=True)

    if len(quality) < 10:
        return [[label] for label in quality.index.tolist()]

    # Bin the positions 0..n-1 so the ten groups are as evenly sized as possible.
    position_bins = pd.qcut(range(len(quality)), 10, labels=False)
    layout = pd.DataFrame({'num': position_bins, 'var': quality.index})
    return [members['var'].tolist() for _, members in layout.groupby(by='num')]
# 用woe替换离散变量
示例5: cohort_plot
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def cohort_plot(data_set_path, metric_to_plot='', ncohort=10):
    """Plot the mean churn rate against the mean metric value per quantile cohort.

    The CSV at ``data_set_path`` is read with its first two columns as index,
    rows are grouped into up to ``ncohort`` quantile cohorts of
    ``metric_to_plot`` (duplicate bin edges are dropped), and the per-cohort
    mean of the metric is plotted against the per-cohort mean of ``is_churn``.
    The figure is saved as an SVG next to the dataset.
    """
    assert os.path.isfile(data_set_path), '"{}" is not a valid dataset path'.format(data_set_path)

    churn_data = pd.read_csv(data_set_path, index_col=[0, 1])
    cohorts = pd.qcut(churn_data[metric_to_plot], ncohort, duplicates='drop')
    by_cohort = churn_data.groupby(cohorts)
    metric_avg = by_cohort[metric_to_plot].mean()
    churn_avg = by_cohort['is_churn'].mean()
    plot_frame = pd.DataFrame({
        metric_to_plot: metric_avg.values,
        'churn_rate': churn_avg,
    })

    plt.figure(figsize=(6, 4))
    plt.plot(
        metric_to_plot,
        'churn_rate',
        data=plot_frame,
        marker='o',
        color='black',
        linewidth=2,
        label=metric_to_plot,
    )
    plt.xlabel('Cohort Average of "%s"' % metric_to_plot)
    plt.ylabel('Cohort Churn Rate')
    plt.grid()
    plt.gca().set_ylim(bottom=0)

    suffix = '_' + metric_to_plot + '_churn_corhort.svg'
    save_path = data_set_path.replace('.csv', suffix)
    plt.savefig(save_path)
    print('Saving plot to %s' % save_path)
示例6: sample_431
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def sample_431():
    """
    4.3.1 Discretising data
    :return:
    """
    daily_change = tsla_df.p_change
    daily_change.hist(bins=80)
    plt.show()

    # Ten equal-frequency bins over the absolute change.
    decile_cats = pd.qcut(np.abs(daily_change), 10)
    print('cats.value_counts():\n', decile_cats.value_counts())

    # Hand-picked bin edges: -inf, -7, -5, -3, 0, 3, 5, 7, +inf
    edges = [-np.inf, -7.0, -5, -3, 0, 3, 5, 7, np.inf]
    manual_cats = pd.cut(daily_change, edges)
    print('bins cats.value_counts():\n', manual_cats.value_counts())

    # 'cr_dummies' is the prefix given to the generated column names.
    dummy_frame = pd.get_dummies(manual_cats, prefix='cr_dummies')
    print('change_ration_dummies.head():\n', dummy_frame.head())
示例7: IV_calc
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def IV_calc(data, var):
    """Build the per-group table behind the information value of ``var``.

    Object/string columns are grouped by their own values. Any other column is
    ranked (ties broken by order of appearance), cut into ten quantile bins and
    grouped by those bins; the bins are stored on ``data`` as ``bin_var``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``var`` and a ``class`` column; the sum of ``class`` per
        group is reported as ``bad``.
    var : str
        Name of the column to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``Total``, ``bad``, ``good``,
        ``bad_per``, ``good_per`` and ``I_V``.
    """
    column = data[var]
    # Also recognise pandas' dedicated string dtypes, not only ``object``,
    # so text columns are never sent through rank/qcut.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)

    if is_text:
        group_key = var
    else:
        # rank(method='first') makes every value distinct, so the ten
        # quantile edges are always unique.
        data['bin_var'] = pd.qcut(column.rank(method='first'), 10)
        group_key = 'bin_var'

    # observed=False keeps empty quantile bins in the table on every pandas version.
    dataf = data.groupby(group_key, observed=False)['class'].agg(['count', 'sum'])
    dataf.columns = ["Total", "bad"]
    dataf["good"] = dataf["Total"] - dataf["bad"]
    dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
    dataf["good_per"] = dataf["good"] / dataf["good"].sum()
    dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
    return dataf
开发者ID:PacktPublishing,项目名称:Statistics-for-Machine-Learning,代码行数:20,代码来源:Chapter 03_Logistic Regression vs Random Forest.py
示例8: generateInputs
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def generateInputs(RunnerObj):
    '''
    Function to generate desired inputs for SINCERITIES.

    Creates the ``SINCERITIES`` folder under ``RunnerObj.inputDir`` when it is
    missing, then writes one ``SINCERITIES/ExpressionData<idx>.csv`` per column
    of the cell-data file. Each output holds the transposed expression values
    of the cells with a non-null entry in that column, plus a ``Time`` column
    set to the midpoint of the cell's quantile bin (``nBins`` bins, duplicate
    edges dropped).

    NOTE(review): the source listing lost its indentation; this assumes only
    the print/mkdir pair is conditional and the files are always rewritten --
    confirm against the upstream project.

    :param RunnerObj: An instance of the :class:`BLRun`
    '''
    target_dir = RunnerObj.inputDir.joinpath("SINCERITIES")
    if not target_dir.exists():
        print("Input folder for SINCERITIES does not exist, creating input folder...")
        target_dir.mkdir(exist_ok = False)

    ExpressionData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.exprData),
                                 header = 0, index_col = 0)
    PTData = pd.read_csv(RunnerObj.inputDir.joinpath(RunnerObj.cellData),
                         header = 0, index_col = 0)

    for idx, colName in enumerate(PTData.columns):
        # Keep only the cells that have a value in this column.
        pseudotime = PTData[colName]
        cells = pseudotime.index[pseudotime.notnull()]
        exprName = "SINCERITIES/ExpressionData" + str(idx) + ".csv"
        newExpressionData = ExpressionData.loc[:, cells].T

        # Quantile binning; each cell's time becomes the midpoint of its bin.
        # http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html#pandas.qcut
        nBins = int(RunnerObj.params['nBins'])
        tQuantiles = pd.qcut(PTData.loc[cells, colName], q = nBins, duplicates ='drop')
        newExpressionData['Time'] = [(iv.left + iv.right)/2 for iv in tQuantiles]

        newExpressionData.to_csv(RunnerObj.inputDir.joinpath(exprName),
                                 sep = ',', header = True, index = False)
示例9: qcut_safe
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def qcut_safe(prices, q):
    """Quantile-bin ``prices`` into at most ``q`` bins labelled ``0 .. nbins-1``.

    The bin count is capped at ``len(prices)`` so that no more bins than
    data points are requested.
    """
    bin_count = min(q, len(prices))
    bin_labels = np.arange(bin_count)
    return pd.qcut(prices, bin_count, labels=bin_labels)
示例10: _add_bins
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def _add_bins(df, feats, n_bins=10):
    """Add a quantile-bin column for every requested feature.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe with features
    feats : list
        features to split into bins
    n_bins : int
        number of equal-frequency bins requested per feature

    Returns
    ----------
    df_new : pandas.DataFrame
        copy of ``df`` with one extra column per feature, named
        ``<feature>_bin``. Features with more than ``2 * n_bins`` distinct
        values hold their bin interval there; all others hold a copy of the
        raw values.
    """
    df_new = df.copy()
    # A feature is only binned when it has clearly more distinct values than bins.
    distinct_threshold = n_bins * 2

    for feat in feats:
        bin_column = str(feat) + '_bin'
        distinct_count = len(df[feat].unique())

        if distinct_count <= distinct_threshold:
            # Too few distinct values: keep the raw values as the "bins".
            df_new[bin_column] = df_new[feat]
            continue

        # Identical quantile edges are merged, so fewer bins than n_bins may result.
        df_new[bin_column] = pd.qcut(df[feat], n_bins, duplicates='drop')

    return df_new
示例11: y_transform
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def y_transform(Y, data, flatten):
    """Convert the target column ``data[Y]`` according to ``flatten``.

    * ``'mean'`` / ``'median'`` / ``'mode'``: boolean frame, True where the
      value is >= that statistic (first mode is used).
    * an ``int``: boolean frame, True where the value is >= ``flatten``.
    * a ``float``: boolean frame, True where the value is >= the ``flatten``
      quantile of the column.
    * ``'cat_string'``: frame of category codes of the raw values.
    * ``'cat_numeric'``: frame of category codes after cutting into five
      quantile bins (duplicate edges dropped).
    * ``'none'``: the column wrapped in a DataFrame.

    Any other value returns ``data[Y]`` untouched.
    """
    target = data[Y]

    # Binary targets derived from a threshold on a continuous column.
    if flatten == 'mean':
        return pd.DataFrame(target >= target.mean())
    if flatten == 'median':
        return pd.DataFrame(target >= target.median())
    if flatten == 'mode':
        return pd.DataFrame(target >= target.mode()[0])
    if type(flatten) == int:
        return pd.DataFrame(target >= flatten)
    if type(flatten) == float:
        return pd.DataFrame(target >= target.quantile(flatten))

    # Categorical targets expressed as integer codes.
    if flatten == 'cat_string':
        as_category = pd.Categorical(target)
        return pd.DataFrame(pd.Series(as_category).cat.codes)
    if flatten == 'cat_numeric':
        quintiles = pd.qcut(target, 5, duplicates='drop')
        return pd.DataFrame(pd.Series(quintiles).cat.codes)

    # Target already has the shape of the prediction output.
    if flatten == 'none':
        return pd.DataFrame(target)

    return target
示例12: build_column
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def build_column(self, data):
    """Bin ``data[col]`` and return the bin label of every row as a Series.

    Reads ``col``, ``operation``, ``bins`` and ``labels`` from ``self.cfg``.
    ``operation == "cut"`` uses ``pd.cut``; anything else uses ``pd.qcut``.
    When ``labels`` (a comma-separated string) is given, its entries name the
    bins in order; otherwise the string form of each bin interval is used.
    The result is indexed like ``data`` and named ``self.name``.
    """
    settings = self.cfg
    col = settings.get("col")
    operation = settings.get("operation")
    bins = int(settings.get("bins"))
    labels = settings.get("labels")

    source = data[col]
    if operation == "cut":
        bin_data = pd.cut(source, bins=bins)
    else:
        bin_data = pd.qcut(source, q=bins)

    # Map each category code to its display label.
    label_source = labels.split(",") if labels else bin_data.cat.categories
    cats = {idx: str(cat) for idx, cat in enumerate(label_source)}

    labelled = bin_data.cat.codes.map(cats)
    return pd.Series(labelled, index=data.index, name=self.name)
示例13: build_code
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def build_code(self):
    """Return Python source that reproduces this binning on a DataFrame named ``df``.

    Reads ``col``, ``operation``, ``bins`` and ``labels`` from ``self.cfg``.
    The generated snippet defines ``<name>_data`` (the binned column) and
    ``<name>_cats`` (code -> label mapping), then assigns the labelled result
    to ``df['<name>']``, where ``<name>`` is ``self.name``.

    Returns
    -------
    str
        Newline-joined statements that expect ``pd`` and ``df`` in scope.
    """
    col, operation, bins, labels = (
        self.cfg.get(p) for p in ["col", "operation", "bins", "labels"]
    )
    name = self.name

    if operation == "cut":
        call = "pd.cut(df['{col}'], bins={bins})"
    else:
        # pd.qcut takes the bin count as ``q``; passing ``bins=`` raises TypeError.
        call = "pd.qcut(df['{col}'], q={bins})"
    bins_code = [
        "{name}_data = ".format(name=name) + call.format(col=col, bins=bins)
    ]

    if labels:
        # repr() quotes every label so the emitted dict literal is valid code.
        labels_str = ", ".join(
            "{}: {!r}".format(idx, cat) for idx, cat in enumerate(labels.split(","))
        )
        bins_code.append(
            "{name}_cats = {{{labels}}}".format(name=name, labels=labels_str)
        )
    else:
        # Doubled braces survive .format() as the literal braces of the comprehension.
        bins_code.append(
            "{name}_cats = {{idx: str(cat) for idx, cat in enumerate({name}_data.cat.categories)}}".format(
                name=name
            )
        )

    s_str = "df.loc[:, '{name}'] = pd.Series({name}_data.cat.codes.map({name}_cats), index=df.index, name='{name}')"
    bins_code.append(s_str.format(name=name))
    return "\n".join(bins_code)
示例14: add_returns_in_place
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def add_returns_in_place(df):  # modifies df
    """Attach returns and their decile bins to ``df`` and return the bin categories.

    The returns come from ``compute_returns(df)`` (defined elsewhere). Three
    columns are written onto ``df``: ``close_price_returns``,
    ``close_price_returns_bins`` (decile intervals) and
    ``close_price_returns_labels`` (integer decile codes).

    Returns the mutated ``df`` together with the interval categories.
    """
    returns = compute_returns(df)
    decile_count = 10

    interval_bins = pd.qcut(returns, decile_count)
    integer_labels = pd.qcut(returns, decile_count, labels=False)

    df['close_price_returns'] = returns
    df['close_price_returns_bins'] = interval_bins
    df['close_price_returns_labels'] = integer_labels
    return df, interval_bins.values.categories
示例15: fit
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import qcut [as 别名]
def fit(self, X, y=None):
    """
    Learn the equal-frequency interval limits (quantiles) of each variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the variables to be transformed.
    y : None
        y is not needed in this encoder. You can pass y or None.

    Attributes
    ----------
    binner_dict_: dictionary
        The dictionary containing the {variable: interval limits} pairs used
        to sort the values into discrete intervals.
    input_shape_: tuple
        Shape of ``X`` as returned by the parent ``fit``.
    """
    # The parent fit returns the dataframe to work on (presumably after
    # validating it -- defined outside this view).
    X = super().fit(X, y)

    self.binner_dict_ = {}
    for variable in self.variables:
        _, edges = pd.qcut(
            x=X[variable], q=self.q, retbins=True, duplicates='drop'
        )
        # Open both outer limits so values outside the fitted range
        # still fall into the first or last interval.
        edges = list(edges)
        edges[0] = float("-inf")
        edges[-1] = float("inf")
        self.binner_dict_[variable] = edges

    self.input_shape_ = X.shape
    return self