This article collects typical usage examples of the pandas.value_counts method in Python. If you are unsure what exactly pandas.value_counts does, how to call it, or how it is used in practice, the curated code examples below should help. You can also explore other usage examples from the pandas package that the method belongs to.
The following shows 15 code examples of pandas.value_counts, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
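Before diving into the examples, here is a minimal standalone sketch of what pandas.value_counts does: it counts the occurrences of each unique value and returns a Series sorted by count in descending order, with the unique values as the index. (Note that recent pandas releases deprecate the top-level pandas.value_counts in favor of the equivalent Series.value_counts method.)

import pandas as pd

s = pd.Series(["a", "b", "a", "c", "a", "b"])
counts = pd.value_counts(s)  # equivalent to s.value_counts()
print(counts)
# a    3
# b    2
# c    1
# The unique values form the index, so counts["a"] returns 3.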
Example 1: spatial_heatmap
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots."""
    logging.info("Nanoplotter: Creating heatmap of reads per channel using {} reads."
                 .format(array.size))
    activity_map = Plot(
        path=path + "." + figformat,
        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    valueCounts = pd.value_counts(pd.Series(array))
    for entry in valueCounts.keys():
        layout.template[np.where(layout.structure == entry)] = valueCounts[entry]
    plt.figure()
    ax = sns.heatmap(
        data=pd.DataFrame(layout.template, index=layout.yticks, columns=layout.xticks),
        xticklabels="auto",
        yticklabels="auto",
        square=True,
        cbar_kws={"orientation": "horizontal"},
        cmap=color,
        linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
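The value_counts call above maps each channel number to its read count, which is then written into a 2-D layout matrix. Plot and make_layout are NanoPlot internals, so the following is only a minimal sketch of the counting-and-filling step, with hypothetical stand-ins for layout.structure and layout.template:

import numpy as np
import pandas as pd

channels = np.array([1, 3, 3, 2, 1, 3])  # hypothetical channel number per read
value_counts = pd.value_counts(pd.Series(channels))
structure = np.array([[1, 2], [3, 4]])   # stand-in for layout.structure
template = np.zeros((2, 2))              # stand-in for layout.template
for entry in value_counts.keys():
    template[np.where(structure == entry)] = value_counts[entry]
print(template)  # [[2. 1.] [3. 0.]] -- channel 4 produced no reads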
Example 2: _analyze_field
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _analyze_field(self, df, id, output_folder=".", n_head=10):
    id = str(id)
    m = df.shape[1]
    df.columns = [str(i) for i in range(m)]
    agg_dict = dict()
    agg_dict[id] = "size"
    for i in range(int(id)):
        agg_dict[str(i)] = lambda x: ", ".join(
            pd.value_counts(x).index[:n_head])
    name_dict = dict()
    name_dict[id] = "count"
    df_analyze = df.groupby(id).agg(agg_dict).rename(
        columns=name_dict).reset_index()
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    log = u""
    log += u"Tags : {}\n".format(df_analyze.shape[0])
    tags = df_analyze[id].to_dict().values()
    tags = sorted(tags)
    log += u"List tags : {}\n".format(u", ".join(tags))
    df_analyze.to_excel(filename, index=False)
    return log
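The aggregation dictionary built above joins the n_head most frequent values of each column into a comma-separated string per group. A toy sketch of the same pattern on a hypothetical two-column frame:

import pandas as pd

df = pd.DataFrame({"0": ["x", "x", "y", "y", "y"],
                   "1": ["a", "b", "a", "a", "c"]})
top = df.groupby("0")["1"].agg(lambda x: ", ".join(pd.value_counts(x).index[:2]))
print(top)
# x    a, b   (tie between "a" and "b"; their relative order is arbitrary)
# y    a, c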
Example 3: test_blockwise_shufflesplit
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)
    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)
    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)
    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)
    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1
    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )
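Here value_counts doubles as a duplicate detector: if every training index occurs exactly once, the maximum count is 1. The same assertion in isolation:

import pandas as pd

train_idx = [0, 1, 2, 3]
assert pd.value_counts(pd.Series(train_idx)).max() == 1  # no index drawn twice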
Example 4: _visualize_helper
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _visualize_helper(
    model_dir, output_dir, subsample_rate, save_type, ordering, external_legend_params
):
    logger.info("Generating figures")
    # Data
    metadata_df = pd.read_csv(os.path.join(model_dir, "metadata.csv"))
    cluster_ids = np.load(os.path.join(model_dir, "cluster_ids.npy"))
    metadata_df["ax_1"] = cluster_ids[:, 0]
    metadata_df["ax_2"] = cluster_ids[:, 1]
    metadata_df["opponent_id"] = metadata_df["opponent_id"].apply(ABBREVIATIONS.get)

    def save_path(prefix):
        return osp.join(output_dir, f"{prefix}.{save_type}")

    # Downsample every opponent group to the size of the smallest one
    counts = pd.value_counts(metadata_df["opponent_id"])
    min_counts = counts.min()
    opponent_groups = metadata_df.groupby("opponent_id")
    opponent_dfs = {name: group.sample(n=min_counts) for name, group in opponent_groups}
    opponent_dfs = [opponent_dfs[label] for label in ordering]
    metadata_df = pd.concat(opponent_dfs)
    _plot_and_save_chart(save_path("combined"), [metadata_df])
    _plot_and_save_chart(save_path("subsampled"), [metadata_df.sample(frac=subsample_rate)])
    _plot_and_save_chart(save_path("sidebyside"), opponent_dfs)
    if external_legend_params is not None:
        _external_legend(osp.join(output_dir, "external_legend.pdf"))
    logger.info("Visualization complete")
Example 5: calc_tvd
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def calc_tvd(label_dict, attr):
    '''
    attr should be a 0,1 pandas dataframe with
    columns corresponding to label names

    for example:
    names = zip(*self.graph)[0]
    calc_tvd(label_dict, attr[names])

    label_dict should be a dictionary key: 1d-array of samples
    '''
    #### Calculate Total Variation ####
    if np.min(attr.values) < 0:
        raise ValueError('calc_tvd received '
                         'attr that may not have been in {0,1}')
    label_names = list(label_dict.keys())
    attr = attr[label_names]
    # Assign each unique 0/1 attribute combination an integer ID
    df2 = attr.drop_duplicates()
    df2 = df2.reset_index(drop=True).reset_index()
    df2 = df2.rename(columns={'index': 'ID'})
    real_data_id = pd.merge(attr, df2)
    # Empirical distribution of IDs in the real data
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf = real_counts / len(attr)
    # Round sampled labels to {0,1} and map them onto the same IDs
    label_list_dict = {k: np.round(v.ravel()) for k, v in label_dict.items()}
    df_dat = pd.DataFrame.from_dict(label_list_dict)
    dat_id = pd.merge(df_dat, df2, on=label_names, how='left')
    dat_counts = pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    # Total variation distance between the two empirical distributions
    diff = real_pdf.subtract(dat_pdf, fill_value=0)
    tvd = 0.5 * diff.abs().sum()
    return tvd
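The heart of calc_tvd is comparing two empirical distributions built with value_counts and taking half the sum of their absolute differences. A minimal sketch of that final step on hypothetical ID samples:

import pandas as pd

real_pdf = pd.value_counts(pd.Series([0, 0, 1, 2])) / 4  # empirical pdf of real IDs
fake_pdf = pd.value_counts(pd.Series([0, 1, 1, 1])) / 4  # empirical pdf of generated IDs
tvd = 0.5 * real_pdf.subtract(fake_pdf, fill_value=0).abs().sum()
print(tvd)  # 0.5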
Example 6: class_weights
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def class_weights(self):
    import pandas as pd
    label_freq = pd.value_counts(self.labels)
    class_weights = label_freq.median() / label_freq
    class_weights = class_weights.sort_index().values
    class_weights = torch.from_numpy(class_weights.astype(np.float32))
    return class_weights
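Without the torch conversion, this inverse-frequency weighting reduces to a couple of pandas operations; a standalone sketch with hypothetical labels:

import pandas as pd

labels = ["cat", "cat", "cat", "dog", "bird"]
label_freq = pd.value_counts(labels)            # cat 3, dog 1, bird 1
weights = (label_freq.median() / label_freq).sort_index()
print(weights)  # bird 1.0, cat 0.333..., dog 1.0 -- rare classes get larger weights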
Example 7: test_hash_uniformity
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def test_hash_uniformity(map_size_and_hashed_values):
    n, h = map_size_and_hashed_values
    k = len(h)
    num_bins = k // 5  # want about 5 items per bin for the chi-squared test
    bins = np.linspace(0, n + 1, num_bins)
    binned_data = pd.cut(h, bins)
    distribution = pd.value_counts(binned_data).sort_index()
    c, p = chisquare(distribution)
    assert p > 0.05, "Data not uniform"
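The same binning-plus-chi-squared recipe can be reproduced standalone; the fixture above (map_size_and_hashed_values) comes from the surrounding test suite, so this sketch substitutes hypothetical uniform random integers:

import numpy as np
import pandas as pd
from scipy.stats import chisquare

n, k = 1000, 500
h = np.random.default_rng(0).integers(0, n, size=k)  # hypothetical hashed values
bins = np.linspace(0, n + 1, k // 5)
distribution = pd.value_counts(pd.cut(h, bins)).sort_index()
c, p = chisquare(distribution)  # for genuinely uniform data, p is usually well above 0.05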
Example 8: columns_types
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def columns_types(self):
    return pd.value_counts(self.columns_stats.loc['types'])
Example 9: _get_deviation_of_mean
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _get_deviation_of_mean(self, series, multiplier=3):
    """
    Returns the count of values deviating from the mean, i.e. larger than `multiplier` * `std`.
    :type series: pandas.Series
    :param multiplier: number of standard deviations used as the cutoff
    :return: tuple of (count, percentage)
    """
    capped_series = np.minimum(
        series, series.mean() + multiplier * series.std())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
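The value_counts(series != capped_series) trick counts how many entries were capped, i.e. how many exceeded the cutoff. In isolation, with a hypothetical series containing one extreme value:

import numpy as np
import pandas as pd

series = pd.Series([1] * 20 + [100])
capped = np.minimum(series, series.mean() + 3 * series.std())
count = pd.value_counts(series != capped).get(True, 0)
print(count)  # 1 -- only the value 100 lies beyond mean + 3 * std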
Example 10: _get_median_absolute_deviation
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _get_median_absolute_deviation(self, series, multiplier=3):
    """
    Returns the count of values larger than `multiplier` * `mad`.
    :type series: pandas.Series
    :param multiplier: number of median absolute deviations used as the cutoff
    :return: tuple of (count, percentage)
    """
    capped_series = np.minimum(
        series, series.median() + multiplier * series.mad())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
Example 11: _get_categorical_summary
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _get_categorical_summary(self, column):
    series = self.df[column]
    # Only run if at least 1 non-missing value
    value_counts = series.value_counts()
    stats = {
        'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
    }
    return pd.concat([pd.Series(stats, name=column),
                      self.columns_stats[column]],
                     sort=True)
Example 12: _get_bool_summary
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _get_bool_summary(self, column):
    series = self.df[column]
    stats = {}
    for class_name, class_value in dict(series.value_counts()).items():
        stats['"{}" count'.format(class_name)] = '{}'.format(class_value)
        stats['"{}" perc'.format(class_name)] = '{}'.format(
            self._percent(class_value / self.length))
    return pd.concat([pd.Series(stats, name=column),
                      self.columns_stats[column]],
                     sort=True)
Example 13: _analyze_first_token
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def _analyze_first_token(self, df, id, output_folder="."):
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    df_analyze = df[id].value_counts().reset_index(name="count")
    df_analyze = df_analyze.rename(columns={"index": "0"})
    df_analyze.to_excel(filename, index=False)
    log = u""
    log += u"Unique words : {}\n".format(df_analyze.shape[0])
    log += u"Top words : {}\n".format(
        u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
    return log
Example 14: garbage_symbols
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)"
    )
    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))
    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n -> \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
    return rule_result
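Stripped of the bookkeeping (Result, tqdm and the Set alias come from the arche codebase), the core is str.extractall with named groups followed by value_counts over the stacked matches; a minimal sketch on a hypothetical column:

import pandas as pd

s = pd.Series(["clean", " leading space", "&amp; entity"])
pattern = r"(?P<spaces>^\s)|(?P<html_entities>&[a-zA-Z]{2,}?;)"
matches = s.astype(str).str.extractall(pattern)
print(matches.stack().value_counts())
# one leading-space match and one '&amp;' entity match, each with count 1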
Example 15: split_dataset
# Required module: import pandas [as alias]
# Or: from pandas import value_counts [as alias]
def split_dataset(dataset, seed):
    logger.info('Splitting the dataset')
    scaffolds = pd.value_counts(dataset['scaffold'])
    scaffolds = sorted(scaffolds.items(), key=lambda x: (-x[1], x[0]))
    # Hold out every 10th scaffold (by frequency rank) for the scaffold split
    test_scaffolds = set([x[0] for x in scaffolds[9::10]])
    dataset['SPLIT'] = 'train'
    test_scaf_idx = [x in test_scaffolds for x in dataset['scaffold']]
    dataset.loc[test_scaf_idx, 'SPLIT'] = 'test_scaffolds'
    # A random 10% of the remaining training rows becomes the plain test split
    test_idx = dataset.loc[dataset['SPLIT'] == 'train'].sample(
        frac=0.1, random_state=seed
    ).index
    dataset.loc[test_idx, 'SPLIT'] = 'test'
    dataset.drop('scaffold', axis=1, inplace=True)
    return dataset
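A hypothetical toy invocation: with only four distinct scaffolds, the scaffolds[9::10] slice is empty, so no scaffold is held out and only the random 10% 'test' split is populated.

import logging
import pandas as pd

logger = logging.getLogger(__name__)  # split_dataset expects a module-level logger

dataset = pd.DataFrame({"SMILES": ["mol%d" % i for i in range(20)],
                        "scaffold": ["s%d" % (i % 4) for i in range(20)]})
dataset = split_dataset(dataset, seed=0)
print(dataset["SPLIT"].value_counts())  # train 18, test 2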