本文整理汇总了Python中pandas.api.types.is_categorical_dtype方法的典型用法代码示例。如果您正苦于以下问题:Python types.is_categorical_dtype方法的具体用法?Python types.is_categorical_dtype怎么用?Python types.is_categorical_dtype使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pandas.api.types
的用法示例。
在下文中一共展示了types.is_categorical_dtype方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: write_series
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    """Write a pandas Series (or Index) into an h5py group under ``key``.

    ``group`` must be an h5py group, otherwise categoricals won't write.
    Categoricals are stored as an integer codes dataset plus a referenced
    ``__categories/<key>`` dataset; object dtype is assumed to hold strings.
    """
    if is_categorical_dtype(series):
        # Works for both categorical Series and categorical Index
        cat: pd.Categorical = series.values
        cat_key = f"__categories/{key}"
        write_array(group, cat_key, cat.categories.values, dataset_kwargs=dataset_kwargs)
        write_array(group, key, cat.codes, dataset_kwargs=dataset_kwargs)
        group[key].attrs["categories"] = group[cat_key].ref
        group[cat_key].attrs["ordered"] = cat.ordered
    elif series.dtype == object:  # Assuming it's string
        group.create_dataset(
            key,
            data=series.values,
            dtype=h5py.special_dtype(vlen=str),
            **dataset_kwargs,
        )
    else:
        group[key] = series.values
示例2: write_series
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    """Write a pandas Series (or Index) into a zarr group under ``key``.

    Categoricals are stored as an integer codes dataset plus a
    ``__categories/<key>`` dataset referenced by path in the attrs;
    object dtype is assumed to hold strings.
    """
    if is_categorical_dtype(series):
        # Works for both categorical Series and categorical Index
        cat: pd.Categorical = series.values
        cat_key = f"__categories/{key}"
        write_array(group, cat_key, cat.categories.values, dataset_kwargs=dataset_kwargs)
        write_array(group, key, cat.codes, dataset_kwargs=dataset_kwargs)
        group[key].attrs["categories"] = cat_key
        # np.bool_ is not json-serializable; coerce to a plain bool
        group[cat_key].attrs["ordered"] = bool(cat.ordered)
    elif series.dtype == object:
        group.create_dataset(
            key,
            shape=series.shape,
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
            **dataset_kwargs,
        )
        group[key][:] = series.values
    else:
        group[key] = series.values
示例3: _fit
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def _fit(self, X: DataFrameType):
    """Pick the columns to treat as categorical and capture their categories.

    Returns
    -------
    (columns, categories)
        ``categories`` maps each column name to a CategoricalDtype when
        available (``_HAS_CTD``), otherwise to a ``(categories, ordered)``
        pair.
    """
    if self.columns is not None:
        columns = self.columns
    else:
        columns = X.select_dtypes(include=["object", "category"]).columns

    categories = {}
    for name in columns:
        col = X[name]
        if not is_categorical_dtype(col):
            # A dask.array never reaches this branch: its object columns
            # would already have been converted to known categoricals.
            col = pd.Series(col, index=X.index).astype("category")
        if _HAS_CTD:
            categories[name] = col.dtype
        else:
            categories[name] = (col.cat.categories, col.cat.ordered)
    return columns, categories
示例4: _add_group_columns
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def _add_group_columns(data, gdf):
    """
    Add group columns to data with a value from the grouped dataframe

    It is assumed that the grouped dataframe contains a single group

    >>> data = pd.DataFrame({
    ...     'x': [5, 6, 7]})
    >>> gdf = GroupedDataFrame({
    ...     'g': list('aaa'),
    ...     'x': range(3)}, groups=['g'])
    >>> _add_group_columns(data, gdf)
       g  x
    0  a  5
    1  a  6
    2  a  7
    """
    if not isinstance(gdf, GroupedDataFrame):
        return data

    n = len(data)
    for position, name in enumerate(gdf.plydata_groups):
        if name in data:
            continue
        repeated = [gdf[name].iloc[0]] * n
        # Careful: the new column must keep the dtype of the group column
        if pdtypes.is_categorical_dtype(gdf[name]):
            values = pd.Categorical(
                repeated,
                categories=gdf[name].cat.categories,
                ordered=gdf[name].cat.ordered,
            )
        else:
            values = pd.Series(repeated, index=data.index, dtype=gdf[name].dtype)
        # Group columns come first
        data.insert(position, name, values)
    return data
示例5: test_summarize
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def test_summarize():
    """summarize(): expressions, keywords, tuples, groups and categoricals."""
    df = pd.DataFrame({'x': [1, 5, 2, 2, 4, 0, 4],
                       'y': [1, 2, 3, 4, 5, 6, 5],
                       'z': [1, 3, 3, 4, 5, 5, 5]})

    result = df >> summarize('np.sum(x)', max='np.max(x)')
    assert result.loc[0, 'max'] == np.max(df['x'])
    assert result.loc[0, 'np.sum(x)'] == np.sum(df['x'])

    result = df >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    assert 'y' in result
    assert 'z' in result
    assert all(result['mean_x'] == [1, 5, 2, 2, 4, 0])

    # (Name, Expression) tuples
    result = df >> summarize(('sum', 'np.sum(x)'), ('max', 'np.max(x)'))
    assert 'sum' in result
    assert 'max' in result

    # Branches
    result = df >> group_by('y') >> summarize('np.sum(z)', constant=1)
    assert 'y' in result
    assert result.loc[0, 'constant'] == 1

    # Category stays category
    df1 = df.copy()
    df1['z'] = pd.Categorical(df1['z'])
    result = df1 >> group_by('y', 'z') >> summarize(mean_x='np.mean(x)')
    # np.int was removed in NumPy 1.24; the builtin int gives the same dtype check
    assert result['y'].dtype == np.dtype(int)
    assert pdtypes.is_categorical_dtype(result['z'])
示例6: test_group_by_all
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def test_group_by_all():
    """group_by_all(): no converter, bare converter, and named-converter dict."""
    df = pd.DataFrame({
        'alpha': list('aaabbb'),
        'beta': list('babruq'),
        'theta': list('cdecde'),
        'x': [1, 2, 3, 4, 5, 6],
        'y': [6, 5, 4, 3, 2, 1],
        'z': [7, 9, 11, 8, 10, 12]
    })
    ncols = len(df.columns)

    result = df >> group_by_all()
    assert len(result.columns) == ncols
    assert len(result.plydata_groups) == ncols

    result = df >> group_by_all(pd.Categorical)
    assert len(result.columns) == ncols
    assert len(result.plydata_groups) == ncols

    result = df >> group_by_all(dict(cat=pd.Categorical))
    assert len(result.columns) == 2 * ncols
    for name in df.columns:
        suffixed = '{}_cat'.format(name)
        assert not pdtypes.is_categorical_dtype(result[name])
        assert pdtypes.is_categorical_dtype(result[suffixed])

    result = (df
              >> group_by('x')
              >> group_by_all(dict(cat=pd.Categorical)))
    assert result.plydata_groups == [
        '{}_cat'.format(name) for name in df.columns if name != 'x']
    assert len(result.columns) == 2 * ncols - 1
    assert 'x_cat' not in result
示例7: get_errors
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def get_errors(self, series: pd.Series, column: 'column.Column'):
    """Return a ValidationWarning for every cell in ``series`` that fails
    this validator's ``validate`` check, honouring ``column.allow_empty``.
    """
    # Rows failing the child class's validate function
    failing = ~self.validate(series)

    if column.allow_empty:
        # Empty entries are excused: flag only rows that are non-empty AND
        # failing.  Check for categorical explicitly, because issubdtype
        # blows up on categorical dtypes.
        if is_categorical_dtype(series) or is_numeric_dtype(series):
            non_empty = ~series.isnull()
        else:
            non_empty = series.str.len() > 0
        failing = non_empty & failing

    # Reduce to the failing rows; the index is usually a row number
    errors = []
    for row in series.index[failing]:
        errors.append(ValidationWarning(
            message=self.message,
            value=series[row],
            row=row,
            column=series.name
        ))
    return errors
示例8: _id_var
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def _id_var(x, drop=False):
"""
Assign ids to items in x. If two items
are the same, they get the same id.
Parameters
----------
x : array-like
items to associate ids with
drop : bool
Whether to drop unused factor levels
"""
if len(x) == 0:
return []
categorical = pdtypes.is_categorical_dtype(x)
if categorical:
if drop:
x = x.cat.remove_unused_categories()
lst = list(x.cat.codes + 1)
else:
has_nan = any(np.isnan(i) for i in x if isinstance(i, float))
if has_nan:
# NaNs are -1, we give them the highest code
nan_code = -1
new_nan_code = np.max(x.cat.codes) + 1
lst = [val if val != nan_code else new_nan_code for val in x]
else:
lst = list(x.cat.codes + 1)
else:
try:
levels = np.sort(np.unique(x))
except TypeError:
# x probably has NANs
levels = multitype_sort(set(x))
lst = match(x, levels)
lst = [item + 1 for item in lst]
return lst
示例9: get_var_type
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        return 'c'  # continuous
    if not pdtypes.is_categorical_dtype(col):
        # Unordered if unsure, e.g. string columns that are not categorical
        return 'u'
    # Categorical: ordered vs unordered
    return 'o' if col.cat.ordered else 'u'
示例10: fix_known_differences
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def fix_known_differences(orig, result):
    """
    Helper function for reducing anndata's to only the elements we expect to be
    equivalent after concatenation.

    Only for the case where orig is the ground truth result of what concatenation should be.
    """
    orig, result = orig.copy(), result.copy()

    result.obs.drop(columns=["batch"], inplace=True)
    result.strings_to_categoricals()  # Should this be implicit in concatenation?

    # TODO
    # * merge varm, varp similar to uns
    # * merge obsp, but some information should be lost
    del orig.varm
    del orig.varp
    del orig.obsp  # TODO

    # Ordered categoricals lose their orderedness on concat; restore it
    for name, dtype in orig.obs.dtypes.items():
        if is_categorical_dtype(dtype) and dtype.ordered:
            result.obs[name] = result.obs[name].astype(dtype)

    return orig, result
示例11: describe
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def describe(data):
    """Build a per-column summary table of ``data``.

    For every column the result records (as rows): dtype, max (or the most
    frequent level), min (or the least frequent level), mean (or the level
    whose frequency equals the median frequency), missing rate, and std
    (or number of unique levels).  Columns that are neither numeric,
    categorical nor datetime get NaN placeholders.
    """
    data = pd.DataFrame(data)
    n_sample = len(data)
    var_type = type_of_var(data, copy=True)
    # NOTE(review): 'nuniue' is a typo for 'nunique', kept verbatim so any
    # existing consumer that indexes this row keeps working.
    summary = pd.DataFrame(
        columns=data.columns,
        index=['dtype', 'max', 'min', 'mean', 'missing_pct', 'std/nuniue'],
    )
    for c in data.columns:
        missing_pct = 1 - data[c].count() / n_sample
        if var_type[c] == 'number':
            max_value, min_value, mean_value = data[c].max(), data[c].min(), data[c].mean()
            std_value = data[c].std()
            summary.loc[:, c] = [var_type[c], max_value, min_value, mean_value,
                                 missing_pct, std_value]
        elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype):
            tmp = data[c].value_counts()
            # idxmax/idxmin return the level label; Series.argmax/argmin
            # return *positional* indices in modern pandas, so the old
            # argmax/argmin calls no longer yielded the level itself.
            max_value, min_value = tmp.idxmax(), tmp.idxmin()
            mean_value_index = tmp[tmp == tmp.median()].index
            mean_value = mean_value_index[0] if len(mean_value_index) > 0 else np.nan
            summary.loc[:, c] = [var_type[c], max_value, min_value, mean_value,
                                 missing_pct, len(tmp)]
        elif var_type[c] == 'datetime':
            max_value, min_value = data[c].max(), data[c].min()
            summary.loc[:, c] = [var_type[c], max_value, min_value, np.nan,
                                 missing_pct, np.nan]
        else:
            summary.loc[:, c] = [var_type[c], np.nan, np.nan, np.nan,
                                 missing_pct, np.nan]
    return summary
示例12: fit
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def fit(
    self, X: DataFrameType, y: Optional[Union[ArrayLike, SeriesType]] = None
) -> "OrdinalEncoder":
    """Determine the categorical columns to be encoded.

    Parameters
    ----------
    X : pandas.DataFrame or dask.dataframe.DataFrame
    y : ignored

    Returns
    -------
    self
    """
    self.columns_ = X.columns

    if self.columns is None:
        columns = X.select_dtypes(include=["category"]).columns
    else:
        columns = self.columns
        for column in columns:
            assert is_categorical_dtype(X[column]), "Must be categorical"

    self.categorical_columns_ = columns
    self.non_categorical_columns_ = X.columns.drop(columns)

    if _HAS_CTD:
        self.dtypes_ = {name: X[name].dtype for name in columns}
    else:
        self.dtypes_ = {
            name: (X[name].cat.categories, X[name].cat.ordered)
            for name in columns
        }
    return self
示例13: test_ce
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def test_ce(self):
    """Categorizer converts object columns, leaves int columns, and does
    not mutate its input frame."""
    encoder = dpp.Categorizer()
    snapshot = raw.copy()

    transformed = encoder.fit_transform(raw)
    for name in ("A", "B", "C"):
        assert is_categorical_dtype(transformed[name])
    assert transformed["D"].dtype == np.dtype("int64")
    tm.assert_index_equal(encoder.columns_, pd.Index(["A", "B", "C"]))
    # input must be untouched
    tm.assert_frame_equal(raw, snapshot)
示例14: test_dask
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def test_dask(self):
    """Categorizer works the same on a dask DataFrame."""
    ddf = dd.from_pandas(raw, npartitions=2)
    encoder = dpp.Categorizer()

    transformed = encoder.fit_transform(ddf)
    for name in ("A", "B", "C"):
        assert is_categorical_dtype(transformed[name])
    assert transformed["D"].dtype == np.dtype("int64")
    tm.assert_index_equal(encoder.columns_, pd.Index(["A", "B", "C"]))
示例15: test_upload_pandas_categorical_ipc
# 需要导入模块: from pandas.api import types [as 别名]
# 或者: from pandas.api.types import is_categorical_dtype [as 别名]
def test_upload_pandas_categorical_ipc(self, con):
    """Round-trip a categorical column through every load_table code path."""
    con.execute("DROP TABLE IF EXISTS test_categorical;")
    df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    df["B"] = df["A"].astype('category')

    # test that table created correctly when it doesn't exist on server
    con.load_table("test_categorical", df)
    ans = con.execute("select * from test_categorical").fetchall()
    assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

    assert con.get_table_details("test_categorical") == [
        ColumnDetails(
            name='A',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
        ColumnDetails(
            name='B',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
    ]

    # load row-wise
    con.load_table("test_categorical", df, method="rows")
    # load columnar
    con.load_table("test_categorical", df, method="columnar")
    # load arrow
    con.load_table("test_categorical", df, method="arrow")

    # test end result
    df_ipc = con.select_ipc("select * from test_categorical")
    assert df_ipc.shape == (16, 2)

    # DataFrame.append was removed in pandas 2.0; build the 4x-stacked
    # expected frame with pd.concat instead.
    res = pd.concat([df] * 4).reset_index(drop=True)
    res["A"] = res["A"].astype('category')
    res["B"] = res["B"].astype('category')
    assert pd.DataFrame.equals(df_ipc, res)

    # test that input df wasn't mutated
    # original input is object, categorical
    # to load via Arrow, converted internally to object, object
    assert is_object_dtype(df["A"])
    assert is_categorical_dtype(df["B"])
    con.execute("DROP TABLE IF EXISTS test_categorical;")