This article collects typical usage examples of the dask.dataframe.DataFrame method in Python. If you are wondering what dataframe.DataFrame does, how to use it, or simply want to see working examples, the hand-picked code samples below should help. You can also browse the dask.dataframe module for further usage examples.
The following presents 15 code examples of the dataframe.DataFrame method, sorted by popularity by default.
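Before the examples, here is a minimal, self-contained sketch of how a dask.dataframe.DataFrame is typically created (via dd.from_pandas; the column names and partition count are illustrative):

import pandas as pd
import dask.dataframe as dd

# Wrap a small pandas frame in a dask.dataframe.DataFrame split into two partitions.
pdf = pd.DataFrame({"x": range(10), "y": list("abcdefghij")})
ddf = dd.from_pandas(pdf, npartitions=2)

print(ddf.npartitions)        # 2
print(ddf.x.sum().compute())  # operations are lazy until .compute() is called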
Example 1: extract_dask_data
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def extract_dask_data(data):
"""Extract data from dask.Series or dask.DataFrame for predictors.
Given a distributed dask.DataFrame or dask.Series containing columns or names
for one or more predictors, this operation returns a single dask.DataFrame or
dask.Series that can be iterated over.
Args:
data: A distributed dask.DataFrame or dask.Series.
Returns:
A dask.DataFrame or dask.Series that can be iterated over.
If the supplied argument is neither a dask.DataFrame nor a dask.Series this
operation returns it without modification.
"""
if isinstance(data, allowed_classes):
return _construct_dask_df_with_divisions(data)
else:
return data
Example 2: _load_table
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
""" Load a data frame from table formats: csv, hdf5, feather """
if fmt == 'csv':
_data = pd.read_csv(src, *args, **kwargs)
elif fmt == 'feather':
_data = feather.read_dataframe(src, *args, **kwargs)
elif fmt == 'hdf5':
_data = pd.read_hdf(src, *args, **kwargs)
# Put into this batch only part of it (defined by index)
if isinstance(_data, pd.DataFrame):
_data = _data.loc[self.indices]
elif isinstance(_data, dd.DataFrame):
# dask.DataFrame.loc supports advanced indexing only with lists
_data = _data.loc[list(self.indices)].compute()
if callable(post):
_data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)
self.load(src=_data, dst=dst)
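The dask-specific detail in _load_table is that dd.DataFrame.loc only supports label selection with a list, and the lazy result must be materialized with .compute(). A small standalone sketch of that pattern (index labels and values are illustrative):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(10)}, index=list("abcdefghij"))
ddf = dd.from_pandas(pdf, npartitions=2)

# Pass a plain list of labels to .loc, then turn the lazy selection
# into a concrete pandas DataFrame with .compute().
subset = ddf.loc[list("ace")].compute()
print(subset)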
Example 3: groupby_pandas
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def groupby_pandas(self_or_cls, ndmapping, dimensions, container_type,
group_type, sort=False, **kwargs):
if 'kdims' in kwargs:
idims = [ndmapping.get_dimension(d) for d in kwargs['kdims']]
else:
idims = [dim for dim in ndmapping.kdims if dim not in dimensions]
all_dims = [d.name for d in ndmapping.kdims]
inds = [ndmapping.get_dimension_index(dim) for dim in idims]
getter = operator.itemgetter(*inds) if inds else lambda x: tuple()
multi_index = pd.MultiIndex.from_tuples(ndmapping.keys(), names=all_dims)
df = pd.DataFrame(list(map(wrap_tuple, ndmapping.values())), index=multi_index)
# TODO: Look at sort here
kwargs = dict(dict(get_param_values(ndmapping), kdims=idims), sort=sort, **kwargs)
groups = ((wrap_tuple(k), group_type(OrderedDict(unpack_group(group, getter)), **kwargs))
for k, group in df.groupby(level=[d.name for d in dimensions], sort=sort))
if sort:
selects = list(get_unique_keys(ndmapping, dimensions))
groups = sorted(groups, key=lambda x: selects.index(x[0]))
return container_type(groups, kdims=dimensions, sort=sort)
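groupby_pandas builds a pandas frame indexed by a MultiIndex over all key dimensions and then groups on a subset of the index levels. The sketch below isolates that pandas pattern (dimension names and values are made up):

import pandas as pd

index = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["x", "y"]
)
df = pd.DataFrame({"val": [10, 20, 30, 40]}, index=index)

# Group on the 'x' level only and keep the original key order (sort=False),
# just as groupby_pandas does for the requested dimensions.
for key, group in df.groupby(level=["x"], sort=False):
    print(key, group["val"].tolist())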
Example 4: _get_group_info
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _get_group_info(path, grouppath, keys):
with h5py.File(path, "r") as f:
grp = f[grouppath]
if keys is None:
keys = list(grp.keys())
nrows = len(grp[keys[0]])
categoricals = {}
for key in keys:
dt = h5py.check_dtype(enum=grp[key].dtype)
if dt is not None:
categoricals[key] = sorted(dt, key=dt.__getitem__)
# Meta is an empty dataframe that serves as a compound "dtype"
meta = pd.DataFrame(
{key: np.array([], dtype=grp[key].dtype) for key in keys}, columns=keys
)
for key in categoricals:
meta[key] = pd.Categorical([], categories=categoricals[key], ordered=True)
return nrows, keys, meta, categoricals
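_get_group_info only collects metadata (row count, keys, an empty meta frame, and categorical mappings). One plausible way to turn that into a dask DataFrame is dd.from_delayed with the meta frame, as sketched below; the _slice_group helper, file path, group path, and chunk size are assumptions for illustration, and categorical decoding is omitted:

import h5py
import pandas as pd
import dask
import dask.dataframe as dd

def _slice_group(path, grouppath, keys, start, stop):
    # Read rows [start, stop) of every dataset in the HDF5 group into pandas.
    with h5py.File(path, "r") as f:
        grp = f[grouppath]
        return pd.DataFrame({k: grp[k][start:stop] for k in keys}, columns=keys)

nrows, keys, meta, _ = _get_group_info("data.h5", "/table", None)
chunksize = 1_000_000
parts = [
    dask.delayed(_slice_group)("data.h5", "/table", keys, i, min(i + chunksize, nrows))
    for i in range(0, nrows, chunksize)
]
ddf = dd.from_delayed(parts, meta=meta)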
Example 5: pack_payload_pandas
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def pack_payload_pandas(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
try:
# Technically distributed is an optional dependency
from distributed.protocol import serialize_bytes
except ImportError:
_logger.warning(
"Shuffle payload columns cannot be compressed since distributed is not installed."
)
return partition
if partition.empty:
res = partition[group_key]
res[_PAYLOAD_COL] = b""
else:
res = partition.groupby(
group_key,
sort=False,
observed=True,
            # Keep as_index=True so that the group values are not dropped; with
            # this the behaviour seems to be consistent across pandas versions.
as_index=True,
).apply(lambda x: pd.Series({_PAYLOAD_COL: serialize_bytes(x)}))
res = res.reset_index()
return res
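pack_payload_pandas operates on a single partition. The excerpt does not show the dask-level wrapper, but a plausible (hypothetical) one would apply it per partition via map_partitions, using the module's _PAYLOAD_COL constant seen above to build the meta frame:

import dask.dataframe as dd

def pack_payload(df: dd.DataFrame, group_key):
    # Hypothetical wrapper: build a meta frame with the group columns plus the
    # payload column, then apply the per-partition packing lazily.
    packed_meta = df._meta[group_key].copy()
    packed_meta[_PAYLOAD_COL] = b""
    return df.map_partitions(
        pack_payload_pandas, group_key=group_key, meta=packed_meta
    )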
Example 6: unpack_payload_pandas
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def unpack_payload_pandas(
partition: pd.DataFrame, unpack_meta: pd.DataFrame
) -> pd.DataFrame:
"""
Revert ``pack_payload_pandas`` and restore packed payload
unpack_meta:
A dataframe indicating the sc
"""
try:
# Technically distributed is an optional dependency
from distributed.protocol import deserialize_bytes
except ImportError:
_logger.warning(
"Shuffle payload columns cannot be compressed since distributed is not installed."
)
return partition
if partition.empty:
return unpack_meta.iloc[:0]
mapped = partition[_PAYLOAD_COL].map(deserialize_bytes)
return pd.concat(mapped.values, copy=False, ignore_index=True)
Example 7: unpack_payload
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def unpack_payload(df: dd.DataFrame, unpack_meta: pd.DataFrame) -> dd.DataFrame:
"""Revert payload packing of ``pack_payload`` and restores full dataframe."""
if (
# https://github.com/pandas-dev/pandas/issues/34455
isinstance(df._meta.index, pd.Float64Index)
        # TODO: Try to find out what's going on and file a bug report
# For datetime indices the apply seems to be corrupt
# s.t. apply(lambda x:x) returns different values
or isinstance(df._meta.index, pd.DatetimeIndex)
):
return df
return df.map_partitions(
unpack_payload_pandas, unpack_meta=unpack_meta, meta=unpack_meta
)
Example 8: to_indexable
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def to_indexable(*args, **kwargs):
"""Ensure that all args are an indexable type.
Conversion runs lazily for dask objects, immediately otherwise.
Parameters
----------
args : array_like or scalar
allow_scalars : bool, optional
Whether to allow scalars in args. Default is False.
"""
if kwargs.get("allow_scalars", False):
indexable = _maybe_indexable
else:
indexable = _indexable
for x in args:
if x is None or isinstance(x, (da.Array, dd.DataFrame)):
yield x
elif is_dask_collection(x):
yield delayed(indexable, pure=True)(x)
else:
yield indexable(x)
Example 9: to_keys
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def to_keys(dsk, *args):
for x in args:
if x is None:
yield None
elif isinstance(x, (da.Array, dd.DataFrame)):
x = delayed(x)
dsk.update(x.dask)
yield x.key
elif isinstance(x, Delayed):
dsk.update(x.dask)
yield x.key
else:
assert not is_dask_collection(x)
key = type(x).__name__ + "-" + tokenize(x)
dsk[key] = x
yield key
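A small usage sketch of to_keys, assuming the imports visible in these snippets (da, delayed, Delayed, tokenize, is_dask_collection) are available at module level; it shows how the shared graph dict is filled in:

import dask.array as da

dsk = {}
x = da.ones((4,), chunks=(2,))

keys = list(to_keys(dsk, x, None, 42))
# keys[0] is the key of the delayed-wrapped dask array, keys[1] is None, and
# keys[2] is a tokenized key whose value 42 was stored directly in dsk.
print(keys)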
Example 10: inverse_transform
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def inverse_transform(
self,
X: Union[ArrayLike, DataFrameType],
y: Optional[Union[ArrayLike, SeriesType]] = None,
copy: Optional[bool] = None,
) -> Union[ArrayLike, DataFrameType]:
if not hasattr(self, "scale_"):
        raise Exception(
            "This %s instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before "
            "using this method." % type(self).__name__
        )
X = X.copy()
if isinstance(X, dd.DataFrame):
X = X.sub(self.min_)
X = X.div(self.scale_)
else:
X -= self.min_
X /= self.scale_
return X
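The min_ and scale_ attributes and the dd.DataFrame branch suggest this is dask_ml's MinMaxScaler.inverse_transform. A hedged usage sketch under that assumption:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler

pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

scaler = MinMaxScaler()
scaled = scaler.fit_transform(ddf)           # lazily scales each column to [0, 1]
restored = scaler.inverse_transform(scaled)  # undoes the scaling using min_ and scale_
print(restored.compute())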
Example 11: _check_inputs
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _check_inputs(
self,
X: Union[ArrayLike, DataFrameType],
accept_sparse_negative: bool = False,
copy: bool = False,
in_fit: bool = True,
) -> Union[ArrayLike, DataFrameType]:
if isinstance(X, (pd.DataFrame, dd.DataFrame)):
X = X.values
if isinstance(X, np.ndarray):
C = len(X) // min(multiprocessing.cpu_count(), 2)
X = da.from_array(X, chunks=C)
rng = check_random_state(self.random_state)
# TODO: non-float dtypes?
# TODO: sparse arrays?
# TODO: mix of sparse, dense?
sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
super(QuantileTransformer, self)._check_inputs(
sample,
accept_sparse_negative=accept_sparse_negative,
copy=copy,
in_fit=in_fit,
)
return X
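_check_inputs coerces pandas or numpy input into a chunked dask array before handing a small sample to the parent class. As a usage sketch, assuming the class is dask_ml's QuantileTransformer (which the super() call suggests):

import dask.array as da
from dask_ml.preprocessing import QuantileTransformer

X = da.random.uniform(size=(1000, 3), chunks=(250, 3))
qt = QuantileTransformer(n_quantiles=100, output_distribution="uniform")
Xt = qt.fit_transform(X)
print(Xt.shape)  # (1000, 3), still a lazy dask array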
Example 12: _construct_dask_df_with_divisions
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _construct_dask_df_with_divisions(df):
"""Construct the new task graph and make a new dask.dataframe around it."""
divisions = _get_divisions(df)
# pylint: disable=protected-access
name = 'csv-index' + df._name
dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
for i in range(df.npartitions)}
# pylint: enable=protected-access
from toolz import merge # pylint: disable=g-import-not-at-top
if isinstance(df, dd.DataFrame):
return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
elif isinstance(df, dd.Series):
return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
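The helpers _get_divisions and _add_to_index are not part of this excerpt. Below is a plausible sketch of what they do, consistent with how they are called above (cumulative partition lengths, and shifting each partition's integer index by its offset); treat it as an assumption rather than the original implementation:

import numpy as np

def _get_divisions(df):
    # Cumulative row counts per partition: [0, len(p0), len(p0) + len(p1), ...]
    lengths = df.map_partitions(len).compute()
    divisions = np.cumsum(lengths).tolist()
    divisions.insert(0, 0)
    return divisions

def _add_to_index(df, start):
    # Shift this partition's RangeIndex so row indices are globally unique.
    df = df.copy()
    df.index += start
    return df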
Example 13: extract_dask_labels
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def extract_dask_labels(labels):
"""Extract data from dask.Series or dask.DataFrame for labels.
Given a distributed dask.DataFrame or dask.Series containing exactly one
column or name, this operation returns a single dask.DataFrame or dask.Series
that can be iterated over.
Args:
labels: A distributed dask.DataFrame or dask.Series with exactly one
column or name.
Returns:
A dask.DataFrame or dask.Series that can be iterated over.
If the supplied argument is neither a dask.DataFrame nor a dask.Series this
operation returns it without modification.
Raises:
ValueError: If the supplied dask.DataFrame contains more than one
column or the supplied dask.Series contains more than
one name.
"""
if isinstance(labels, dd.DataFrame):
ncol = labels.columns
elif isinstance(labels, dd.Series):
ncol = labels.name
if isinstance(labels, allowed_classes):
if len(ncol) > 1:
raise ValueError('Only one column for labels is allowed.')
return _construct_dask_df_with_divisions(labels)
else:
return labels
Example 14: _access
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _access(data, iloc):
"""Accesses an element from collection, using integer location based indexing.
Args:
data: array-like. The collection to access
iloc: `int` or `list` of `int`s. Location(s) to access in `collection`
Returns:
    The element of `data` found at location(s) `iloc`.
"""
if HAS_PANDAS:
import pandas as pd # pylint: disable=g-import-not-at-top
if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame):
return data.iloc[iloc]
return data[iloc]
Example 15: _dump_table
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _dump_table(self, dst, fmt='feather', components=None, *args, **kwargs):
""" Save batch data to table formats
Args:
dst: str - a path to dump into
fmt: str - format: feather, hdf5, csv
components: str or tuple - one or several component names
"""
filename = dst
components = tuple(components or self.components)
data_dict = {}
for comp in components:
comp_data = self.get(component=comp)
if isinstance(comp_data, pd.DataFrame):
data_dict.update(comp_data.to_dict('series'))
elif isinstance(comp_data, np.ndarray):
if comp_data.ndim > 1:
columns = [comp + str(i) for i in range(comp_data.shape[1])]
comp_dict = zip(columns, (comp_data[:, i] for i in range(comp_data.shape[1])))
                data_dict.update(dict(comp_dict))
else:
data_dict.update({comp: comp_data})
else:
data_dict.update({comp: comp_data})
_data = pd.DataFrame(data_dict)
if fmt == 'feather':
feather.write_dataframe(_data, filename, *args, **kwargs)
elif fmt == 'hdf5':
_data.to_hdf(filename, *args, **kwargs) # pylint:disable=no-member
elif fmt == 'csv':
_data.to_csv(filename, *args, **kwargs) # pylint:disable=no-member
else:
raise ValueError('Unknown format %s' % fmt)
return self