This article collects typical usage examples of the dask.dataframe method in Python. If you are wondering how to use dask.dataframe, what it does, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples from the containing module, dask.
The following shows 15 code examples of the dask.dataframe method, sorted by popularity by default.
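Before the examples, a minimal sketch of how a dask dataframe is typically created and computed; the column names and values below are illustrative and not taken from any of the examples:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(6), "y": range(6)})
ddf = dd.from_pandas(pdf, npartitions=2)   # split into 2 lazy partitions
print(ddf["x"].mean().compute())           # graphs are built lazily and computed on demand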
Example 1: test_datasource_discover
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def test_datasource_discover(source_dataframe):
    r = source_dataframe.discover()
    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check that the attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1
Example 2: turn_dict_of_well_dfs_to_single_df
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """
    Takes a dict of dataframes, one per well as created by lasio (typically the
    first item in the list returned by the load_all_wells_in function), and
    returns a single dataframe containing all wells.
    """
    # start with an empty dataframe and list
    data_df = pd.DataFrame()
    list_of_df = []
    keys = list(dictOfWellDf.keys())
    # get the dict of well dataframes in values format
    values = dictOfWellDf.values()
    # tag each well dataframe with its key (UWI) and collect it in a list
    count = 0
    for each in values:
        each["UWI"] = keys[count]
        count += 1
        list_of_df.append(each)
    # concat the list into a single dataframe
    data_df = pd.concat(list_of_df)
    return data_df
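A short usage sketch of the function above; the well names and curve columns (DEPT, GR) are illustrative and not taken from the original project:

import pandas as pd

wells = {
    "well_A": pd.DataFrame({"DEPT": [100.0, 101.0], "GR": [55.2, 60.1]}),
    "well_B": pd.DataFrame({"DEPT": [200.0, 201.0], "GR": [48.7, 52.3]}),
}
combined = turn_dict_of_well_dfs_to_single_df(wells)
print(combined[["UWI", "DEPT", "GR"]])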
Example 3: normalize
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def normalize(cls, df, target_var, mean_list, stddev_list):
    """Normalizes the numerical columns in a dataframe.

    Arguments:
        df : dask dataframe, the dataframe to normalize
        target_var : string, dependent variable for the analysis
        mean_list : dask series, series with all the mean values
        stddev_list : dask series, series with all the standard deviation values

    Returns:
        df : dataframe with mean-normalized numerical columns
    """
    continuous_cols = [
        col for col in df.columns if df[col].dtype != 'object' and col != target_var]
    for col in continuous_cols:
        df[col] = df[col].sub(mean_list[col]).div(stddev_list[col])
    return df
Example 4: calculate_stats
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def calculate_stats(cls, df, target_var):
    """Calculates the descriptive stats of the dataframe required for cleaning.

    Arguments:
        df : dask dataframe, the dataframe at hand
        target_var : string, dependent variable for the analysis

    Returns:
        mean : dask series, mean of each column
        median : dask series, median of each column
        dict(zip(categorical_columns, mode)) : dict mapping each categorical
            column to its mode
        std : dask series, standard deviation of each column
    """
    categorical_columns = [
        col for col in df.columns if col != target_var and df[col].dtype == 'object']
    mean_op = df.mean()
    std_op = df.std()
    median_op = df.quantile(0.5)
    mode_op = [df[col].value_counts().idxmax()
               for col in categorical_columns]
    mean, median, mode, std = dask.compute(
        mean_op, median_op, mode_op, std_op)
    return mean, median, dict(zip(categorical_columns, mode)), std
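A minimal sketch of the two-step pattern shared by Examples 3 and 4 (build lazy aggregations, compute them once with dask.compute, then normalize); the column names and values are illustrative:

import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "label": [0, 1, 0, 1]})
df = dd.from_pandas(pdf, npartitions=2)

# compute the statistics in one pass, as calculate_stats does
mean, std = dask.compute(df["a"].mean(), df["a"].std())

# mean-center and scale the numeric column, as normalize does
df["a"] = df["a"].sub(mean).div(std)
print(df.compute())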
Example 5: _get_dask_meta_for_dataset
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Calculate a schema suitable for the dask dataframe meta from the dataset.
    """
    table_schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        table_schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        meta = meta.astype({col: "category" for col in categoricals})
        meta = dd.utils.clear_known_categories(meta, categoricals)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if categoricals_from_index:
        meta = meta.astype(categoricals_from_index[table])
    return meta
Example 6: test_hash_bucket
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def test_hash_bucket(col, num_buckets=5):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    hashed = _hash_bucket(df, [col], num_buckets)
    assert (hashed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"}) == 1).all().all()

    # Check that hashing is consistent for small dataframe sizes
    # (where df.col.nunique() < num_buckets)
    df_sample = df.iloc[[0, 7]]
    hashed_sample = _hash_bucket(df_sample, [col], num_buckets)
    expected = hashed.loc[df_sample.index]
    pdt.assert_frame_equal(expected, hashed_sample)
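_hash_bucket and _KTK_HASH_BUCKET above are kartothek internals; the following is a generic sketch of the same idea using only pandas (not kartothek's implementation), showing that rows with identical key values always land in the same bucket:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": np.repeat(np.arange(3), 4)})
num_buckets = 5
df["bucket"] = pd.util.hash_pandas_object(df["key"], index=False) % num_buckets
# every distinct key maps to exactly one bucket
assert (df.groupby("key")["bucket"].nunique() == 1).all()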
Example 7: test_transform
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def test_transform(kind):
    X, y = make_classification(chunks=100)
    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = base.transform(X)
    expected = wrap.transform(X)
    assert_eq_ar(result, expected)
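A short sketch of the dask.dataframe conversion used in the "dask.dataframe" branch above; the array shape and column names are illustrative:

import dask.array as da
import dask.dataframe as dd

X = da.ones((100, 3), chunks=(50, 3))
ddf = dd.from_dask_array(X, columns=["a", "b", "c"])
print(ddf.head())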
Example 8: test_transformed_shape
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def test_transformed_shape(self):
    # checks that the transformed objects have the correct number of columns
    a = dpp.PolynomialFeatures()
    a.fit(X)
    n_cols = len(a.get_feature_names())

    # dask array
    assert a.transform(X).shape[1] == n_cols
    # numpy array
    assert a.transform(X.compute()).shape[1] == n_cols
    # dask dataframe
    assert a.transform(df).shape[1] == n_cols
    # pandas dataframe
    assert a.transform(df.compute()).shape[1] == n_cols

    X_nan_rows = df.values
    df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
    # dask array with nan rows
    assert a.transform(X_nan_rows).shape[1] == n_cols
    # dask data frame with nan rows
    assert a.transform(df_none_divisions).shape[1] == n_cols
Example 9: to_dask
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def to_dask(self):
    import dask.dataframe as dd
    import dask
    return dd.from_delayed([dask.delayed(self._get_partition)(i)
                            for i in range(self.npartitions)])
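A self-contained sketch of the from_delayed pattern above, with a stand-in partition loader instead of the source's _get_partition method:

import dask
import dask.dataframe as dd
import pandas as pd

def load_partition(i):
    return pd.DataFrame({"part": [i] * 3, "value": range(3)})

parts = [dask.delayed(load_partition)(i) for i in range(2)]
ddf = dd.from_delayed(parts)   # one delayed call per partition
print(ddf.compute())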
Example 10: _open_dataset
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def _open_dataset(self, urlpath):
    """Open dataset using dask and use pattern fields to set new columns
    """
    import dask.dataframe

    if self.pattern is None:
        self._dataframe = dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options,
            **self._csv_kwargs)
        return

    if not (DASK_VERSION >= '0.19.0'):
        raise ValueError("Your version of dask is '{}'. "
                         "The ability to include filenames in read_csv output "
                         "(``include_path_column``) was added in 0.19.0, so "
                         "pattern urlpaths are not supported.".format(DASK_VERSION))

    drop_path_column = 'include_path_column' not in self._csv_kwargs
    path_column = self._path_column()

    self._dataframe = dask.dataframe.read_csv(
        urlpath, storage_options=self._storage_options, **self._csv_kwargs)

    # add the new columns to the dataframe
    self._set_pattern_columns(path_column)

    if drop_path_column:
        self._dataframe = self._dataframe.drop([path_column], axis=1)
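A hedged, runnable sketch of the include_path_column behaviour this driver relies on; the file paths are illustrative only:

import pandas as pd
import dask.dataframe as dd

# write two small CSV files, then read them back with the path column included
for name in ("a", "b"):
    pd.DataFrame({"x": [1, 2]}).to_csv(f"/tmp/demo_{name}.csv", index=False)

ddf = dd.read_csv("/tmp/demo_*.csv", include_path_column="path")
print(ddf.compute())   # the "path" column records which file each row came from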
Example 11: to_data_frame
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def to_data_frame(self):
    """
    Converts the trajectory data to DataFrame format.

    Returns
    -------
    dask.dataframe.DataFrame
        Represents the trajectory in DataFrame format.
    """
    return self._data
Example 12: generate_weekend_features
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def generate_weekend_features(self):
    """Create or update the weekend feature in the dataframe."""
    raise NotImplementedError('To be implemented')
Example 13: show_trajectories_info
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def show_trajectories_info(self):
    """Show dataset information from the dataframe."""
    raise NotImplementedError('To be implemented')
Example 14: min
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def min(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the minimum of the values for the requested axis.

    Parameters
    ----------
    axis : {index (0), columns (1)}, optional, default None
        Axis for the function to be applied on.
    skipna : bool, optional, default True
        Exclude NA/null values when computing the result.
    split_every : optional
        ?
    out : optional
        ?

    Returns
    -------
    min : Series or DataFrame (if level specified)
        The minimum values for the requested axis.

    References
    ----------
    https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.min
    """
    return self._data.min(axis, skipna, split_every, out)
Example 15: max
# Required import: import dask [as alias]
# Or: from dask import dataframe [as alias]
def max(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the maximum of the values for the requested axis.

    Parameters
    ----------
    axis : {index (0), columns (1)}, optional, default None
        Axis for the function to be applied on.
    skipna : bool, optional, default True
        Exclude NA/null values when computing the result.
    split_every : optional
        ?
    out : optional
        ?

    Returns
    -------
    max : Series or DataFrame (if level specified)
        The maximum values for the requested axis.

    References
    ----------
    https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.max
    """
    return self._data.max(axis, skipna, split_every, out)
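A minimal sketch of the underlying dask calls that the min and max wrappers above delegate to; the data is illustrative:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"lat": [1.0, 2.0, 3.0], "lon": [4.0, 5.0, 6.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
print(ddf.min().compute())   # column-wise minima
print(ddf.max().compute())   # column-wise maxima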