当前位置: 首页>>代码示例>>Python>>正文

Python dask.dataframe方法代码示例

本文整理汇总了Python中dask.dataframe方法的典型用法代码示例。如果您正苦于以下问题:Python dask.dataframe方法的具体用法?Python dask.dataframe怎么用?Python dask.dataframe使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在dask的用法示例。


示例1: test_datasource_discover

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def test_datasource_discover(source_dataframe):
    """Check that ``discover()`` reports the schema and stores it on the source.

    ``source_dataframe`` is supplied by the test suite. This test relies only
    on its ``discover()`` method, its schema attributes (``container``,
    ``datashape``, ``dtype``, ``shape``, ``npartitions``, ``metadata``) and its
    ``call_count`` mapping.
    """
    r = source_dataframe.discover()

    assert source_dataframe.container == 'dataframe'

    row_dtype = np.dtype([('x', np.int64), ('y', np.int64)])
    assert r == {
        'datashape': 'datashape',
        'dtype': row_dtype,
        'shape': (6,),
        'npartitions': 2,
        'metadata': dict(a=1, b=2, c=3, d=4),
    }

    # check attributes have been set
    assert source_dataframe.datashape == 'datashape'
    assert source_dataframe.dtype == row_dtype
    assert source_dataframe.shape == (6,)
    assert source_dataframe.npartitions == 2
    assert source_dataframe.metadata == dict(a=1, b=2, c=3, d=4)

    # check that _get_schema is only called once: a second discover() must
    # not bump the counter
    assert source_dataframe.call_count['_get_schema'] == 1
    source_dataframe.discover()
    assert source_dataframe.call_count['_get_schema'] == 1

示例2: turn_dict_of_well_dfs_to_single_df

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def turn_dict_of_well_dfs_to_single_df(dictOfWellDf):
    """Combine a dict of per-well dataframes into a single dataframe.

    Takes in a dict of dataframes, where each dataframe is for a well created
    by LASIO. Likely created by the load_all_wells_in function, where it is
    the first item in the returned list.

    Each dataframe is tagged in place with a ``"UWI"`` column holding its dict
    key, so the caller's dataframes gain that column as a side effect.

    Parameters
    ----------
    dictOfWellDf : dict
        Maps a well identifier to that well's dataframe.

    Returns
    -------
    pandas.DataFrame
        All wells concatenated in dict order, or an empty dataframe when
        ``dictOfWellDf`` is empty.
    """
    list_of_df = []
    for uwi, well_df in dictOfWellDf.items():
        well_df["UWI"] = uwi
        list_of_df.append(well_df)

    if not list_of_df:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()

    # concat the list into a single dataframe
    return pd.concat(list_of_df)

示例3: normalize

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def normalize(cls, df, target_var, mean_list, stddev_list):
    """Normalizes the numerical columns in a dataframe.

    Every column whose dtype is not ``'object'`` (``target_var`` excluded) is
    replaced by ``(column - mean) / stddev``. The new columns are assigned
    back onto ``df`` itself, and that same object is returned.

    Parameters
    ----------
    df : dask dataframe, The dataframe to normalize
    target_var : string, Dependent variable for the analysis
    mean_list : dask series, Series with all the mean values
    stddev_list : dask series, Series with all the standard deviation values

    Returns
    -------
    df : Dataframe with mean normalized numerical columns
    """
    continuous_cols = [
        col for col in df.columns
        if df[col].dtype != 'object' and col != target_var
    ]
    for col in continuous_cols:
        df[col] = df[col].sub(mean_list[col]).div(stddev_list[col])

    return df

示例4: calculate_stats

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def calculate_stats(cls, df, target_var):
    """Calculates descriptive stats of the dataframe required for cleaning.

    The mean, standard deviation, 0.5 quantile and per-column modes are built
    first and then evaluated together in a single ``dask.compute`` call.

    Parameters
    ----------
    df : dask dataframe, The dataframe at hand
    target_var : string, Dependent variable for the analysis

    Returns
    -------
    mean : dask series, mean of each column
    median : dask series, median of each column
    dict(zip(categorical_cols, mode)) : dict, Dictionary containing
            categorical column as keys and their modes as values
    std : dask series, standard deviation of each column
    """
    # Categorical columns are those with dtype 'object', target excluded.
    categorical_columns = [
        col for col in df.columns
        if col != target_var and df[col].dtype == 'object'
    ]
    mean_op = df.mean()
    std_op = df.std()
    median_op = df.quantile(0.5)
    # The mode of a column is the label with the highest value count.
    mode_op = [df[col].value_counts().idxmax()
               for col in categorical_columns]
    mean, median, mode, std = dask.compute(
        mean_op, median_op, mode_op, std_op)
    return mean, median, dict(zip(categorical_columns, mode)), std

示例5: _get_dask_meta_for_dataset

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def _get_dask_meta_for_dataset(
    ds_factory, table, columns, categoricals, dates_as_object
):
    """
    Calculate a schema suitable for the dask dataframe meta from the dataset.

    The schema is read from ``ds_factory.table_meta[table]`` and turned into
    an empty dataframe. Columns named in ``categoricals`` are cast to
    ``"category"`` and their known categories are cleared. Finally, any dtypes
    returned by ``_maybe_get_categoricals_from_index`` for ``table`` are
    applied on top.

    Returns the resulting empty dataframe.
    """
    table_schema = ds_factory.table_meta[table]
    meta = empty_dataframe_from_schema(
        table_schema, columns=columns, date_as_object=dates_as_object
    )

    if categoricals:
        meta = meta.astype({col: "category" for col in categoricals})
        meta = dd.utils.clear_known_categories(meta, categoricals)

    categoricals_from_index = _maybe_get_categoricals_from_index(
        ds_factory, {table: categoricals}
    )
    if categoricals_from_index:
        meta = meta.astype(categoricals_from_index[table])
    return meta

示例6: test_hash_bucket

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def test_hash_bucket(col, num_buckets=5):
    """Check that ``_hash_bucket`` buckets rows consistently by ``col``.

    Rows sharing a value in ``col`` must all carry one bucket id, and hashing
    a two-row sample must give the same rows as hashing the full frame.
    """
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    hashed = _hash_bucket(df, [col], num_buckets)
    assert (hashed.groupby(col).agg({_KTK_HASH_BUCKET: "nunique"}) == 1).all().all()

    # Check that hashing is consistent for small dataframe sizes (where df.col.nunique() < num_buckets)
    df_sample = df.iloc[[0, 7]]
    hashed_sample = _hash_bucket(df_sample, [col], num_buckets)
    expected = hashed.loc[df_sample.index]
    pdt.assert_frame_equal(expected, hashed_sample)

示例7: test_transform

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def test_transform(kind):
    """A ``ParallelPostFit``-wrapped PCA must transform like the bare PCA.

    ``kind`` selects the container handed to both estimators: ``"numpy"``
    passes X and y through ``dask.compute``, ``"dask.dataframe"`` converts
    them with ``dd.from_dask_array``, and any other value uses them exactly
    as ``make_classification`` returned them.
    """
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X, y = dd.from_dask_array(X), dd.from_dask_array(y)

    plain = PCA(random_state=0)
    plain.fit(X, y)

    wrapped = ParallelPostFit(PCA(random_state=0))
    wrapped.fit(X, y)

    # The estimator held by the wrapper should match the bare one after fit.
    assert_estimator_equal(wrapped.estimator, plain)

    # Bare-PCA output goes first, wrapper output second.
    assert_eq_ar(plain.transform(X), wrapped.transform(X))

示例8: test_transformed_shape

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def test_transformed_shape(self):
    """Check that each input container transforms to ``n_cols`` columns."""
    # checks if the transformed objects have the correct columns
    a = dpp.PolynomialFeatures()
    # the feature names only exist on a fitted estimator, so fit first
    a.fit(X)
    n_cols = len(a.get_feature_names())
    # dask array
    assert a.transform(X).shape[1] == n_cols
    # numpy array
    assert a.transform(X.compute()).shape[1] == n_cols
    # dask dataframe
    assert a.transform(df).shape[1] == n_cols
    # pandas dataframe
    assert a.transform(df.compute()).shape[1] == n_cols
    X_nan_rows = df.values
    df_none_divisions = X_nan_rows.to_dask_dataframe(columns=df.columns)
    # dask array with nan rows
    assert a.transform(X_nan_rows).shape[1] == n_cols
    # dask data frame with nan rows
    assert a.transform(df_none_divisions).shape[1] == n_cols

示例9: to_dask

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def to_dask(self):
    """Build a dask dataframe from one delayed ``_get_partition`` call per partition."""
    import dask
    import dask.dataframe as dd

    delayed_parts = [
        dask.delayed(self._get_partition)(index)
        for index in range(self.npartitions)
    ]
    return dd.from_delayed(delayed_parts)

示例10: _open_dataset

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def _open_dataset(self, urlpath):
    """Open dataset using dask and use pattern fields to set new columns.

    The dataframe returned by ``dask.dataframe.read_csv`` is stored on
    ``self._dataframe``; nothing is returned.

    Raises
    ------
    ValueError
        If a pattern is set and ``DASK_VERSION >= '0.19.0'`` is false.
    """
    import dask.dataframe

    if self.pattern is None:
        # No pattern: a plain read is all that is needed.
        self._dataframe = dask.dataframe.read_csv(
            urlpath, storage_options=self._storage_options,
            **self._csv_kwargs)
        return

    # NOTE(review): assumes DASK_VERSION is a version object whose comparison
    # with a str is version-aware; a plain str would compare
    # lexicographically -- TODO confirm.
    if not (DASK_VERSION >= '0.19.0'):
        raise ValueError("Your version of dask is '{}'. "
            "The ability to include filenames in read_csv output "
            "(``include_path_column``) was added in 0.19.0, so "
            "pattern urlpaths are not supported.".format(DASK_VERSION))

    # Drop the path column afterwards only if the caller did not ask for it.
    drop_path_column = 'include_path_column' not in self._csv_kwargs
    path_column = self._path_column()

    self._dataframe = dask.dataframe.read_csv(
        urlpath, storage_options=self._storage_options, **self._csv_kwargs)

    # add the new columns to the dataframe
    # NOTE(review): no statement follows the comment above in the code as
    # found; the step that derives the pattern columns from ``path_column``
    # appears to be missing and should be restored before relying on this
    # branch.

    if drop_path_column:
        self._dataframe = self._dataframe.drop([path_column], axis=1)

示例11: to_data_frame

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def to_data_frame(self):
    """
    Converts trajectory data to DataFrame format.

    Returns
    -------
    The object held in ``self._data``, returned as is (no copy is made).
        Represents the trajectory in DataFrame format.

    """
    return self._data

示例12: generate_weekend_features

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def generate_weekend_features(self):
    """Placeholder for creating or updating the weekend feature.

    Raises
    ------
    NotImplementedError
        Always; the method has no implementation yet.
    """
    message = 'To be implemented'
    raise NotImplementedError(message)

示例13: show_trajectories_info

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def show_trajectories_info(self):
    """Placeholder for showing dataset information from the dataframe.

    Raises
    ------
    NotImplementedError
        Always; the method has no implementation yet.
    """
    message = 'To be implemented'
    raise NotImplementedError(message)

示例14: min

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def min(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the minimum of the values for the requested axis.

    All four arguments are forwarded positionally to ``self._data.min``.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: optional, default False.
        Passed through unchanged to ``self._data.min``.
    out: optional, default None.
        Passed through unchanged to ``self._data.min``.

    Returns
    -------
    min: Series or DataFrame (if level specified)
        The minimum values for the request axis.

    """
    return self._data.min(axis, skipna, split_every, out)

示例15: max

# 需要导入模块: import dask [as 别名]
# 或者: from dask import dataframe [as 别名]
def max(self, axis=None, skipna=True, split_every=False, out=None):
    """
    Return the maximum of the values for the requested axis.

    All four arguments are forwarded positionally to ``self._data.max``.

    Parameters
    ----------
    axis: int, optional, default None, {index (0), columns (1)}.
        Axis for the function to be applied on.
    skipna: bool, optional, default True.
        Exclude NA/null values when computing the result.
    split_every: optional, default False.
        Passed through unchanged to ``self._data.max``.
    out: optional, default None.
        Passed through unchanged to ``self._data.max``.

    Returns
    -------
    max: Series or DataFrame (if level specified)
        The maximum values for the request axis.

    """
    return self._data.max(axis, skipna, split_every, out)
