

Python dataframe.DataFrame Method Code Examples

This article collects typical usage examples of the dask.dataframe.DataFrame method in Python. If you are wondering what dataframe.DataFrame does, how to use it, or want to see it in real code, the curated examples below may help. You can also explore further usage examples from the containing module, dask.dataframe.


The following 15 code examples of the dataframe.DataFrame method are shown below, sorted by popularity by default.
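
To ground the examples, here is a minimal sketch (not drawn from any of the projects below) of how a dask.dataframe.DataFrame is usually constructed from pandas; the column names and partition count are illustrative.

import pandas as pd
import dask.dataframe as dd

# Partition a small pandas frame into a two-partition dask DataFrame.
pdf = pd.DataFrame({"x": range(8), "y": list("aabbccdd")})
ddf = dd.from_pandas(pdf, npartitions=2)

# dask operations are lazy; .compute() materializes a pandas result.
print(isinstance(ddf, dd.DataFrame))  # True
print(ddf.x.sum().compute())          # 28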

Example 1: extract_dask_data

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def extract_dask_data(data):
  """Extract data from dask.Series or dask.DataFrame for predictors.

  Given a distributed dask.DataFrame or dask.Series containing columns or names
  for one or more predictors, this operation returns a single dask.DataFrame or
  dask.Series that can be iterated over.

  Args:
    data: A distributed dask.DataFrame or dask.Series.

  Returns:
    A dask.DataFrame or dask.Series that can be iterated over.
    If the supplied argument is neither a dask.DataFrame nor a dask.Series this
    operation returns it without modification.
  """
  if isinstance(data, allowed_classes):
    return _construct_dask_df_with_divisions(data)
  else:
    return data 
Author: ryfeus, Project: lambda-packs, Lines: 21, Source: dask_io.py

Example 2: _load_table

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
        """ Load a data frame from table formats: csv, hdf5, feather """
        if fmt == 'csv':
            _data = pd.read_csv(src, *args, **kwargs)
        elif fmt == 'feather':
            _data = feather.read_dataframe(src, *args, **kwargs)
        elif fmt == 'hdf5':
            _data = pd.read_hdf(src, *args, **kwargs)
        else:
            raise ValueError('Unknown format %s' % fmt)

        # Put into this batch only part of it (defined by index)
        if isinstance(_data, pd.DataFrame):
            _data = _data.loc[self.indices]
        elif isinstance(_data, dd.DataFrame):
            # dask.DataFrame.loc supports advanced indexing only with lists
            _data = _data.loc[list(self.indices)].compute()

        if callable(post):
            _data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)

        self.load(src=_data, dst=dst) 
Author: analysiscenter, Project: batchflow, Lines: 22, Source: batch.py
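
The detail worth noting above: unlike pandas, dask's .loc supports advanced (fancy) label indexing only with a plain list, which is why self.indices is wrapped in list(). A minimal sketch, assuming a sorted string index:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(6)}, index=list("uvwxyz"))
ddf = dd.from_pandas(pdf, npartitions=2)

indices = ("u", "w", "y")          # e.g. batch indices held as a tuple
subset = ddf.loc[list(indices)]    # dask requires a plain list here
print(subset.compute())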

Example 3: groupby_pandas

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def groupby_pandas(self_or_cls, ndmapping, dimensions, container_type,
                       group_type, sort=False, **kwargs):
        if 'kdims' in kwargs:
            idims = [ndmapping.get_dimension(d) for d in kwargs['kdims']]
        else:
            idims = [dim for dim in ndmapping.kdims if dim not in dimensions]

        all_dims = [d.name for d in ndmapping.kdims]
        inds = [ndmapping.get_dimension_index(dim) for dim in idims]
        getter = operator.itemgetter(*inds) if inds else lambda x: tuple()

        multi_index = pd.MultiIndex.from_tuples(ndmapping.keys(), names=all_dims)
        df = pd.DataFrame(list(map(wrap_tuple, ndmapping.values())), index=multi_index)

        # TODO: Look at sort here
        kwargs = dict(dict(get_param_values(ndmapping), kdims=idims), sort=sort, **kwargs)
        groups = ((wrap_tuple(k), group_type(OrderedDict(unpack_group(group, getter)), **kwargs))
                   for k, group in df.groupby(level=[d.name for d in dimensions], sort=sort))

        if sort:
            selects = list(get_unique_keys(ndmapping, dimensions))
            groups = sorted(groups, key=lambda x: selects.index(x[0]))

        return container_type(groups, kdims=dimensions, sort=sort) 
Author: holoviz, Project: holoviews, Lines: 26, Source: util.py

Example 4: _get_group_info

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _get_group_info(path, grouppath, keys):
    with h5py.File(path, "r") as f:
        grp = f[grouppath]

        if keys is None:
            keys = list(grp.keys())

        nrows = len(grp[keys[0]])

        categoricals = {}
        for key in keys:
            dt = h5py.check_dtype(enum=grp[key].dtype)
            if dt is not None:
                categoricals[key] = sorted(dt, key=dt.__getitem__)

        # Meta is an empty dataframe that serves as a compound "dtype"
        meta = pd.DataFrame(
            {key: np.array([], dtype=grp[key].dtype) for key in keys}, columns=keys
        )

        for key in categoricals:
            meta[key] = pd.Categorical([], categories=categoricals[key], ordered=True)

    return nrows, keys, meta, categoricals 
Author: mirnylab, Project: cooler, Lines: 26, Source: dask.py
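
The empty meta frame built above is dask's standard way of declaring a partition schema before any data is read. A hedged sketch of how such a meta is typically consumed, e.g. via dd.from_delayed (load_chunk is a hypothetical loader):

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask import delayed

# An empty frame acting as a compound "dtype" for each future partition.
meta = pd.DataFrame({"chrom": pd.Series([], dtype="O"),
                     "start": np.array([], dtype="i8")})

def load_chunk(i):  # hypothetical partition loader
    return pd.DataFrame({"chrom": ["chr1"], "start": [i * 1000]})

parts = [delayed(load_chunk)(i) for i in range(4)]
ddf = dd.from_delayed(parts, meta=meta)  # schema known without computing
print(ddf.dtypes)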

Example 5: pack_payload_pandas

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def pack_payload_pandas(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import serialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        res = partition[group_key]
        res[_PAYLOAD_COL] = b""
    else:
        res = partition.groupby(
            group_key,
            sort=False,
            observed=True,
            # Keep the as_index s.t. the group values are not dropped. With this
            # the behaviour seems to be consistent across pandas versions
            as_index=True,
        ).apply(lambda x: pd.Series({_PAYLOAD_COL: serialize_bytes(x)}))

        res = res.reset_index()
    return res 
Author: JDASoftwareGroup, Project: kartothek, Lines: 27, Source: _update.py

Example 6: unpack_payload_pandas

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def unpack_payload_pandas(
    partition: pd.DataFrame, unpack_meta: pd.DataFrame
) -> pd.DataFrame:
    """
    Revert ``pack_payload_pandas`` and restore packed payload

    unpack_meta:
        A dataframe indicating the schema of the unpacked payload.
    """
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import deserialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        return unpack_meta.iloc[:0]

    mapped = partition[_PAYLOAD_COL].map(deserialize_bytes)

    return pd.concat(mapped.values, copy=False, ignore_index=True) 
Author: JDASoftwareGroup, Project: kartothek, Lines: 26, Source: _update.py

Example 7: unpack_payload

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def unpack_payload(df: dd.DataFrame, unpack_meta: pd.DataFrame) -> dd.DataFrame:
    """Revert payload packing of ``pack_payload`` and restores full dataframe."""

    if (
        # https://github.com/pandas-dev/pandas/issues/34455
        isinstance(df._meta.index, pd.Float64Index)
        # TODO: Try to find out what's going on and file a bug report
        # For datetime indices the apply seems to be corrupt
        # s.t. apply(lambda x:x) returns different values
        or isinstance(df._meta.index, pd.DatetimeIndex)
    ):
        return df

    return df.map_partitions(
        unpack_payload_pandas, unpack_meta=unpack_meta, meta=unpack_meta
    ) 
Author: JDASoftwareGroup, Project: kartothek, Lines: 18, Source: _update.py
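
Examples 5-7 together implement a pack/shuffle/unpack round trip: each partition is collapsed to one serialized payload row per group, shuffled cheaply, then restored. A hedged usage sketch; it assumes distributed is installed and that a pack_payload counterpart maps pack_payload_pandas over the partitions (only unpack_payload is shown in this article):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"bucket": [0, 0, 1, 1], "value": range(4)})
ddf = dd.from_pandas(pdf, npartitions=2)

# Collapse each partition to one payload row per bucket, shuffle the
# much smaller frame, then restore the original schema.
packed = ddf.map_partitions(pack_payload_pandas, group_key=["bucket"])
shuffled = packed.shuffle(on="bucket")
restored = unpack_payload(shuffled, unpack_meta=ddf._meta)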

Example 8: to_indexable

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.
    """
    if kwargs.get("allow_scalars", False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, (da.Array, dd.DataFrame)):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x) 
Author: dask, Project: dask-ml, Lines: 24, Source: utils.py
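
Since to_indexable is a generator, callers unpack its results positionally. A minimal usage sketch:

import numpy as np
import dask.array as da

X = da.ones((10, 3), chunks=5)   # dask collections pass through lazily
y = np.arange(10)                # non-dask input is converted immediately

X_idx, y_idx = to_indexable(X, y)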

Example 9: to_keys

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def to_keys(dsk, *args):
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, (da.Array, dd.DataFrame)):
            x = delayed(x)
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = type(x).__name__ + "-" + tokenize(x)
            dsk[key] = x
            yield key 
Author: dask, Project: dask-ml, Lines: 18, Source: utils.py

Example 10: inverse_transform

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def inverse_transform(
        self,
        X: Union[ArrayLike, DataFrameType],
        y: Optional[Union[ArrayLike, SeriesType]] = None,
        copy: Optional[bool] = None,
    ) -> Union[ArrayLike, DataFrameType]:
        if not hasattr(self, "scale_"):
            raise Exception(
                "This %(name)s instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before "
                "using this method." % {"name": type(self).__name__}
            )
        X = X.copy()
        if isinstance(X, dd.DataFrame):
            X = X.sub(self.min_)
            X = X.div(self.scale_)
        else:
            X -= self.min_
            X /= self.scale_

        return X 
Author: dask, Project: dask-ml, Lines: 23, Source: data.py
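
This mirrors scikit-learn's MinMaxScaler, where transform computes X * scale_ + min_ and the inverse recovers (X - min_) / scale_; the dask branch uses .sub/.div because in-place operators are not supported on dask DataFrames. A hedged round-trip sketch with dask-ml:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler

ddf = dd.from_pandas(pd.DataFrame({"x": [1.0, 5.0, 9.0]}), npartitions=1)

scaler = MinMaxScaler()
scaled = scaler.fit_transform(ddf)
restored = scaler.inverse_transform(scaled)
print(restored.compute())  # recovers 1.0, 5.0, 9.0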

Example 11: _check_inputs

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _check_inputs(
        self,
        X: Union[ArrayLike, DataFrameType],
        accept_sparse_negative: bool = False,
        copy: bool = False,
        in_fit: bool = True,
    ) -> Union[ArrayLike, DataFrameType]:
        if isinstance(X, (pd.DataFrame, dd.DataFrame)):
            X = X.values
        if isinstance(X, np.ndarray):
            C = len(X) // min(multiprocessing.cpu_count(), 2)
            X = da.from_array(X, chunks=C)

        rng = check_random_state(self.random_state)
        # TODO: non-float dtypes?
        # TODO: sparse arrays?
        # TODO: mix of sparse, dense?
        sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
        super(QuantileTransformer, self)._check_inputs(
            sample,
            accept_sparse_negative=accept_sparse_negative,
            copy=copy,
            in_fit=in_fit,
        )
        return X 
Author: dask, Project: dask-ml, Lines: 27, Source: data.py

Example 12: _construct_dask_df_with_divisions

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _construct_dask_df_with_divisions(df):
  """Construct the new task graph and make a new dask.dataframe around it."""
  divisions = _get_divisions(df)
  # pylint: disable=protected-access
  name = 'csv-index' + df._name
  dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
         for i in range(df.npartitions)}
  # pylint: enable=protected-access
  from toolz import merge  # pylint: disable=g-import-not-at-top
  if isinstance(df, dd.DataFrame):
    return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
  elif isinstance(df, dd.Series):
    return dd.Series(merge(dsk, df.dask), name, df.name, divisions) 
Author: ryfeus, Project: lambda-packs, Lines: 15, Source: dask_io.py

Example 13: extract_dask_labels

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def extract_dask_labels(labels):
  """Extract data from dask.Series or dask.DataFrame for labels.

  Given a distributed dask.DataFrame or dask.Series containing exactly one
  column or name, this operation returns a single dask.DataFrame or dask.Series
  that can be iterated over.

  Args:
    labels: A distributed dask.DataFrame or dask.Series with exactly one
            column or name.

  Returns:
    A dask.DataFrame or dask.Series that can be iterated over.
    If the supplied argument is neither a dask.DataFrame nor a dask.Series this
    operation returns it without modification.

  Raises:
    ValueError: If the supplied dask.DataFrame contains more than one
                column or the supplied dask.Series contains more than
                one name.
  """
  if isinstance(labels, dd.DataFrame):
    ncol = labels.columns
  elif isinstance(labels, dd.Series):
    ncol = labels.name
  if isinstance(labels, allowed_classes):
    if len(ncol) > 1:
      raise ValueError('Only one column for labels is allowed.')
    return _construct_dask_df_with_divisions(labels)
  else:
    return labels 
Author: ryfeus, Project: lambda-packs, Lines: 33, Source: dask_io.py

Example 14: _access

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _access(data, iloc):
  """Accesses an element from collection, using integer location based indexing.

  Args:
    data: array-like. The collection to access
    iloc: `int` or `list` of `int`s. Location(s) to access in `collection`

  Returns:
    The element of `a` found at location(s) `iloc`.
  """
  if HAS_PANDAS:
    import pandas as pd  # pylint: disable=g-import-not-at-top
    if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame):
      return data.iloc[iloc]
  return data[iloc] 
Author: ryfeus, Project: lambda-packs, Lines: 17, Source: data_feeder.py

Example 15: _dump_table

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import DataFrame [as alias]
def _dump_table(self, dst, fmt='feather', components=None, *args, **kwargs):
        """ Save batch data to table formats

        Args:
          dst: str - a path to dump into
          fmt: str - format: feather, hdf5, csv
          components: str or tuple - one or several component names
        """
        filename = dst

        components = tuple(components or self.components)
        data_dict = {}
        for comp in components:
            comp_data = self.get(component=comp)
            if isinstance(comp_data, pd.DataFrame):
                data_dict.update(comp_data.to_dict('series'))
            elif isinstance(comp_data, np.ndarray):
                if comp_data.ndim > 1:
                    columns = [comp + str(i) for i in range(comp_data.shape[1])]
                    comp_dict = zip(columns, (comp_data[:, i] for i in range(comp_data.shape[1])))
                    data_dict.update(comp_dict)  # one entry per 2-D array column
                else:
                    data_dict.update({comp: comp_data})
            else:
                data_dict.update({comp: comp_data})
        _data = pd.DataFrame(data_dict)

        if fmt == 'feather':
            feather.write_dataframe(_data, filename, *args, **kwargs)
        elif fmt == 'hdf5':
            _data.to_hdf(filename, *args, **kwargs)   # pylint:disable=no-member
        elif fmt == 'csv':
            _data.to_csv(filename, *args, **kwargs)   # pylint:disable=no-member
        else:
            raise ValueError('Unknown format %s' % fmt)

        return self 
Author: analysiscenter, Project: batchflow, Lines: 39, Source: batch.py
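
Examples 2 and 15 form a load/dump pair over the same three formats. A minimal feather round trip with the feather-format package used above (the file name is illustrative):

import pandas as pd
import feather

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
feather.write_dataframe(df, "batch.feather")
roundtrip = feather.read_dataframe("batch.feather")
assert roundtrip.equals(df)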


Note: The dask.dataframe.DataFrame examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.