This article collects typical usage examples of the dask.dataframe.from_pandas method in Python. If you have been wondering what dataframe.from_pandas does, how to call it, or what working code using it looks like, the curated examples below should help. You can also explore further usage examples from the containing module, dask.dataframe.
The following section presents 15 code examples of dataframe.from_pandas, sorted by popularity by default. But first, a minimal sketch of the method itself.
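As a warm-up, here is a minimal, self-contained sketch of what dd.from_pandas does (the column names and data are illustrative, not from any example below): it splits an in-memory pandas object into a fixed number of partitions and returns a lazy dask DataFrame.

import pandas as pd
import dask.dataframe as dd

# An ordinary in-memory pandas DataFrame (illustrative data).
df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})

# Split it into 2 partitions; every operation on ddf is lazy.
ddf = dd.from_pandas(df, npartitions=2)

# Nothing runs until .compute(), which returns a pandas object.
result = ddf.groupby("y").x.sum().compute()
print(result)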
Example 1: test_lf_applier_pandas_spacy_preprocessor_memoized
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
    spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
    spacy.memoize = True

    @labeling_function(pre=[spacy])
    def first_is_name(x: DataPoint) -> int:
        return 0 if x.doc[0].pos_ == "PROPN" else -1

    @labeling_function(pre=[spacy])
    def has_verb(x: DataPoint) -> int:
        return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

    df = pd.DataFrame(dict(text=TEXT_DATA))
    df = dd.from_pandas(df, npartitions=2)
    applier = DaskLFApplier([first_is_name, has_verb])
    L = applier.apply(df)
    np.testing.assert_equal(L, L_TEXT_EXPECTED)
Example 2: test_to_holomap_dask
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_to_holomap_dask(self):
    if dd is None:
        raise SkipTest("Dask required to test .to with dask dataframe.")
    ddf = dd.from_pandas(self.df, npartitions=2)
    dds = Dataset(
        ddf,
        kdims=[
            Dimension('a', label="The a Column"),
            Dimension('b', label="The b Column"),
            Dimension('c', label="The c Column"),
            Dimension('d', label="The d Column"),
        ]
    )
    curve_hmap = dds.to(Curve, 'a', 'b', groupby=['c'])

    # Check HoloMap element datasets
    for v in self.df.c.drop_duplicates():
        curve = curve_hmap.data[(v,)]
        self.assertEqual(curve.dataset, self.ds)

    # Execute pipeline
    self.assertEqual(curve.pipeline(curve.dataset), curve)
Example 3: mock_dask_fit_data
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def mock_dask_fit_data(
    periods=DEF_N,
    start_date=None,
    ids=[0],
    embedding_dim=DEF_EMB_DIM,
    seq_length=DEF_SEQ_LENGTH,
):
    """Create example fit data as a dask DataFrame.

    The DataFrame is partitioned by ID.
    """
    df = mock_fit_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
        embedding_dim=embedding_dim,
        seq_length=seq_length,
    )
    # chunksize=periods puts each id's block of `periods` rows in its own partition
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf
Example 4: mock_dask_raw_data
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def mock_dask_raw_data(
    periods=DEF_N,
    start_date=None,
    ids=[0],
):
    """Create example raw data as a dask DataFrame.

    The DataFrame is partitioned by ID.
    """
    df = mock_raw_data(
        periods=periods,
        start_date=start_date,
        ids=ids,
    )
    ddf = dd.from_pandas(df, chunksize=periods)
    return ddf
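Examples 3 and 4 pass chunksize= rather than npartitions= to dd.from_pandas. A minimal sketch of the difference, with illustrative data: chunksize fixes the number of rows per partition, while npartitions fixes the partition count and lets dask choose the boundaries.

import pandas as pd
import dask.dataframe as dd

# 12 rows of illustrative data: 3 ids with 4 "periods" each.
df = pd.DataFrame({
    "id": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
    "value": range(12),
})

# chunksize=4: at most 4 rows per partition. Because the mock data is
# laid out as `periods` consecutive rows per id, this aligns partition
# boundaries with ids -- the "partitioned by ID" layout the docstrings
# above describe.
by_rows = dd.from_pandas(df, chunksize=4)
print(by_rows.npartitions)  # 3

# npartitions=2: dask picks the boundaries itself, so one id's rows
# may straddle two partitions.
by_count = dd.from_pandas(df, npartitions=2)
print(by_count.npartitions)  # 2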
Example 5: test_typed_dask_dataframe
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_typed_dask_dataframe(builder):
    df_value = pd.DataFrame()
    df_value["int"] = [1, 2, 3]
    df_value["float"] = [1.0, 1.5, float("nan")]
    df_value["str"] = ["red", "blue", None]
    df_value["time"] = pd.to_datetime(["2011-02-07", "2011-03-17", "2011-04-27"])
    dask_df = dd.from_pandas(df_value, npartitions=1)

    @builder
    @bn.protocol.dask
    def df():
        return dask_df

    assert equal_frame_and_index_content(
        builder.build().get("df").compute(), dask_df.compute()
    )
    assert (
        builder.build().get("df").compute().dtypes.to_dict()
        == dask_df.compute().dtypes.to_dict()
    )
Example 6: test_incremental_text_pipeline
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_incremental_text_pipeline(container):
    X = pd.Series(["a list", "of words", "for classification"] * 100)
    X = dd.from_pandas(X, npartitions=3)
    if container == "bag":
        X = X.to_bag()
    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)
    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]

    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)

    pipe.fit(X, y, incremental__classes=[0, 1])
    X2 = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")
    X2.compute_chunk_sizes()
    assert X2.shape == (300, vect.n_features)
Example 7: test_grid_search_dask_dataframe
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_grid_search_dask_dataframe():
    iris = load_iris()
    X = iris.data
    y = iris.target

    df = pd.DataFrame(X)
    ddf = dd.from_pandas(df, 2)

    dy = pd.Series(y)
    ddy = dd.from_pandas(dy, 2)

    clf = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=200)
    param_grid = {"C": [0.1, 1, 10]}

    gs = GridSearchCV(clf, param_grid, cv=5)
    dgs = dcv.GridSearchCV(clf, param_grid, cv=5)
    gs.fit(df, dy)
    dgs.fit(ddf, ddy)
    assert gs.best_params_ == dgs.best_params_
Example 8: test_frame_strategies
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_frame_strategies(daskify, strategy):
    df = pd.DataFrame({"A": [1, 1, np.nan, np.nan, 2, 2]})
    if daskify:
        df = dd.from_pandas(df, 2)
    if strategy == "constant":
        fill_value = 2
    else:
        fill_value = None

    b = dask_ml.impute.SimpleImputer(strategy=strategy, fill_value=fill_value)
    b.fit(df)
    if not daskify and strategy == "median":
        expected = pd.Series([1.5], index=["A"])
    else:
        expected = pd.Series([2], index=["A"])
    tm.assert_series_equal(b.statistics_, expected, check_dtype=False)
Example 9: test_block_transform_multiply
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_block_transform_multiply(self, daskify, validation, factor):
    X = np.arange(100).reshape((25, 4))
    df = pd.DataFrame(X).rename(columns=str)

    if daskify:
        X = da.from_array(X, chunks=(5, 4))
        df = dd.from_pandas(df, npartitions=2)

    if factor:
        bt = BlockTransformer(multiply, validate=validation, factor=factor)
    else:
        bt = BlockTransformer(multiply, validate=validation)

    if daskify:
        assert dask.is_dask_collection(bt.transform(X))
        assert dask.is_dask_collection(bt.transform(df))

    if factor:
        da.utils.assert_eq(bt.transform(X), multiply(X, factor=factor))
        dd.utils.assert_eq(bt.transform(df), multiply(df, factor=factor))
    else:
        da.utils.assert_eq(bt.transform(X), multiply(X))
        dd.utils.assert_eq(bt.transform(df), multiply(df))
Example 10: test_validate
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_validate(self, mocker, daskify, validate):
    X = np.arange(100).reshape((25, 4))
    df = pd.DataFrame(X).rename(columns=str)

    if daskify:
        X = da.from_array(X, chunks=(5, 4))
        df = dd.from_pandas(df, npartitions=2)

    m = mocker.patch("dask_ml.preprocessing._block_transformer.check_array")
    bt = BlockTransformer(lambda x: x, validate=validate)

    if validate:
        _ = bt.transform(X)
        m.assert_called_once()
        m.reset_mock()
        _ = bt.transform(df)
        m.assert_called_once()
    else:
        _ = bt.transform(X)
        m.assert_not_called()
        _ = bt.transform(df)
        m.assert_not_called()
Example 11: test_inverse_transform
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_inverse_transform(self):
    enc = dpp.OrdinalEncoder()
    df = dd.from_pandas(
        pd.DataFrame(
            {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
        ),
        npartitions=2,
    )
    enc.fit(df)
    # Round-tripping through transform/inverse_transform stays lazy...
    assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df).values))
    assert dask.is_dask_collection(enc.inverse_transform(enc.transform(df)))
    # ...and recovers the original frame, whether fed a dask DataFrame
    # or the underlying array.
    assert_eq_df(df, enc.inverse_transform(enc.transform(df)))
    assert_eq_df(df, enc.inverse_transform(enc.transform(df).values))
Example 12: test_categorical
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_categorical(self, categories, transformed, daskify, ordered):
    cat = pd.Series(
        ["a", "b", "a"],
        dtype=pd.api.types.CategoricalDtype(categories=categories, ordered=ordered),
    )
    if daskify:
        cat = dd.from_pandas(cat, npartitions=2)
        transformed = da.from_array(transformed, chunks=(2, 1))
        if daskify == "unknown":
            cat = cat.cat.as_unknown()

    a = dpp.LabelEncoder().fit(cat)

    if daskify != "unknown":
        assert a.dtype_ == cat.dtype

    np.testing.assert_array_equal(a.classes_, categories)
    result = a.transform(cat)
    da.utils.assert_eq(result, transformed)

    inv_transformed = a.inverse_transform(result)
    if daskify:
        # manually set the divisions for the test
        inv_transformed.divisions = (0, 2)
    dd.utils.assert_eq(inv_transformed, cat)
Example 13: test_use_categorical
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def test_use_categorical(self, daskify):
    data = pd.Series(
        ["b", "c"], dtype=pd.api.types.CategoricalDtype(["c", "a", "b"])
    )
    if daskify:
        data = dd.from_pandas(data, npartitions=2)

    a = dpp.LabelEncoder(use_categorical=False).fit(data)
    b = spp.LabelEncoder().fit(data)
    assert_estimator_equal(a, b, exclude={"dtype_"})
    assert a.dtype_ is None

    da.utils.assert_eq(a.transform(data), b.transform(data))
    a_trn = a.transform(data)
    b_trn = b.transform(data)
    da.utils.assert_eq(a_trn, b_trn)
    da.utils.assert_eq(a.inverse_transform(a_trn), b.inverse_transform(b_trn))
Example 14: __init__
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def __init__(
    self,
    pandas_obj,
    npartitions=None,
    dask_threshold=1,
    scheduler="processes",
    progress_bar=True,
    progress_bar_desc=None,
    allow_dask_on_strings=False,
):
    super(Transformation, self).__init__(
        pandas_obj,
        npartitions,
        dask_threshold,
        scheduler,
        progress_bar,
        progress_bar_desc,
        allow_dask_on_strings,
    )
    self._sample_pd = pandas_obj.iloc[: self._SAMPLE_SIZE]
    self._obj_pd = pandas_obj
    self._obj_dd = dd.from_pandas(pandas_obj, npartitions=npartitions)
    self._nrows = pandas_obj.shape[0]
Example 15: _dask_apply
# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import from_pandas [as alias]
def _dask_apply(self, func, *args, **kwds):
    try:
        # Check that the dask rolling apply matches the pandas apply
        with suppress_stdout_stderr():
            tmp_df = (
                dd.from_pandas(self._comparison_pd, npartitions=self._npartitions)
                .rolling(
                    **{
                        k: v
                        for k, v in self._rolling_kwds.items()
                        if k not in ["on", "closed"]
                    }
                )
                .apply(func, *args, **kwds)
                .compute(scheduler=self._scheduler)
            )
        self._validate_apply(
            tmp_df.equals(
                self._comparison_pd.rolling(**self._rolling_kwds).apply(func, *args, **kwds)
            ),
            error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
        )
        if self._progress_bar:
            with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
        else:
            return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)
    except ERRORS_TO_HANDLE:
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            return self._obj_pd.progress_apply(func, *args, **kwds)
        else:
            return self._obj_pd.apply(func, *args, **kwds)
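Example 15 (from swifter) illustrates a defensive pattern worth isolating: validate the dask computation against pandas on a small sample, and fall back to plain pandas if dask fails. Below is a stripped-down sketch of that idea; the function name apply_with_fallback, the sample size, and the concrete exception tuple are illustrative stand-ins, not swifter's actual API or its ERRORS_TO_HANDLE.

import pandas as pd
import dask.dataframe as dd

def apply_with_fallback(df, func, npartitions=4, scheduler="processes"):
    """Apply func row-wise via dask; fall back to pandas on failure.

    A simplified sketch of the pattern in Example 15, not swifter's
    actual implementation.
    """
    try:
        # Validate on a small sample first: a cheap way to catch
        # functions that dask's apply cannot reproduce faithfully.
        sample = df.iloc[:100]
        expected = sample.apply(func, axis=1)
        got = (
            dd.from_pandas(sample, npartitions=npartitions)
            .apply(func, axis=1, meta=expected)
            .compute(scheduler=scheduler)
        )
        if not got.equals(expected):
            raise ValueError("dask apply sample does not match pandas")
        # Sample agreed: run the full computation through dask.
        return (
            dd.from_pandas(df, npartitions=npartitions)
            .apply(func, axis=1, meta=expected)
            .compute(scheduler=scheduler)
        )
    except (ValueError, TypeError, AttributeError):
        # Illustrative stand-in for ERRORS_TO_HANDLE: fall back to
        # plain pandas, as Example 15 does.
        return df.apply(func, axis=1)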