This article collects typical usage examples of the Python method dask.dataframe.read_parquet. If you are asking yourself what dataframe.read_parquet does and how it is used in practice, the hand-picked code samples below may help. You can also explore further usage examples of the dask.dataframe module itself.
The following shows 9 code examples of dataframe.read_parquet, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
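Before the examples, here is a minimal sketch of the method's basic usage; the file path, column names, and engine below are illustrative placeholders, not taken from the examples that follow:

import dask.dataframe as dd

# Lazily read a (possibly partitioned) Parquet dataset into a Dask DataFrame.
df = dd.read_parquet(
    "data/events.parquet",               # illustrative path
    columns=["user_id", "amount"],       # read only the columns you need
    engine="pyarrow",                    # or "fastparquet"
)
print(df.npartitions)                    # reading is lazy; no data is loaded yet
total = df.groupby("user_id").amount.sum().compute()  # compute() triggers actual I/O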
Example 1: main

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example 2: test_formats

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def test_formats(cleanup):
    def helper(data, TaskClass, format=None):
        class TestTask(TaskClass):
            def run(self):
                self.save(data)

        TestTask().run()
        if format == 'pd':
            assert TestTask().output().load().equals(data)
        else:
            assert TestTask().output().load() == data

    helper(df, d6tflow.tasks.TaskCachePandas, 'pd')
    helper({'test': 1}, d6tflow.tasks.TaskJson)
    helper({'test': 1}, d6tflow.tasks.TaskPickle)

    from d6tflow.tasks.h5 import TaskH5Pandas
    helper(df, TaskH5Pandas, 'pd')

    try:
        from d6tflow.tasks.dt import TaskDatatable
        import datatable as dt
        dt_frame = dt.Frame(df)  # avoid shadowing the datatable module
        helper(dt_frame, TaskDatatable)
    except Exception:
        warnings.warn('datatable failed')

    if 0 == 1:  # todo: dask round-trip currently disabled
        import dask.dataframe as dd
        t1 = Task1()
        t1.run()
        ddf = dd.read_parquet(t1.output().path)
        from d6tflow.tasks.dask import TaskPqDask
        helper(ddf, TaskPqDask, 'pd')
        t1.invalidate(confirm=False)
Example 3: __init__

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def __init__(self, item, datastore, collection,
             snapshot=None, filters=None, columns=None,
             engine="fastparquet"):
    self.engine = engine
    self.datastore = datastore
    self.collection = collection
    self.snapshot = snapshot
    self.item = item

    self._path = utils.make_path(datastore, collection, item)
    if not self._path.exists():
        raise ValueError(
            "Item `%s` doesn't exist. "
            "Create it using collection.write(`%s`, data, ...)" % (
                item, item))

    if snapshot:
        snap_path = utils.make_path(
            datastore, collection, "_snapshots", snapshot)
        self._path = utils.make_path(snap_path, item)

        if not utils.path_exists(snap_path):
            raise ValueError("Snapshot `%s` doesn't exist" % snapshot)

        if not utils.path_exists(self._path):
            raise ValueError(
                "Item `%s` doesn't exist in this snapshot" % item)

    self.metadata = utils.read_metadata(self._path)
    self.data = dd.read_parquet(
        self._path, engine=self.engine, filters=filters, columns=columns)
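The filters and columns arguments here are forwarded straight to dd.read_parquet, so callers can push predicate filtering and column projection down to the Parquet reader. A hedged usage sketch (path, column names, and filter values are made up for illustration):

import dask.dataframe as dd

# Only row groups matching the predicate and only the listed columns are read.
df = dd.read_parquet(
    "datastore/collection/item",         # illustrative path
    engine="fastparquet",
    filters=[("symbol", "==", "AAPL")],  # list of (column, op, value) tuples
    columns=["open", "close", "volume"],
)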
Example 4: index

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def index(self, item, last=False):
    data = dd.read_parquet(self._item_path(item, as_string=True),
                           columns="index", engine=self.engine)
    if not last:
        return data.index.compute()

    return float(str(data.index).split(
        "\nName")[0].split("\n")[-1].split(" ")[0])
Example 5: test_copy_dask_to_dir

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow):
    destination = tmp_path / "output"
    destination.mkdir()
    expected_dir_path = destination / "dask_df.pq.dask"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination)

    actual = dd.read_parquet(expected_dir_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
Example 6: test_copy_dask_to_gcs_dir

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def test_copy_dask_to_gcs_dir(
    tmp_path, tmp_gcs_url_prefix, expected_dask_df, dask_flow
):
    cloud_url = tmp_gcs_url_prefix + "output"
    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    check_call(f"gsutil -m cp -r {cloud_url} {tmp_path}", shell=True)

    actual = dd.read_parquet(tmp_path / "output")
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
Example 7: process_dataframe

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output)
Example 8: main

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model()
Example 9: append

# Required import: from dask import dataframe [as alias]
# or alternatively: from dask.dataframe import read_parquet [as alias]
def append(self, item, data, npartitions=None, epochdate=False,
           threaded=False, reload_items=False, **kwargs):
    if not utils.path_exists(self._item_path(item)):
        raise ValueError(
            """Item does not exist. Use `<collection>.write(...)`""")

    # work on a copy
    data = data.copy()

    try:
        if epochdate or ("datetime" in str(data.index.dtype) and
                         any(data.index.nanosecond > 0)):
            data = utils.datetime_to_int64(data)
        old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                    columns=[], engine=self.engine
                                    ).index.compute()
        data = data[~data.index.isin(old_index)]
    except Exception:
        return

    if data.empty:
        return

    if data.index.name == "":
        data.index.name = "index"

    # combine old dataframe with new
    current = self.item(item)
    new = dd.from_pandas(data, npartitions=1)

    combined = dd.concat([current.data, new]).drop_duplicates(keep="last")

    if npartitions is None:
        memusage = combined.memory_usage(deep=True).sum()
        if isinstance(combined, dd.DataFrame):
            memusage = memusage.compute()
        npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)

    # write data
    write = self.write_threaded if threaded else self.write
    write(item, combined, npartitions=npartitions, chunksize=None,
          metadata=current.metadata, overwrite=True,
          epochdate=epochdate, reload_items=reload_items, **kwargs)
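The npartitions heuristic above sizes the rewritten item by memory footprint: at least one partition, plus one more for every full DEFAULT_PARTITION_SIZE bytes of memory used. A tiny worked example with a hypothetical 100 MB partition size:

DEFAULT_PARTITION_SIZE = 100 * 1024 ** 2   # hypothetical: 100 MB per partition

memusage = 250 * 1024 ** 2                 # combined frame uses ~250 MB in memory
npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)
print(npartitions)                         # 3 -> data is rewritten as 3 partitions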