

Python dataframe.read_parquet Method Code Examples

This article collects typical usage examples of the dask.dataframe.read_parquet method in Python. If you are wondering how dataframe.read_parquet works, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples from the dask.dataframe module, where this method lives.


The following shows 9 code examples of the dataframe.read_parquet method, sorted by popularity by default.
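
Before diving into the examples, here is a minimal, self-contained sketch of calling dask.dataframe.read_parquet directly. The file path and column name are hypothetical placeholders and do not come from any of the projects below.

import dask.dataframe as dd

# Lazily open a Parquet dataset (a single file or a directory of part files).
# "data/events.parquet" is a placeholder path used only for illustration.
df = dd.read_parquet("data/events.parquet")

# Nothing is read from disk until a computation is triggered.
print(df.npartitions)
counts = df.groupby("user_id").size().compute()  # "user_id" is a hypothetical column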

Example 1: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}") 
Author: snorkel-team, Project: snorkel-tutorials, Lines of code: 26, Source file: drybell_dask.py

Example 2: test_formats

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def test_formats(cleanup):
    def helper(data, TaskClass, format=None):
        class TestTask(TaskClass):
            def run(self):
                self.save(data)

        TestTask().run()
        if format == 'pd':
            assert TestTask().output().load().equals(data)
        else:
            assert TestTask().output().load() == data

    helper(df, d6tflow.tasks.TaskCachePandas, 'pd')
    helper({'test': 1}, d6tflow.tasks.TaskJson)
    helper({'test': 1}, d6tflow.tasks.TaskPickle)

    from d6tflow.tasks.h5 import TaskH5Pandas
    helper(df, TaskH5Pandas, 'pd')

    try:
        from d6tflow.tasks.dt import TaskDatatable
        import datatable as dt
        dt_frame = dt.Frame(df)  # avoid shadowing the datatable module
        helper(dt_frame, TaskDatatable)
    except Exception:
        warnings.warn('datatable failed')

    if 0 == 1:  # todo:
        import dask.dataframe as dd
        from d6tflow.tasks.dask import TaskPqDask
        t1 = Task1()
        t1.run()
        ddf = dd.read_parquet(t1.output().path)
        helper(ddf, TaskPqDask, 'pd')
        t1.invalidate(confirm=False)
Author: d6t, Project: d6tflow, Lines of code: 37, Source file: main.py
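
The disabled "if 0 == 1" block above hints at a plain Dask Parquet round trip behind d6tflow's TaskPqDask. Outside of d6tflow, that round trip looks roughly like the sketch below; the DataFrame contents and the output path are hypothetical.

import pandas as pd
import dask.dataframe as dd

# Hypothetical data and output path, for illustration only.
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
ddf = dd.from_pandas(pdf, npartitions=1)

ddf.to_parquet("tmp_roundtrip.parquet")          # write a Parquet dataset
ddf2 = dd.read_parquet("tmp_roundtrip.parquet")  # read it back lazily
print(ddf2["a"].sum().compute())                 # materializes the result: 6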

Example 3: __init__

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def __init__(self, item, datastore, collection,
                 snapshot=None, filters=None, columns=None,
                 engine="fastparquet"):
        self.engine = engine
        self.datastore = datastore
        self.collection = collection
        self.snapshot = snapshot
        self.item = item

        self._path = utils.make_path(datastore, collection, item)
        if not self._path.exists():
            raise ValueError(
                "Item `%s` doesn't exist. "
                "Create it using collection.write(`%s`, data, ...)" % (
                    item, item))
        if snapshot:
            snap_path = utils.make_path(
                datastore, collection, "_snapshots", snapshot)

            self._path = utils.make_path(snap_path, item)

            if not utils.path_exists(snap_path):
                raise ValueError("Snapshot `%s` doesn't exist" % snapshot)

            if not utils.path_exists(self._path):
                raise ValueError(
                    "Item `%s` doesn't exist in this snapshot" % item)

        self.metadata = utils.read_metadata(self._path)
        self.data = dd.read_parquet(
            self._path, engine=self.engine, filters=filters, columns=columns) 
Author: ranaroussi, Project: pystore, Lines of code: 33, Source file: item.py
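
Example 3 passes the filters and columns arguments straight through to dd.read_parquet. A standalone sketch of those two parameters (with a hypothetical path, engine choice, and column names) might look like this:

import dask.dataframe as dd

# Load only two columns, and only row groups where "year" >= 2020.
# The path and column names are placeholders for illustration.
df = dd.read_parquet(
    "datastore/prices.parquet",
    engine="pyarrow",
    columns=["symbol", "close"],
    filters=[("year", ">=", 2020)],
)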

Example 4: index

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def index(self, item, last=False):
        data = dd.read_parquet(self._item_path(item, as_string=True),
                               columns="index", engine=self.engine)
        if not last:
            return data.index.compute()

        return float(str(data.index).split(
                     "\nName")[0].split("\n")[-1].split(" ")[0]) 
Author: ranaroussi, Project: pystore, Lines of code: 10, Source file: collection.py
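
The string parsing at the end of Example 4 recovers the last index value from the repr of the Index. Assuming the dataset's index is sorted, a more direct sketch (with a hypothetical path) is to read only the index and take its final element:

import dask.dataframe as dd

# columns=[] loads no data columns, so only the index is materialized.
# "datastore/item.parquet" is a placeholder path.
idx = dd.read_parquet("datastore/item.parquet", columns=[]).index
last_value = idx.compute()[-1]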

Example 5: test_copy_dask_to_dir

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow):
    destination = tmp_path / "output"
    destination.mkdir()
    expected_dir_path = destination / "dask_df.pq.dask"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=destination)

    actual = dd.read_parquet(expected_dir_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 
Author: square, Project: bionic, Lines of code: 11, Source file: test_copy.py

Example 6: test_copy_dask_to_gcs_dir

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def test_copy_dask_to_gcs_dir(
    tmp_path, tmp_gcs_url_prefix, expected_dask_df, dask_flow
):
    cloud_url = tmp_gcs_url_prefix + "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    check_call(f"gsutil -m cp -r {cloud_url} {tmp_path}", shell=True)
    actual = dd.read_parquet(tmp_path / "output")
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute()) 
Author: square, Project: bionic, Lines of code: 12, Source file: test_copy.py

Example 7: process_dataframe

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    dask_df = client.persist(dd.read_parquet(hdfs_dir_input))
    st = ScalerTransformer(dask_df)
    scaled_features = st.get_transformed_data()
    scaled_features.repartition(npartitions=32).to_parquet(hdfs_dir_output) 
Author: Morphl-AI, Project: MorphL-Community-Edition, Lines of code: 7, Source file: ga_chp_bq_advanced_preprocessor.py

Example 8: main

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def main():
    client = Client()
    dask_df = client.persist(dd.read_parquet(HDFS_DIR_INPUT))
    ModelGenerator(dask_df).generate_and_save_model() 
Author: Morphl-AI, Project: MorphL-Community-Edition, Lines of code: 6, Source file: ga_chp_bq_model_generator.py

Example 9: append

# Required import: from dask import dataframe [as alias]
# Or: from dask.dataframe import read_parquet [as alias]
def append(self, item, data, npartitions=None, epochdate=False,
               threaded=False, reload_items=False, **kwargs):

        if not utils.path_exists(self._item_path(item)):
            raise ValueError(
                """Item do not exists. Use `<collection>.write(...)`""")

        # work on copy
        data = data.copy()

        try:
            if epochdate or ("datetime" in str(data.index.dtype) and
                             any(data.index.nanosecond > 0)):
                data = utils.datetime_to_int64(data)
            old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                        columns=[], engine=self.engine
                                        ).index.compute()
            data = data[~data.index.isin(old_index)]
        except Exception:
            return

        if data.empty:
            return

        if data.index.name == "":
            data.index.name = "index"

        # combine old dataframe with new
        current = self.item(item)
        new = dd.from_pandas(data, npartitions=1)
        combined = dd.concat([current.data, new]).drop_duplicates(keep="last")

        if npartitions is None:
            memusage = combined.memory_usage(deep=True).sum()
            if isinstance(combined, dd.DataFrame):
                memusage = memusage.compute()
            npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)

        # write data
        write = self.write_threaded if threaded else self.write
        write(item, combined, npartitions=npartitions, chunksize=None,
              metadata=current.metadata, overwrite=True,
              epochdate=epochdate, reload_items=reload_items, **kwargs) 
Author: ranaroussi, Project: pystore, Lines of code: 45, Source file: collection.py


Note: The dask.dataframe.read_parquet method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please refer to each project's license before distributing or using the code; do not reproduce without permission.