當前位置: 首頁>>代碼示例>>Python>>正文


Python dataframe.read_parquet方法代碼示例

本文整理匯總了Python中dask.dataframe.read_parquet方法的典型用法代碼示例。如果您正苦於以下問題:Python dataframe.read_parquet方法的具體用法?Python dataframe.read_parquet怎麼用?Python dataframe.read_parquet使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在dask.dataframe的用法示例。


在下文中一共展示了dataframe.read_parquet方法的9個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: main

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def main(data_path, output_path):
    """Label the parquet dataset at ``data_path`` and save it to ``output_path``.

    The data is read with dask, the labeling functions are applied to build a
    label matrix, a ``LabelModel`` is fitted on that matrix, and the data is
    written back as parquet with an extra ``y_prob`` column.
    """
    # Load the input and split it into two partitions
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path).repartition(npartitions=2)

    # Run every labeling function over the data to get the label matrix
    logging.info("Applying LFs")
    labeling_functions = [article_mentions_person, body_contains_fortune, person_in_db]
    label_matrix = DaskLFApplier(labeling_functions).apply(data)

    # Fit the label model on the label matrix
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(label_matrix)

    # Keep column 1 of the predicted probabilities as the y_prob column
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(label_matrix)[:, 1]
    reindexed = data.reset_index().set_index("index")
    labeled = reindexed.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
開發者ID:snorkel-team,項目名稱:snorkel-tutorials,代碼行數:26,代碼來源:drybell_dask.py

示例2: test_formats

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def test_formats(cleanup):
    """Round-trip sample data through several d6tflow task types.

    For each task class, a throwaway subclass saves the data in ``run`` and
    the loaded output is then compared with what was saved.
    """
    def helper(data,TaskClass,format=None):
        # Save `data` through a subclass of TaskClass, then check what loads back.
        class TestTask(TaskClass):
            def run(self):
                self.save(data)

        TestTask().run()
        if format=='pd':
            # The 'pd' format compares with `.equals` instead of `==`
            assert TestTask().output().load().equals(data)
        else:
            assert TestTask().output().load()==data

    helper(df, d6tflow.tasks.TaskCachePandas, 'pd')
    helper({'test': 1}, d6tflow.tasks.TaskJson)
    helper({'test': 1}, d6tflow.tasks.TaskPickle)

    from d6tflow.tasks.h5 import TaskH5Pandas
    helper(df, TaskH5Pandas, 'pd')

    try:
        from d6tflow.tasks.dt import TaskDatatable
        import datatable as dt
        dt = dt.Frame(df)
        # NOTE(review): TaskDatatable is imported above but TaskH5Pandas is
        # what gets exercised here -- confirm which one was intended.
        helper(dt, TaskH5Pandas)
    except Exception:
        # Best-effort: any failure in the datatable check is downgraded to a
        # warning. `except Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate.
        warnings.warn('datatable failed')

    if 0==1: # todo:
        import dask.dataframe as dd
        t1 = Task1();
        t1.run();
        ddf = dd.read_parquet(t1.output().path)
        from d6tflow.tasks.dask import TaskPqDask
        helper(ddf, TaskPqDask, 'pd')
        t1.invalidate(confirm=False)
開發者ID:d6t,項目名稱:d6tflow,代碼行數:37,代碼來源:main.py

示例3: __init__

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def __init__(self, item, datastore, collection,
                 snapshot=None, filters=None, columns=None,
                 engine="fastparquet"):
        """Resolve the item's path and open its metadata and parquet data.

        When ``snapshot`` is given, the item is looked up under the
        collection's ``_snapshots/<snapshot>`` path instead of the live one.
        ``filters``, ``columns`` and ``engine`` are passed on to
        ``dd.read_parquet``.

        Raises:
            ValueError: if the item, the snapshot, or the item inside the
                snapshot does not exist.
        """
        self.item = item
        self.datastore = datastore
        self.collection = collection
        self.snapshot = snapshot
        self.engine = engine

        # The live item must exist even when a snapshot is requested
        self._path = utils.make_path(datastore, collection, item)
        if not self._path.exists():
            message = (
                "Item `%s` doesn't exist. "
                "Create it using collection.write(`%s`, data, ...)" % (
                    item, item))
            raise ValueError(message)

        if snapshot:
            snapshot_dir = utils.make_path(
                datastore, collection, "_snapshots", snapshot)
            # From here on the item is read from inside the snapshot
            self._path = utils.make_path(snapshot_dir, item)

            if not utils.path_exists(snapshot_dir):
                raise ValueError("Snapshot `%s` doesn't exist" % snapshot)

            if not utils.path_exists(self._path):
                raise ValueError(
                    "Item `%s` doesn't exist in this snapshot" % item)

        self.metadata = utils.read_metadata(self._path)
        self.data = dd.read_parquet(
            self._path, engine=self.engine, filters=filters, columns=columns)
開發者ID:ranaroussi,項目名稱:pystore,代碼行數:33,代碼來源:item.py

示例4: index

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def index(self, item, last=False):
        """Return the stored index of ``item``, or a value parsed from its repr.

        With ``last`` false, the index is computed and returned. With
        ``last`` true, nothing is computed: the text of ``str(data.index)``
        before ``"\\nName"`` is taken, and the first space-separated token of
        its final line is returned as a float.
        """
        frame = dd.read_parquet(self._item_path(item, as_string=True),
                                columns="index", engine=self.engine)
        if last:
            # Parse the printed form of the dask index rather than computing it
            printed = str(frame.index)
            before_name = printed.split("\nName")[0]
            final_line = before_name.split("\n")[-1]
            return float(final_line.split(" ")[0])

        return frame.index.compute()
開發者ID:ranaroussi,項目名稱:pystore,代碼行數:10,代碼來源:collection.py

示例5: test_copy_dask_to_dir

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def test_copy_dask_to_dir(tmp_path, expected_dask_df, dask_flow):
    """Copy the ``dask_df`` entity into a local directory and read it back."""
    output_dir = tmp_path / "output"
    output_dir.mkdir()

    copier = dask_flow.get("dask_df", mode="FileCopier")
    copier.copy(destination=output_dir)

    # The copy is expected to appear under this name inside the destination
    copied_df = dd.read_parquet(output_dir / "dask_df.pq.dask")
    actual_frame = copied_df.compute()
    expected_frame = expected_dask_df.compute()
    assert equal_frame_and_index_content(actual_frame, expected_frame)
開發者ID:square,項目名稱:bionic,代碼行數:11,代碼來源:test_copy.py

示例6: test_copy_dask_to_gcs_dir

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def test_copy_dask_to_gcs_dir(
    tmp_path, tmp_gcs_url_prefix, expected_dask_df, dask_flow
):
    """Copy the ``dask_df`` entity to a GCS URL, download it, and compare it.

    The entity is copied to ``tmp_gcs_url_prefix + "output"``, fetched into
    ``tmp_path`` with ``gsutil``, and the downloaded parquet directory is
    compared with the expected dataframe.
    """
    cloud_url = tmp_gcs_url_prefix + "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)

    # Pass gsutil its arguments as a list instead of an interpolated shell
    # string, so a URL or temp path containing spaces or shell metacharacters
    # is handed over verbatim.
    # NOTE(review): assumes check_call is subprocess.check_call -- confirm.
    check_call(["gsutil", "-m", "cp", "-r", cloud_url, str(tmp_path)])
    actual = dd.read_parquet(tmp_path / "output")
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
開發者ID:square,項目名稱:bionic,代碼行數:12,代碼來源:test_copy.py

示例7: process_dataframe

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def process_dataframe(client, hdfs_dir_input, hdfs_dir_output):
    """Read parquet input, transform it, and write it as 32 partitions.

    The dataframe read from ``hdfs_dir_input`` is persisted through
    ``client``, passed to ``ScalerTransformer``, and the transformed data is
    repartitioned and written as parquet to ``hdfs_dir_output``.
    """
    source_df = dd.read_parquet(hdfs_dir_input)
    dask_df = client.persist(source_df)
    transformer = ScalerTransformer(dask_df)
    scaled_features = transformer.get_transformed_data()
    repartitioned = scaled_features.repartition(npartitions=32)
    repartitioned.to_parquet(hdfs_dir_output)
開發者ID:Morphl-AI,項目名稱:MorphL-Community-Edition,代碼行數:7,代碼來源:ga_chp_bq_advanced_preprocessor.py

示例8: main

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def main():
    """Persist the parquet data at ``HDFS_DIR_INPUT`` and generate the model."""
    client = Client()
    source_df = dd.read_parquet(HDFS_DIR_INPUT)
    persisted_df = client.persist(source_df)
    generator = ModelGenerator(persisted_df)
    generator.generate_and_save_model()
開發者ID:Morphl-AI,項目名稱:MorphL-Community-Edition,代碼行數:6,代碼來源:ga_chp_bq_model_generator.py

示例9: append

# 需要導入模塊: from dask import dataframe [as 別名]
# 或者: from dask.dataframe import read_parquet [as 別名]
def append(self, item, data, npartitions=None, epochdate=False,
               threaded=False, reload_items=False, **kwargs):
        """Append the rows of ``data`` whose index is not yet stored for ``item``.

        Rows whose index values already exist in the stored item are dropped,
        the remainder is concatenated with the stored data, and the result is
        written back with ``overwrite=True`` through ``self.write`` (or
        ``self.write_threaded`` when ``threaded`` is true). If
        ``npartitions`` is None it is derived from the combined memory usage
        and ``DEFAULT_PARTITION_SIZE``.

        Returns None. It also returns without writing when there is nothing
        new to append, or when preparing the new rows fails (the failure is
        logged as a warning).

        Raises:
            ValueError: if the item's path does not exist.
        """
        if not utils.path_exists(self._item_path(item)):
            raise ValueError(
                """Item do not exists. Use `<collection>.write(...)`""")

        # work on copy
        data = data.copy()

        try:
            # `any(...)` is already a bool; the original `any(...) > 0`
            # evaluated to the same value.
            if epochdate or ("datetime" in str(data.index.dtype) and
                             any(data.index.nanosecond)):
                data = utils.datetime_to_int64(data)
            # Read only the stored index (no columns) to find existing rows
            old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                        columns=[], engine=self.engine
                                        ).index.compute()
            data = data[~data.index.isin(old_index)]
        except Exception:
            # Still best-effort (no write, returns None), but the failure is
            # logged instead of vanishing silently.
            import logging
            logging.getLogger(__name__).warning(
                "append to item %r aborted", item, exc_info=True)
            return

        if data.empty:
            return

        # NOTE(review): an unnamed pandas index has name None, not "" --
        # confirm whether None should be handled here too.
        if data.index.name == "":
            data.index.name = "index"

        # combine old dataframe with new
        current = self.item(item)
        new = dd.from_pandas(data, npartitions=1)
        combined = dd.concat([current.data, new]).drop_duplicates(keep="last")

        if npartitions is None:
            memusage = combined.memory_usage(deep=True).sum()
            if isinstance(combined, dd.DataFrame):
                memusage = memusage.compute()
            npartitions = int(1 + memusage // DEFAULT_PARTITION_SIZE)

        # write data
        write = self.write_threaded if threaded else self.write
        write(item, combined, npartitions=npartitions, chunksize=None,
              metadata=current.metadata, overwrite=True,
              epochdate=epochdate, reload_items=reload_items, **kwargs)
開發者ID:ranaroussi,項目名稱:pystore,代碼行數:45,代碼來源:collection.py


注:本文中的dask.dataframe.read_parquet方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。