This article collects typical usage examples of the pyarrow.parquet.write_table method in Python. If you are wondering how exactly parquet.write_table is used, how it behaves, or what real calls to it look like, the hand-picked code examples below should help. You can also explore further usage examples from the module the method belongs to, pyarrow.parquet.
A total of 15 code examples of parquet.write_table are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
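All of the examples below follow the same basic pattern: build a pyarrow.Table (usually from a pandas DataFrame) and hand it to pq.write_table together with either a file path or an in-memory buffer. A minimal sketch of that pattern, not taken from any of the projects quoted below:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"a": [1, 2, 3]})
table = pa.Table.from_pandas(df)

# Write to a file path with an explicit compression codec ...
pq.write_table(table, "example.parquet", compression="snappy")

# ... or write to an in-memory buffer and extract the raw parquet bytes.
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
parquet_bytes = buf.getvalue().to_pybytes()

# Round-trip check: read the file back into a DataFrame.
df_back = pq.read_table("example.parquet").to_pandas()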
Example 1: dataframe_into_parquet_bytes
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def dataframe_into_parquet_bytes(
df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
"""
Convert a dataframe into bytes representing a parquet table.
Parameters
----------
df: pd.DataFrame
DataFrame to be compressed
compression: str
Compression to use, passed to :func:`pyarrow.parquet.write_table`
Returns
-------
bytes
"""
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression=compression)
return buf.getvalue().to_pybytes()
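For context, the reverse direction is a plain pq.read_table over a buffer reader; the helper below is an illustrative counterpart, not part of the project the example above comes from:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    # Hypothetical counterpart: wrap the raw bytes and let parquet decode them.
    reader = pa.BufferReader(buf)
    return pq.read_table(reader).to_pandas()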
Example 2: convert_apache_arrow_feather_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_apache_arrow_feather_to_apache_parquet(
data_path: InputPath('ApacheArrowFeather'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts Apache Arrow Feather to Apache Parquet.
[Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import feather, parquet
table = feather.read_table(data_path)
parquet.write_table(table, output_data_path)
Example 3: test_pandas_parquet_serialization
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_pandas_parquet_serialization():
# Only test this if pandas is installed
pytest.importorskip("pandas")
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
tempdir = tempfile.mkdtemp()
filename = os.path.join(tempdir, "parquet-test")
pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename)
with open(os.path.join(tempdir, "parquet-compression"), "wb") as f:
table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"])
pq.write_table(table, f, compression="lz4")
# Clean up
shutil.rmtree(tempdir)
Example 4: store
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def store(self, store, key_prefix, df):
key = "{}.parquet".format(key_prefix)
if isinstance(df, pa.Table):
table = df
else:
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
if (
self.chunk_size
and self.chunk_size < len(table)
and not ARROW_LARGER_EQ_0150
):
table = _reset_dictionary_columns(table)
pq.write_table(
table,
buf,
version=self._PARQUET_VERSION,
chunk_size=self.chunk_size,
compression=self.compression,
coerce_timestamps="us",
)
store.put(key, buf.getvalue().to_pybytes())
return key
Example 5: test_pyarrow_07992
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_pyarrow_07992(store):
key = "test.parquet"
df = pd.DataFrame({"a": [1]})
table = pa.Table.from_pandas(df)
meta = b"""{
"pandas_version": "0.20.3",
"index_columns": ["__index_level_0__"],
"columns": [
{"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
{"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
],
"column_indexes": [
{"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
]
}"""
table = table.replace_schema_metadata({b"pandas": meta})
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(key, buf.getvalue().to_pybytes())
pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example 6: test_index_metadata
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_index_metadata(store):
key = "test.parquet"
df = pd.DataFrame({"a": [1]})
table = pa.Table.from_pandas(df)
meta = b"""{
"pandas_version": "0.20.3",
"index_columns": ["__index_level_0__"],
"columns": [
{"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
]
}"""
table = table.replace_schema_metadata({b"pandas": meta})
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(key, buf.getvalue().to_pybytes())
pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example 7: insert
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def insert(self, path, expr, **kwargs):
path = self.root / path
df = execute(expr)
table = pa.Table.from_pandas(df)
pq.write_table(table, str(path))
Example 8: to_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def to_parquet(
df,
bucket_name,
prefix,
retry_config,
session_kwargs,
client_kwargs,
compression=None,
flavor="spark",
):
import pyarrow as pa
import pyarrow.parquet as pq
session = Session(**session_kwargs)
client = session.resource("s3", **client_kwargs)
bucket = client.Bucket(bucket_name)
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression=compression, flavor=flavor)
response = retry_api_call(
bucket.put_object,
config=retry_config,
Body=buf.getvalue().to_pybytes(),
Key=prefix + str(uuid.uuid4()),
)
return "s3://{0}/{1}".format(response.bucket_name, response.key)
Example 9: save_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def save_parquet(filename, dataframe, hf_meta):
table = pa.Table.from_pandas(dataframe, preserve_index=True)
meta_dict = table.schema.metadata
hf_string = json.dumps(hf_meta).encode()
meta_dict[b"hydrofunctions_meta"] = hf_string
table = table.replace_schema_metadata(meta_dict)
pq.write_table(table, filename)
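The matching read side is not shown in the source; a sketch of what it might look like, assuming the same b"hydrofunctions_meta" key (the function name is hypothetical):

import json
import pyarrow.parquet as pq

def read_parquet_with_hf_meta(filename):
    # Hypothetical reader: pull the JSON blob back out of the schema metadata.
    table = pq.read_table(filename)
    meta_dict = table.schema.metadata or {}
    hf_meta = json.loads(meta_dict[b"hydrofunctions_meta"].decode())
    return table.to_pandas(), hf_meta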
Example 10: convert_tsv_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_tsv_to_apache_parquet(
data_path: InputPath('TSV'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts TSV table to Apache Parquet.
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import csv, parquet
table = csv.read_csv(data_path, parse_options=csv.ParseOptions(delimiter='\t'))
parquet.write_table(table, output_data_path)
Example 11: convert_csv_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_csv_to_apache_parquet(
data_path: InputPath('CSV'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts CSV table to Apache Parquet.
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import csv, parquet
table = csv.read_csv(data_path)
parquet.write_table(table, output_data_path)
Example 12: test_read_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_read_parquet(tmpdir, continuous):
str_arr = pa.array(["a", None, "c"], pa.string())
int_arr = pa.array([1, None, -2], pa.int32())
bool_arr = pa.array([True, None, False], pa.bool_())
table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])
pq.write_table(table, "df.parquet")
result = fr.read_parquet("df.parquet", continuous=continuous)
expected = fr.pandas_from_arrow(table, continuous=continuous)
tm.assert_frame_equal(result, expected)
Example 13: test_parquet_roundtrip
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_parquet_roundtrip(array_type):
df = pd.DataFrame({"col": array_type(["A", "B"])})
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
reader = pa.BufferReader(buf.getvalue().to_pybytes())
table = pq.read_table(reader)
pdt.assert_frame_equal(df, table.to_pandas())
Example 14: gen_kde
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def gen_kde(N, file_name):
# np.random.seed(0)
df = pd.DataFrame({'points': np.random.random(N)})
table = pa.Table.from_pandas(df)
row_group_size = 128
    pq.write_table(table, file_name, row_group_size=row_group_size)  # write to the requested path
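To see the effect of row_group_size, the written file can be inspected with pq.ParquetFile; a minimal sketch, assuming gen_kde was called with file_name='kde.parquet':

import pyarrow.parquet as pq

pf = pq.ParquetFile('kde.parquet')
print(pf.num_row_groups)                    # roughly N / 128, rounded up
print(pf.metadata.row_group(0).num_rows)    # at most 128 rows per group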
Example 15: store
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
"""
Store the index as a parquet file
If compatible, the new keyname will be the name stored under the attribute `index_storage_key`.
If this attribute is None, a new key will be generated of the format
`{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`
where the timestamp is in nanosecond accuracy and is created upon Index object initialization
Parameters
----------
store:
dataset_uuid:
"""
storage_key = None
if (
self.index_storage_key is not None
and dataset_uuid
and dataset_uuid in self.index_storage_key
):
storage_key = self.index_storage_key
if storage_key is None:
storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
dataset_uuid=dataset_uuid,
suffix=naming.EXTERNAL_INDEX_SUFFIX,
column=quote(self.column),
timestamp=quote(self.creation_time.isoformat()),
)
table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(storage_key, buf.getvalue().to_pybytes())
return storage_key
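For illustration, the stored bytes can be read back the same way the other examples decode buffers; restore_index_table below is a hypothetical helper, not the project's actual API:

import pyarrow as pa
import pyarrow.parquet as pq

def restore_index_table(store, storage_key):
    # Assumption: the KeyValueStore returns the stored bytes from get().
    parquet_bytes = store.get(storage_key)
    return pq.read_table(pa.BufferReader(parquet_bytes))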