

Python pyarrow.BufferOutputStream Method Code Examples

This article collects typical usage examples of the pyarrow.BufferOutputStream method in Python. If you have been wondering what pyarrow.BufferOutputStream is for, or how to use it in practice, the curated snippets below should help. You can also explore further usage examples from the pyarrow package.


Below are 11 code examples of the pyarrow.BufferOutputStream method, sorted by popularity by default.
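
Before diving into the examples, here is a minimal sketch (not taken from any of the projects below) of the pattern they all share: pa.BufferOutputStream is a growable in-memory sink, any pyarrow writer can write into it, and getvalue() returns the accumulated data as a pyarrow.Buffer.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Write a small table into an in-memory buffer instead of a file.
df = pd.DataFrame({"x": [1, 2, 3]})
table = pa.Table.from_pandas(df)

buf = pa.BufferOutputStream()  # growable in-memory sink
pq.write_table(table, buf)     # any writer that accepts a sink works here

data = buf.getvalue()      # -> pyarrow.Buffer
raw = data.to_pybytes()    # -> plain Python bytes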

Example 1: dataframe_into_parquet_bytes

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def dataframe_into_parquet_bytes(
    df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
    """
    Convert a dataframe into bytes representing a parquet table.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame to be compressed
    compression: str
        Compression to use, passed to :func:`pyarrow.parquet.write_table`

    Returns
    -------
    bytes
    """
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression)
    return buf.getvalue().to_pybytes() 
Developer: equinor, Project: gordo, Lines: 23, Source: utils.py
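
The inverse direction is not part of this snippet. A hedged sketch of how such parquet bytes can be read back (the helper name dataframe_from_parquet_bytes is chosen here for symmetry and is not necessarily the project's own):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    # Wrap the raw bytes in a zero-copy reader and parse the parquet table.
    return pq.read_table(pa.BufferReader(buf)).to_pandas()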

Example 2: store

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def store(self, store, key_prefix, df):
        key = "{}.parquet".format(key_prefix)
        if isinstance(df, pa.Table):
            table = df
        else:
            table = pa.Table.from_pandas(df)
        buf = pa.BufferOutputStream()
        if (
            self.chunk_size
            and self.chunk_size < len(table)
            and not ARROW_LARGER_EQ_0150
        ):
            table = _reset_dictionary_columns(table)
        pq.write_table(
            table,
            buf,
            version=self._PARQUET_VERSION,
            chunk_size=self.chunk_size,
            compression=self.compression,
            coerce_timestamps="us",
        )
        store.put(key, buf.getvalue().to_pybytes())
        return key 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 25, Source: _parquet.py

Example 3: test_pyarrow_07992

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_pyarrow_07992(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
            {"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
        ],
        "column_indexes": [
            {"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df) 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 22, Source: test_parquet.py

Example 4: to_parquet

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def to_parquet(
    df,
    bucket_name,
    prefix,
    retry_config,
    session_kwargs,
    client_kwargs,
    compression=None,
    flavor="spark",
):
    import pyarrow as pa
    import pyarrow.parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key) 
Developer: laughingman7743, Project: PyAthena, Lines: 28, Source: util.py

Example 5: serialize

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def serialize(self, rows):
        # Need to be able to serialize `None`. A bit hacky, but we use an empty buffer to encode 'None'.
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchStreamWriter(sink, rows.schema)
        writer.write_table(rows)
        writer.close()
        return sink.getvalue() 
Developer: uber, Project: petastorm, Lines: 9, Source: arrow_table_serializer.py
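
The matching read side is not shown in this snippet. A minimal sketch of a deserializer, assuming the empty-buffer-encodes-None convention from the comment above (deserialize here is an illustrative counterpart, not necessarily petastorm's exact implementation):

import pyarrow as pa

def deserialize(serialized: pa.Buffer):
    # An empty buffer encodes `None`, mirroring the hack in `serialize`.
    if serialized.size == 0:
        return None
    reader = pa.ipc.open_stream(serialized)
    return reader.read_all()  # -> pa.Table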

Example 6: _serialize_arrow_payload

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def _serialize_arrow_payload(data, table_metadata, preserve_index=True):

    if isinstance(data, pd.DataFrame):

        # detect if there are categorical columns in dataframe
        cols = data.select_dtypes(include=['category']).columns

        # if there are categorical columns, make a copy before casting
        # to avoid mutating input data
        # https://github.com/omnisci/pymapd/issues/169
        if cols.size > 0:
            data_ = data.copy()
            data_[cols] = data_[cols].astype('object')
        else:
            data_ = data

        data = pa.RecordBatch.from_pandas(data_, preserve_index=preserve_index)

    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, data.schema)

    if isinstance(data, pa.RecordBatch):
        writer.write_batch(data)
    elif isinstance(data, pa.Table):
        writer.write_table(data)

    writer.close()
    return stream.getvalue() 
Developer: omnisci, Project: pymapd, Lines: 30, Source: _pandas_loaders.py

Example 7: test_parquet_roundtrip

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
Developer: xhochy, Project: fletcher, Lines: 10, Source: test_pyarrow_roundtrip.py

Example 8: store

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
        """
        Store the index as a parquet file

        If compatible, the new key name will be the name stored under the attribute `index_storage_key`.
        If this attribute is None, a new key will be generated of the format

            `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

        where the timestamp has nanosecond accuracy and is created when the Index object is initialized

        Parameters
        ----------
        store:
        dataset_uuid:
        """
        storage_key = None

        if (
            self.index_storage_key is not None
            and dataset_uuid
            and dataset_uuid in self.index_storage_key
        ):
            storage_key = self.index_storage_key
        if storage_key is None:
            storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
                dataset_uuid=dataset_uuid,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
                column=quote(self.column),
                timestamp=quote(self.creation_time.isoformat()),
            )

        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf)

        store.put(storage_key, buf.getvalue().to_pybytes())
        return storage_key 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 40, Source: index.py

Example 9: __getstate__

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def __getstate__(self):
        if not self.loaded:
            return (self.column, self.index_storage_key, self.dtype, None)

        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf)
        parquet_bytes = buf.getvalue().to_pybytes()
        # Since `self.dtype` can be inferred from the parquet bytes, do not
        # include it in the serialized state, to avoid unnecessary memory consumption
        return (self.column, self.index_storage_key, None, parquet_bytes) 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 13, Source: index.py

Example 10: _schema2bytes

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def _schema2bytes(schema):
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes() 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 6, Source: common_metadata.py
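
The project pairs this with a read direction. A hedged sketch of the inverse, written from the public pyarrow API rather than copied from kartothek (the name _bytes2schema is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

def _bytes2schema(data: bytes) -> pa.Schema:
    # read_schema parses only the parquet footer, so this is cheap.
    return pq.read_schema(pa.BufferReader(data))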

Example 11: test_compat_old_rw_path

# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF, since some column types weren't supported previously anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are cast to 64 bit
                and "16" not in c  # 16 bit types are cast to 64 bit
                and "32" not in c  # 32 bit types are cast to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta) 
Developer: JDASoftwareGroup, Project: kartothek, Lines: 42, Source: test_common_metadata.py


Note: The pyarrow.BufferOutputStream method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, who retain the copyright; refer to each project's license before distributing or using the code. Do not reproduce without permission.