This article collects typical usage examples of the pyarrow.BufferOutputStream method in Python: what the method does, how it is called, and how it is used in real projects. You can also explore further usage examples of the module it belongs to, pyarrow.
The 11 code examples below all use pyarrow.BufferOutputStream and are ordered by popularity.
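Broadly, the examples fall into two idioms: BufferOutputStream as an in-memory sink for a Parquet file (Examples 1-4 and 7-11) and as a sink for the Arrow IPC stream format (Examples 5-6). Here is a minimal, self-contained sketch of both idioms for orientation; it is not taken from any of the projects below.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"x": [1, 2, 3]})
table = pa.Table.from_pandas(df)

# Idiom 1: BufferOutputStream as an in-memory Parquet sink.
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
parquet_bytes = buf.getvalue().to_pybytes()  # bytes of a complete parquet file

# Idiom 2: BufferOutputStream as a sink for the Arrow IPC stream format.
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, table.schema)
writer.write_table(table)
writer.close()
ipc_buffer = sink.getvalue()  # a pyarrow.Buffer holding the IPC stream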
Example 1: dataframe_into_parquet_bytes
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def dataframe_into_parquet_bytes(
    df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
    """
    Convert a dataframe into bytes representing a parquet table.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame to be compressed
    compression: str
        Compression to use, passed to :func:`pyarrow.parquet.write_table`

    Returns
    -------
    bytes
    """
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression)
    return buf.getvalue().to_pybytes()
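The inverse direction, turning those parquet bytes back into a DataFrame, follows the same pattern in reverse (compare Example 7 below). A sketch, with the function name dataframe_from_parquet_bytes chosen here for illustration:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    """Convert bytes representing a parquet table back into a DataFrame."""
    # Wrap the raw bytes in a reader and let parquet decode them.
    reader = pa.BufferReader(buf)
    return pq.read_table(reader).to_pandas()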
Example 2: store
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def store(self, store, key_prefix, df):
    key = "{}.parquet".format(key_prefix)
    if isinstance(df, pa.Table):
        table = df
    else:
        table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()

    if (
        self.chunk_size
        and self.chunk_size < len(table)
        and not ARROW_LARGER_EQ_0150
    ):
        table = _reset_dictionary_columns(table)
    pq.write_table(
        table,
        buf,
        version=self._PARQUET_VERSION,
        chunk_size=self.chunk_size,
        compression=self.compression,
        coerce_timestamps="us",
    )
    store.put(key, buf.getvalue().to_pybytes())
    return key
Example 3: test_pyarrow_07992
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_pyarrow_07992(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
            {"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
        ],
        "column_indexes": [
            {"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example 4: to_parquet
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def to_parquet(
    df,
    bucket_name,
    prefix,
    retry_config,
    session_kwargs,
    client_kwargs,
    compression=None,
    flavor="spark",
):
    import pyarrow as pa
    import pyarrow.parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key)
Example 5: serialize
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def serialize(self, rows):
    # Need to be able to serialize `None`. A bit hacky, but we use an empty buffer to encode 'None'.
    sink = pa.BufferOutputStream()
    if rows is not None:
        writer = pa.RecordBatchStreamWriter(sink, rows.schema)
        writer.write_table(rows)
        writer.close()
    return sink.getvalue()
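A matching deserialize would read the IPC stream back and map the empty-buffer case to None. The sketch below is illustrative only (it assumes pa is pyarrow and is not the project's actual counterpart):

def deserialize(self, serialized_rows):
    # The serialize side encodes `None` as an empty buffer.
    if serialized_rows is None or serialized_rows.size == 0:
        return None
    # Open the Arrow IPC stream and materialize it as a single table.
    reader = pa.ipc.open_stream(serialized_rows)
    return reader.read_all()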
Example 6: _serialize_arrow_payload
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def _serialize_arrow_payload(data, table_metadata, preserve_index=True):
    if isinstance(data, pd.DataFrame):
        # detect if there are categorical columns in dataframe
        cols = data.select_dtypes(include=['category']).columns
        # if there are categorical columns, make a copy before casting
        # to avoid mutating input data
        # https://github.com/omnisci/pymapd/issues/169
        if cols.size > 0:
            data_ = data.copy()
            data_[cols] = data_[cols].astype('object')
        else:
            data_ = data
        data = pa.RecordBatch.from_pandas(data_, preserve_index=preserve_index)

    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, data.schema)

    if isinstance(data, pa.RecordBatch):
        writer.write_batch(data)
    elif isinstance(data, pa.Table):
        writer.write_table(data)

    writer.close()
    return stream.getvalue()
Example 7: test_parquet_roundtrip
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas())
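One detail worth noting in this round trip: getvalue() returns a pyarrow.Buffer, and pa.BufferReader accepts that Buffer directly, so to_pybytes() is only needed when plain Python bytes are required (for example, to put them into a key-value store). A small sketch of the copy-free variant:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"col": ["A", "B"]})
buf = pa.BufferOutputStream()
pq.write_table(pa.Table.from_pandas(df), buf)

# BufferReader can wrap the pyarrow.Buffer without the extra copy
# that to_pybytes() would make.
roundtripped = pq.read_table(pa.BufferReader(buf.getvalue())).to_pandas()
pd.testing.assert_frame_equal(df, roundtripped)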
Example 8: store
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If compatible, the new key name will be the name stored under the attribute
    `index_storage_key`. If this attribute is None, a new key will be generated of
    the format `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`,
    where the timestamp is in nanosecond accuracy and is created upon Index object
    initialization.

    Parameters
    ----------
    store:
    dataset_uuid:
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
Example 9: __getstate__
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def __getstate__(self):
    if not self.loaded:
        return (self.column, self.index_storage_key, self.dtype, None)

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    parquet_bytes = buf.getvalue().to_pybytes()
    # Since `self.dtype` can be inferred from the parquet bytes, do not return
    # this argument during serialization, to avoid unnecessary memory consumption.
    return (self.column, self.index_storage_key, None, parquet_bytes)
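For symmetry, the unpickling side has to undo this. The rough sketch below only illustrates the shape such a __setstate__ could take; the attribute handling and the helper _table_to_index_dct (an assumed inverse of _index_dct_to_table) are hypothetical, not taken from the project:

def __setstate__(self, state):
    # Rough counterpart sketch; the real class may rebuild its state differently.
    column, index_storage_key, dtype, parquet_bytes = state
    index_dct = None
    if parquet_bytes is not None:
        # Decode the parquet payload written by __getstate__ back into a table;
        # the dtype can be re-inferred from the decoded schema.
        table = pq.read_table(pa.BufferReader(parquet_bytes))
        dtype = table.schema.field(column).type
        index_dct = _table_to_index_dct(table, column)  # hypothetical inverse helper
    self.column = column
    self.index_storage_key = index_storage_key
    self.dtype = dtype
    self.index_dct = index_dct
    self.loaded = parquet_bytes is not None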
Example 10: _schema2bytes
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def _schema2bytes(schema):
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
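The bytes produced here are a parquet metadata file, so the schema can be read straight back out of them. A hedged sketch of the inverse direction (the name _bytes2schema simply mirrors the function above and is chosen for illustration):

import pyarrow as pa
import pyarrow.parquet as pq


def _bytes2schema(data: bytes) -> pa.Schema:
    # Wrap the metadata bytes in a reader and let parquet recover the Arrow schema.
    return pq.read_schema(pa.BufferReader(data))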
Example 11: test_compat_old_rw_path
# Required import: import pyarrow [as alias]
# Or: from pyarrow import BufferOutputStream [as alias]
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are cast to 64 bit
                and "16" not in c  # 16 bit types are cast to 64 bit
                and "32" not in c  # 32 bit types are cast to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")

    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)