This article collects typical usage examples of the pyarrow.parquet.write_table method in Python. If you are wondering how exactly parquet.write_table is used, how it behaves, or what real calls to it look like, the hand-picked code examples below should help. You can also explore further usage examples from the module the method belongs to, pyarrow.parquet.
A total of 15 code examples of parquet.write_table are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
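All of the examples below follow the same basic pattern: build a pyarrow.Table (usually from a pandas DataFrame) and hand it to pq.write_table together with either a file path or an in-memory buffer. A minimal sketch of that pattern, not taken from any of the projects quoted below:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"a": [1, 2, 3]})
table = pa.Table.from_pandas(df)

# Write to a file path with an explicit compression codec ...
pq.write_table(table, "example.parquet", compression="snappy")

# ... or write to an in-memory buffer and extract the raw parquet bytes.
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
parquet_bytes = buf.getvalue().to_pybytes()

# Round-trip check: read the file back into a DataFrame.
df_back = pq.read_table("example.parquet").to_pandas()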
Example 1: dataframe_into_parquet_bytes
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def dataframe_into_parquet_bytes(
df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
"""
Convert a dataframe into bytes representing a parquet table.
Parameters
----------
df: pd.DataFrame
DataFrame to be compressed
compression: str
Compression to use, passed to :func:`pyarrow.parquet.write_table`
Returns
-------
bytes
"""
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression=compression)
return buf.getvalue().to_pybytes()
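For context, the reverse direction is a plain pq.read_table over a buffer reader; the helper below is an illustrative counterpart, not part of the project the example above comes from:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    # Hypothetical counterpart: wrap the raw bytes and let parquet decode them.
    reader = pa.BufferReader(buf)
    return pq.read_table(reader).to_pandas()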
Example 2: convert_apache_arrow_feather_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_apache_arrow_feather_to_apache_parquet(
data_path: InputPath('ApacheArrowFeather'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts Apache Arrow Feather to Apache Parquet.
[Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import feather, parquet
table = feather.read_table(data_path)
parquet.write_table(table, output_data_path)
Example 3: test_pandas_parquet_serialization
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_pandas_parquet_serialization():
# Only test this if pandas is installed
pytest.importorskip("pandas")
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
tempdir = tempfile.mkdtemp()
filename = os.path.join(tempdir, "parquet-test")
pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename)
with open(os.path.join(tempdir, "parquet-compression"), "wb") as f:
table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"])
pq.write_table(table, f, compression="lz4")
# Clean up
shutil.rmtree(tempdir)
Example 4: store
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def store(self, store, key_prefix, df):
key = "{}.parquet".format(key_prefix)
if isinstance(df, pa.Table):
table = df
else:
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
if (
self.chunk_size
and self.chunk_size < len(table)
and not ARROW_LARGER_EQ_0150
):
table = _reset_dictionary_columns(table)
pq.write_table(
table,
buf,
version=self._PARQUET_VERSION,
chunk_size=self.chunk_size,
compression=self.compression,
coerce_timestamps="us",
)
store.put(key, buf.getvalue().to_pybytes())
return key
Example 5: test_pyarrow_07992
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_pyarrow_07992(store):
key = "test.parquet"
df = pd.DataFrame({"a": [1]})
table = pa.Table.from_pandas(df)
meta = b"""{
"pandas_version": "0.20.3",
"index_columns": ["__index_level_0__"],
"columns": [
{"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
{"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
],
"column_indexes": [
{"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
]
}"""
table = table.replace_schema_metadata({b"pandas": meta})
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(key, buf.getvalue().to_pybytes())
pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example 6: test_index_metadata
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_index_metadata(store):
key = "test.parquet"
df = pd.DataFrame({"a": [1]})
table = pa.Table.from_pandas(df)
meta = b"""{
"pandas_version": "0.20.3",
"index_columns": ["__index_level_0__"],
"columns": [
{"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
]
}"""
table = table.replace_schema_metadata({b"pandas": meta})
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(key, buf.getvalue().to_pybytes())
pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example 7: insert
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def insert(self, path, expr, **kwargs):
path = self.root / path
df = execute(expr)
table = pa.Table.from_pandas(df)
pq.write_table(table, str(path))
Example 8: to_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def to_parquet(
df,
bucket_name,
prefix,
retry_config,
session_kwargs,
client_kwargs,
compression=None,
flavor="spark",
):
import pyarrow as pa
import pyarrow.parquet as pq
session = Session(**session_kwargs)
client = session.resource("s3", **client_kwargs)
bucket = client.Bucket(bucket_name)
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression=compression, flavor=flavor)
response = retry_api_call(
bucket.put_object,
config=retry_config,
Body=buf.getvalue().to_pybytes(),
Key=prefix + str(uuid.uuid4()),
)
return "s3://{0}/{1}".format(response.bucket_name, response.key)
Example 9: save_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def save_parquet(filename, dataframe, hf_meta):
table = pa.Table.from_pandas(dataframe, preserve_index=True)
meta_dict = table.schema.metadata
hf_string = json.dumps(hf_meta).encode()
meta_dict[b"hydrofunctions_meta"] = hf_string
table = table.replace_schema_metadata(meta_dict)
pq.write_table(table, filename)
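The matching read side is not shown in the source; a sketch of what it might look like, assuming the same b"hydrofunctions_meta" key (the function name is hypothetical):

import json
import pyarrow.parquet as pq

def read_parquet_with_hf_meta(filename):
    # Hypothetical reader: pull the JSON blob back out of the schema metadata.
    table = pq.read_table(filename)
    meta_dict = table.schema.metadata or {}
    hf_meta = json.loads(meta_dict[b"hydrofunctions_meta"].decode())
    return table.to_pandas(), hf_meta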
Example 10: convert_tsv_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_tsv_to_apache_parquet(
data_path: InputPath('TSV'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts TSV table to Apache Parquet.
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import csv, parquet
table = csv.read_csv(data_path, parse_options=csv.ParseOptions(delimiter='\t'))
parquet.write_table(table, output_data_path)
Example 11: convert_csv_to_apache_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def convert_csv_to_apache_parquet(
data_path: InputPath('CSV'),
output_data_path: OutputPath('ApacheParquet'),
):
'''Converts CSV table to Apache Parquet.
[Apache Parquet](https://parquet.apache.org/)
Annotations:
author: Alexey Volkov <alexey.volkov@ark-kun.com>
'''
from pyarrow import csv, parquet
table = csv.read_csv(data_path)
parquet.write_table(table, output_data_path)
Example 12: test_read_parquet
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_read_parquet(tmpdir, continuous):
str_arr = pa.array(["a", None, "c"], pa.string())
int_arr = pa.array([1, None, -2], pa.int32())
bool_arr = pa.array([True, None, False], pa.bool_())
table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])
pq.write_table(table, "df.parquet")
result = fr.read_parquet("df.parquet", continuous=continuous)
expected = fr.pandas_from_arrow(table, continuous=continuous)
tm.assert_frame_equal(result, expected)
Example 13: test_parquet_roundtrip
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def test_parquet_roundtrip(array_type):
df = pd.DataFrame({"col": array_type(["A", "B"])})
table = pa.Table.from_pandas(df)
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
reader = pa.BufferReader(buf.getvalue().to_pybytes())
table = pq.read_table(reader)
pdt.assert_frame_equal(df, table.to_pandas())
Example 14: gen_kde
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def gen_kde(N, file_name):
# np.random.seed(0)
df = pd.DataFrame({'points': np.random.random(N)})
table = pa.Table.from_pandas(df)
row_group_size = 128
    pq.write_table(table, file_name, row_group_size=row_group_size)  # write to the requested path
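To see the effect of row_group_size, the written file can be inspected with pq.ParquetFile; a minimal sketch, assuming gen_kde was called with file_name='kde.parquet':

import pyarrow.parquet as pq

pf = pq.ParquetFile('kde.parquet')
print(pf.num_row_groups)                    # roughly N / 128, rounded up
print(pf.metadata.row_group(0).num_rows)    # at most 128 rows per group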
Example 15: store
# Required import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import write_table [as alias]
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
"""
Store the index as a parquet file
If compatible, the new keyname will be the name stored under the attribute `index_storage_key`.
If this attribute is None, a new key will be generated of the format
`{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`
where the timestamp is in nanosecond accuracy and is created upon Index object initialization
Parameters
----------
store:
dataset_uuid:
"""
storage_key = None
if (
self.index_storage_key is not None
and dataset_uuid
and dataset_uuid in self.index_storage_key
):
storage_key = self.index_storage_key
if storage_key is None:
storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
dataset_uuid=dataset_uuid,
suffix=naming.EXTERNAL_INDEX_SUFFIX,
column=quote(self.column),
timestamp=quote(self.creation_time.isoformat()),
)
table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
buf = pa.BufferOutputStream()
pq.write_table(table, buf)
store.put(storage_key, buf.getvalue().to_pybytes())
return storage_key
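For illustration, the stored bytes can be read back the same way the other examples decode buffers; restore_index_table below is a hypothetical helper, not the project's actual API:

import pyarrow as pa
import pyarrow.parquet as pq

def restore_index_table(store, storage_key):
    # Assumption: the KeyValueStore returns the stored bytes from get().
    parquet_bytes = store.get(storage_key)
    return pq.read_table(pa.BufferReader(parquet_bytes))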