This article collects typical usage examples of the Python function pyarrow.parquet.write_table. If you have been wondering what write_table does, how to call it, and what it looks like in real code, the curated function examples below may be just what you need.
The following 15 code examples of write_table are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
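Before the collected examples, here is a minimal, self-contained sketch of the typical write_table round trip. The DataFrame contents and the output path 'demo.parquet' are illustrative, not taken from any example below.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build a small DataFrame and convert it to an Arrow table.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df)

# Write the table to a Parquet file (path is illustrative).
pq.write_table(table, 'demo.parquet')

# Read it back and check the round trip.
df_read = pq.read_table('demo.parquet').to_pandas()
assert df_read.equals(df)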
Example 1: test_pandas_parquet_configuration_options
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Example 2: test_read_multiple_parquet_files
def test_read_multiple_parquet_files(self):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)
        paths.append(path)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())
Example 3: test_read_single_row_group
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    # Use integer division so row_group_size is an int under Python 3.
    pq.write_table(a_table, buf, row_group_size=N // K,
                   compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols)
                  for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
Example 4: test_pandas_parquet_1_0_roundtrip
def test_pandas_parquet_1_0_roundtrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")

    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example 5: _write_table
def _write_table(table, path, **kwargs):
    import pyarrow.parquet as pq

    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table
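A short usage sketch for this helper, assuming the pandas/pyarrow imports used throughout these examples; the file name is illustrative. Keyword arguments pass straight through to pq.write_table:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
# Accepts a DataFrame directly and returns the converted Arrow table.
table = _write_table(df, 'helper_demo.parquet', compression='snappy')
assert table.num_rows == 3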
Example 6: make_sample_file
def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)
Example 7: test_pandas_parquet_2_0_roundtrip
def test_pandas_parquet_2_0_roundtrip(tmpdir):
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example 8: test_pandas_parquet_native_file_roundtrip
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example 9: test_column_of_lists
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()

    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                       schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example 10: _write_partition_pyarrow
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # write_metadata does not take a compression keyword,
            # so drop it before forwarding the remaining kwargs.
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
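A hedged usage sketch of this helper: any callable with an open(path, mode) signature works as open_with, so the builtin open is enough for local files. The paths and compression choice below are illustrative:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
# Write one partition plus a separate schema-only _metadata file.
_write_partition_pyarrow(df, open, 'part-0.parquet', write_index=False,
                         metadata_path='_metadata', compression='snappy')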
Example 11: read_parquet
def read_parquet(fn):
    """Read a Parquet file with pyarrow and write its first three columns back out."""
    print("Loading parquet file: %s..." % fn)
    tbl = pq.read_table(fn)
    df = tbl.to_pandas()
    # Keep the first three columns and round-trip them through a new file.
    d = df.iloc[:, 0:3]
    table = pa.Table.from_pandas(d)
    pq.write_table(table, 'example.parquet')
Example 12: test_min_chunksize
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
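As a follow-up sketch, a positive chunk_size should split a larger table into multiple row groups, assuming this pyarrow version treats chunk_size as the maximum number of rows per row group, like the row_group_size parameter used in Example 3:

big = pa.Table.from_pandas(pd.DataFrame({'x': np.arange(100)}))
buf = io.BytesIO()
pq.write_table(big, buf, chunk_size=25)
buf.seek(0)
# Expect 100 rows / 25 rows per group = 4 row groups under that assumption.
print(pq.ParquetFile(buf).num_row_groups)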
Example 13: test_client
def test_client(tmpdir, data):
    # construct with a path to a file
    d = tmpdir / 'pq'
    d.mkdir()

    for k, v in data.items():
        f = d / "{}.parquet".format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    c = ParquetClient(tmpdir)
    assert c.list_databases() == ['pq']
    assert c.database().pq.list_tables() == ['close', 'open']
Example 14: test_pandas_column_selection
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_roundtrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Example 15: test_fastparquet_read_with_hdfs
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)