本文整理汇总了Python中pyarrow.parquet.read_table方法的典型用法代码示例。如果您正苦于以下问题:Python parquet.read_table方法的具体用法?Python parquet.read_table怎么用?Python parquet.read_table使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pyarrow.parquet的用法示例。
在下文中一共展示了parquet.read_table方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _get_parquet_dmatrix_file_mode
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _get_parquet_dmatrix_file_mode(files_path):
    """Get Data Matrix from parquet data in file mode.

    The first column of the loaded data is passed to ``xgb.DMatrix`` as the
    label; all remaining columns are passed as the data.

    :param files_path: File path where parquet formatted training data resides, either directory or file
    :return: xgb.DMatrix
    :raises exc.UserError: if any step of reading or converting the data fails;
        the original exception is attached as the cause
    """
    try:
        table = pq.read_table(files_path)
        data = table.to_pandas()
        # Drop our reference to the Arrow table once the pandas object exists.
        del table

        # isinstance (not an exact type comparison) so that DataFrame
        # subclasses are converted to a NumPy array as well.
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()

        dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        del data
        return dmatrix
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the UserError.
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e)) from e
示例2: dataframe_from_parquet_bytes
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    """
    Decode an in-memory parquet table into a pandas dataframe.

    Parameters
    ----------
    buf: bytes
        Raw bytes of a parquet table, for example the output of
        :func:`gordo.server.utils.dataframe_into_parquet_bytes`

    Returns
    -------
    pandas.DataFrame
    """
    # Expose the bytes through a file-like object for pq.read_table.
    stream = io.BytesIO(buf)
    return pq.read_table(stream).to_pandas()
示例3: read_parquet
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def read_parquet(
    path, columns: Optional[List[str]] = None, continuous: bool = False
) -> pd.DataFrame:
    """
    Read parquet data from ``path`` and convert it with ``pandas_from_arrow``.

    Parameters
    ----------
    path : str or file-like
    columns : list of str, optional
        Forwarded unchanged to ``pq.read_table`` as its ``columns`` argument.
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray

    Returns
    -------
    pd.DataFrame
    """
    arrow_table = pq.read_table(path, columns=columns)
    frame = pandas_from_arrow(arrow_table, continuous=continuous)
    return frame
示例4: _parquet_bytes_to_dict
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    """Turn a parquet-serialized index buffer into a dict and a column type.

    Returns a tuple ``(index_dct, column_type)``. ``index_dct`` maps every
    value of ``column`` to a list built from the matching entry of the
    ``_PARTITION_COLUMN_NAME`` column; ``column_type`` is the Arrow type of
    ``column``, with ``timestamp("us")`` reported as ``timestamp("ns")``.
    """
    # This can be done much more efficient but would take a lot more
    # time to implement so this will be only done on request.
    table = pq.read_table(pa.BufferReader(index_buffer))

    # The schema lookup method depends on the installed pyarrow version.
    if ARROW_LARGER_EQ_0150:
        field = table.schema.field(column)
    else:
        field = table.schema.field_by_name(column)
    column_type = field.type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyways, we do not care about this
    # and load the column type as `pa.timestamp("ns")`
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    # Could eventually be phased out
    df = _fix_pyarrow_07992_table(table).to_pandas()

    keys = df[column].values
    partitions = df[_PARTITION_COLUMN_NAME].values
    index_dct = {key: list(partition) for key, partition in zip(keys, partitions)}
    return index_dct, column_type
示例5: df_from_bytes_parquet_
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Decode a pandas DataFrame from bytes, preferring the parquet format.

    Since pyabc 0.9.14, DataFrames are stored via pyarrow parquet. When
    reading the bytes as parquet raises ``pyarrow.lib.ArrowIOError``, the
    bytes are handed to `df_from_bytes_msgpack_`, the formerly used method,
    which keeps databases in the old format readable. In case errors occur
    there, it may be necessary to use a pandas version prior to 0.25.0.
    """
    try:
        return parquet.read_table(BytesIO(bytes_)).to_pandas()
    except pyarrow.lib.ArrowIOError:
        # Not readable as parquet: fall back to the legacy msgpack decoder.
        return df_from_bytes_msgpack_(bytes_)
示例6: _get_parquet_dmatrix_pipe_mode
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _get_parquet_dmatrix_pipe_mode(pipe_path):
    """Get Data Matrix from parquet data in pipe mode.

    Every record yielded by the pipe reader is read as a parquet table and
    converted to an array; the arrays are stacked vertically. The first column
    of the stacked data is passed to ``xgb.DMatrix`` as the label and the
    remaining columns as the data.

    :param pipe_path: SageMaker pipe path where parquet formatted training data is piped
    :return: xgb.DMatrix or None (None when the pipe yielded no records)
    :raises exc.UserError: if any step of reading or converting the data fails;
        the original exception is attached as the cause
    """
    try:
        f = mlio.SageMakerPipe(pipe_path)
        examples = []

        with f.open_read() as strm:
            reader = mlio.ParquetRecordReader(strm)
            for record in reader:
                table = pq.read_table(as_arrow_file(record))
                array = table.to_pandas()
                # isinstance (not an exact type comparison) so that DataFrame
                # subclasses are converted to a NumPy array as well.
                if isinstance(array, pd.DataFrame):
                    array = array.to_numpy()
                examples.append(array)

        if not examples:
            return None

        data = np.vstack(examples)
        # Drop our reference to the per-record arrays once they are stacked.
        del examples
        return xgb.DMatrix(data[:, 1:], label=data[:, 0])
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the UserError.
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e)) from e
示例7: parquet_read_table
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def parquet_read_table(op, client, scope, **kwargs):
    """Load the parquet file registered under ``op.name`` as a pandas DataFrame.

    The path is looked up in ``client.dictionary`` and converted with ``str``
    before being passed to ``pq.read_table``. ``scope`` and ``kwargs`` are
    accepted but not used here.
    """
    location = str(client.dictionary[op.name])
    return pq.read_table(location).to_pandas()
示例8: test_write
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_write(self):
    """Write ``self.data`` through ``Writer`` and verify what is read back.

    Checks, in order: the values column by column (to 5 decimal places), the
    type id of every schema field against ``self.expected_datatypes``, and the
    unit of the ``timestamp_col`` field.
    """
    # Write out test file
    with UncloseableBytesIO() as write_buffer:
        with Writer(write_buffer, self.table) as writer:
            writer.write_row_group(self.data)
        # Grab the bytes after the writer context has exited.
        file_bytes = write_buffer.getvalue()

    # Read in test file
    read_buffer = BytesIO(file_bytes)
    with pa.PythonFile(read_buffer, mode='r') as infile:
        # Verify data
        parq_table = pq.read_table(infile)
        written_data = list(parq_table.to_pydict().values())

        for expected_column, written_column in zip(self.data, written_data):
            for expected, written in zip(expected_column, written_column):
                # assertAlmostEqual, not the assertAlmostEquals alias: the
                # alias was deprecated and is removed in Python 3.12.
                self.assertAlmostEqual(expected, written, places=5)

        # Verify parquet file schema
        for i, field in enumerate(parq_table.schema):
            self.assertEqual(field.type.id, self.expected_datatypes[i].id)

        # Ensure timestamp column was written with int96; right now
        # there is no way to see except to check that the unit on
        # the timestamp type is 'ns'
        ts_col = parq_table.schema.field_by_name('timestamp_col')
        self.assertEqual(ts_col.type.unit, 'ns')
示例9: read_parquet
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def read_parquet(filename):
    """Read a parquet file and return its data together with stored metadata.

    :param filename: passed unchanged to ``pq.read_table``
    :return: tuple ``(dataframe, meta)``. ``meta`` is the JSON document stored
        under the ``b"hydrofunctions_meta"`` key of the schema metadata, or
        ``None`` when that key is absent or the schema has no metadata at all
    """
    pa_table = pq.read_table(filename)
    dataframe = pa_table.to_pandas()

    # schema.metadata is None when no key/value metadata is set; fall back to
    # an empty dict so the lookup below cannot fail with a TypeError.
    meta_dict = pa_table.schema.metadata or {}
    raw_meta = meta_dict.get(b"hydrofunctions_meta")
    if raw_meta is None:
        return dataframe, None

    # json.loads no longer accepts an `encoding` keyword (removed in Python
    # 3.9); the bytes are decoded (UTF-8 by default) before parsing instead.
    meta = json.loads(raw_meta.decode())
    return dataframe, meta
示例10: test_parquet_roundtrip
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_parquet_roundtrip(array_type):
    """Check that a frame built with ``array_type`` survives a parquet round trip."""
    expected = pd.DataFrame({"col": array_type(["A", "B"])})
    arrow_table = pa.Table.from_pandas(expected)

    # Serialize into an in-memory output stream.
    sink = pa.BufferOutputStream()
    pq.write_table(arrow_table, sink)

    # Read the written bytes back and compare with the starting frame.
    raw = sink.getvalue().to_pybytes()
    restored = pq.read_table(pa.BufferReader(raw))
    pdt.assert_frame_equal(expected, restored.to_pandas())
示例11: test_pq_read
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_read(self):
    """Compare jitted and plain results for summing ``points`` from kde.parquet."""
    def test_impl():
        table = pq.read_table('kde.parquet')
        frame = table.to_pandas()
        points = frame['points']
        return points.sum()

    compiled = self.jit(test_impl)
    # Run the jitted version first, then the plain Python reference.
    actual = compiled()
    expected = test_impl()
    np.testing.assert_almost_equal(actual, expected)

    # Both REP counters must report zero after the jitted run.
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例12: test_pq_str
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str(self):
    """Compare jitted and plain counts of ``'foo'`` in column ``two`` of example.parquet."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        matches = frame.two.values == 'foo'
        return matches.sum()

    compiled = self.jit(test_impl)
    # Run the jitted version first, then the plain Python reference.
    actual = compiled()
    expected = test_impl()
    np.testing.assert_almost_equal(actual, expected)

    # Both REP counters must report zero after the jitted run.
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例13: test_pq_str_with_nan_seq
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_seq(self):
    """Compare the jitted and plain ``== 'foo'`` mask for column ``five`` of example.parquet."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        matches = frame.five.values == 'foo'
        return matches

    compiled = self.jit(test_impl)
    # Run the jitted version first, then the plain Python reference.
    actual = compiled()
    expected = test_impl()
    np.testing.assert_almost_equal(actual, expected)
示例14: test_pq_str_with_nan_par
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_par(self):
    """Compare jitted and plain counts of ``'foo'`` in column ``five`` of example.parquet."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        matches = frame.five.values == 'foo'
        return matches.sum()

    compiled = self.jit(test_impl)
    # Run the jitted version first, then the plain Python reference.
    actual = compiled()
    expected = test_impl()
    np.testing.assert_almost_equal(actual, expected)

    # Both REP counters must report zero after the jitted run.
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例15: test_pq_str_with_nan_par_multigroup
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_par_multigroup(self):
    """Compare jitted and plain counts of ``'foo'`` in column ``five`` of example2.parquet."""
    def test_impl():
        frame = pq.read_table('example2.parquet').to_pandas()
        matches = frame.five.values == 'foo'
        return matches.sum()

    compiled = self.jit(test_impl)
    # Run the jitted version first, then the plain Python reference.
    actual = compiled()
    expected = test_impl()
    np.testing.assert_almost_equal(actual, expected)

    # Both REP counters must report zero after the jitted run.
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)