This article compiles typical usage examples of the pyarrow.BufferReader method in Python. If you are wondering what pyarrow.BufferReader does, how to use it, or what it looks like in real code, the curated examples below may help. They also illustrate how the surrounding pyarrow module is used.
The following presents 5 code examples of pyarrow.BufferReader, ordered by popularity by default.
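Before the examples, here is a minimal, self-contained sketch (not taken from the examples below) of the basic pattern they all build on: write Arrow data to an in-memory buffer, then read it back through pyarrow.BufferReader. The table contents are made up for illustration.

import pyarrow as pa
import pyarrow.parquet as pq

# Build a small table and write it to an in-memory Parquet buffer.
table = pa.table({"col": [1, 2, 3]})
sink = pa.BufferOutputStream()
pq.write_table(table, sink)
buf = sink.getvalue()

# BufferReader wraps the buffer as a readable, seekable file-like object,
# so any pyarrow reader that accepts a file can consume it.
reader = pa.BufferReader(buf)
roundtripped = pq.read_table(reader)
assert roundtripped.equals(table)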
Example 1: _parquet_bytes_to_dict
# Required imports: import pyarrow [as pa] and import pyarrow.parquet [as pq]
# Alternatively: from pyarrow import BufferReader
import pyarrow as pa
import pyarrow.parquet as pq


def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    # ARROW_LARGER_EQ_0150, _fix_pyarrow_07992_table and _PARTITION_COLUMN_NAME
    # are module-level helpers/constants of the surrounding package.
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently, but that would take considerably
    # more time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type
    # `datetime.datetime` objects only have microsecond precision, so Arrow
    # parses the type as `pa.timestamp("us")`. Since the values are normalized
    # to `numpy.datetime64[ns]` anyway, we do not care about this and load the
    # column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")
    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out
    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
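As a hedged companion sketch (not part of the original code), the Parquet index bytes consumed by _parquet_bytes_to_dict could be produced roughly as follows. The helper name, the concrete value of _PARTITION_COLUMN_NAME, and the column layout are assumptions inferred from how the function reads the table back.

import pyarrow as pa
import pyarrow.parquet as pq

_PARTITION_COLUMN_NAME = "partition"  # assumption: a module constant in the real package


def _dict_to_parquet_bytes(column, index_dct):
    # One row per index value; the list of partition labels becomes a list column.
    table = pa.table({
        column: list(index_dct.keys()),
        _PARTITION_COLUMN_NAME: list(index_dct.values()),
    })
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    return buf.getvalue().to_pybytes()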
Example 2: write_mutable_tensor
# Required import: import pyarrow [as alias]
# Alternatively: from pyarrow import BufferReader
def write_mutable_tensor(self, session_id, name, payload_type, body):
    # `json`, `np` (numpy) and `SessionActor` are module-level imports of the
    # surrounding package; this is a method of a request-handler class.
    import pyarrow
    from ..serialize import dataserializer
    from ..tensor.core import Indexes

    session_uid = SessionActor.gen_uid(session_id)
    session_ref = self.get_actor_ref(session_uid)

    # Body layout: 8-byte int64 size of the index JSON, the JSON itself, then the payload.
    index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes
    if payload_type is None:
        value = dataserializer.loads(body[8 + index_json_size:])
    elif payload_type == 'tensor':
        tensor_chunk_offset = 8 + index_json_size
        with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        # For record batches the payload is: 8-byte schema size, the serialized
        # schema, then the record batch itself.
        schema_size = np.frombuffer(body[8 + index_json_size:8 + index_json_size + 8],
                                    dtype=np.int64).item()
        schema_offset = 8 + index_json_size + 8
        with pyarrow.BufferReader(body[schema_offset:schema_offset + schema_size]) as reader:
            schema = pyarrow.read_schema(reader)
        record_batch_offset = schema_offset + schema_size
        with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
            record_batch = pyarrow.read_record_batch(reader, schema)
        value = record_batch.to_pandas().to_records(index=False)
    else:
        raise ValueError('Unsupported payload type: %s' % payload_type)
    return session_ref.write_mutable_tensor(name, index, value)
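For illustration only, here is a hedged sketch of how a client might assemble the 'tensor' payload body parsed above: an 8-byte index-JSON size, the JSON itself, then an Arrow-serialized tensor. The helper name and the index structure are assumptions; note that newer pyarrow exposes tensor IPC under pyarrow.ipc, while the handler above uses the older top-level read_tensor.

import json

import numpy as np
import pyarrow
import pyarrow.ipc


def build_tensor_payload(index_json, array):
    # Hypothetical helper: mirrors the byte layout expected by the handler above.
    index_bytes = json.dumps(index_json).encode('ascii')
    sink = pyarrow.BufferOutputStream()
    pyarrow.ipc.write_tensor(pyarrow.Tensor.from_numpy(array), sink)
    tensor_bytes = sink.getvalue().to_pybytes()
    return np.int64(len(index_bytes)).tobytes() + index_bytes + tensor_bytes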
Example 3: _deserialize
# Required import: import pyarrow [as alias]
# Alternatively: from pyarrow import BufferReader
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
    """
    Data are returned as feather-packed pandas DataFrames.
    Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
    """
    # `pd` (pandas), `List` (typing) and `deserialize` are module-level imports
    # of the surrounding package.
    import pyarrow

    df = pd.read_feather(pyarrow.BufferReader(data))
    for col in msgpacked_cols:
        df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))
    if "index" in df.columns:
        # pandas.DataFrame.to_feather does not support indexes, so indexless
        # frames are sent over the wire and the index is restored here.
        df.set_index("index", inplace=True)
    return df
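A hedged sketch of the matching serialization side, which the original snippet does not show: pack the DataFrame as Feather bytes and msgpack the columns pyarrow cannot represent natively. The serialize counterpart, the helper name, and the unnamed-index assumption are all assumptions for illustration.

import pandas as pd
import pyarrow
import pyarrow.feather


def _serialize(df: pd.DataFrame, msgpacked_cols):
    # Feather cannot store an index, so materialize it as an "index" column
    # (assuming an unnamed index) for the reader above to restore.
    df = df.reset_index()
    for col in msgpacked_cols:
        # `serialize` is assumed to be the counterpart of the `deserialize`
        # helper used in _deserialize above.
        df[col] = df[col].apply(lambda element: serialize(element, "msgpack-ext"))
    sink = pyarrow.BufferOutputStream()
    pyarrow.feather.write_feather(df, sink)
    return sink.getvalue().to_pybytes()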
Example 4: test_parquet_roundtrip
# Required imports: import pyarrow [as pa] and import pyarrow.parquet [as pq]
# Alternatively: from pyarrow import BufferReader
def test_parquet_roundtrip(array_type):
    # `array_type` is a test parameter producing an array constructor;
    # `pd` is pandas and `pdt` is the pandas testing module [as aliases].
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas())
Example 5: _bytes2schema
# Required imports: import pyarrow [as pa] and import pyarrow.parquet [as pq]
# Alternatively: from pyarrow import BufferReader
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]
        # Schema data recovered from Parquet always carries timestamps with
        # microsecond granularity, but pandas uses nanosecond granularity, so
        # the two different worlds are re-aligned here.
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))
        fields.append(f)
    return pa.schema(fields, schema.metadata)
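As a hedged counterpart sketch (an assumption, not shown in the original source), the schema bytes consumed by _bytes2schema could be produced by writing a metadata-only Parquet file for the schema into an in-memory buffer:

import pyarrow as pa
import pyarrow.parquet as pq


def _schema2bytes(schema: pa.Schema) -> bytes:
    # Write only the Parquet footer metadata for the schema; pq.read_schema
    # in _bytes2schema can then recover the Arrow schema from these bytes.
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf)
    return buf.getvalue().to_pybytes()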