当前位置: 首页>>代码示例>>Python>>正文


Python pyarrow.BufferReader方法代码示例

本文整理汇总了Python中pyarrow.BufferReader方法的典型用法代码示例。如果您正苦于以下问题:Python pyarrow.BufferReader方法的具体用法?Python pyarrow.BufferReader怎么用?Python pyarrow.BufferReader使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyarrow的用法示例。


在下文中一共展示了pyarrow.BufferReader方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _parquet_bytes_to_dict

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import BufferReader [as 别名]
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This can be done much more efficient but would take a lot more
    # time to implement so this will be only done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only, so arrow
    # parses the type to `pa.timestamp("us")`. Since the
    # values are normalized to `numpy.datetime64[ns]` anyways, we do not care about this
    # and load the column type as `pa.timestamp("ns")`
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type 
开发者ID:JDASoftwareGroup,项目名称:kartothek,代码行数:25,代码来源:index.py

示例2: write_mutable_tensor

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import BufferReader [as 别名]
def write_mutable_tensor(self, session_id, name, payload_type, body):
        import pyarrow

        from ..serialize import dataserializer
        from ..tensor.core import Indexes
        session_uid = SessionActor.gen_uid(session_id)
        session_ref = self.get_actor_ref(session_uid)

        index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
        index_json = json.loads(body[8:8+index_json_size].decode('ascii'))
        index = Indexes.from_json(index_json).indexes
        if payload_type is None:
            value = dataserializer.loads(body[8+index_json_size:])
        elif payload_type == 'tensor':
            tensor_chunk_offset = 8 + index_json_size
            with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
                value = pyarrow.read_tensor(reader).to_numpy()
        elif payload_type == 'record_batch':
            schema_size = np.frombuffer(body[8+index_json_size:8+index_json_size+8], dtype=np.int64).item()
            schema_offset = 8 + index_json_size + 8
            with pyarrow.BufferReader(body[schema_offset:schema_offset+schema_size]) as reader:
                schema = pyarrow.read_schema(reader)
            record_batch_offset = schema_offset + schema_size
            with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
                record_batch = pyarrow.read_record_batch(reader, schema)
                value = record_batch.to_pandas().to_records(index=False)
        else:
            raise ValueError('Not supported payload type: %s' % payload_type)
        return session_ref.write_mutable_tensor(name, index, value) 
开发者ID:mars-project,项目名称:mars,代码行数:31,代码来源:server.py

示例3: _deserialize

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import BufferReader [as 别名]
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
        """
        Data are returned as feather-packed pandas DataFrames.
        Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
        """
        import pyarrow

        df = pd.read_feather(pyarrow.BufferReader(data))
        for col in msgpacked_cols:
            df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))

        if "index" in df.columns:
            df.set_index("index", inplace=True)  # pandas.to_feather does not support indexes,
            # so we have to send indexless frames over the wire, and set the index here.
        return df 
开发者ID:MolSSI,项目名称:QCPortal,代码行数:17,代码来源:dataset_view.py

示例4: test_parquet_roundtrip

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import BufferReader [as 别名]
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
开发者ID:xhochy,项目名称:fletcher,代码行数:10,代码来源:test_pyarrow_roundtrip.py

示例5: _bytes2schema

# 需要导入模块: import pyarrow [as 别名]
# 或者: from pyarrow import BufferReader [as 别名]
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]

        # schema data recovered from parquet always contains timestamp data in us-granularity, but pandas will use
        # ns-granularity, so we re-align the two different worlds here
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))

        fields.append(f)
    return pa.schema(fields, schema.metadata) 
开发者ID:JDASoftwareGroup,项目名称:kartothek,代码行数:16,代码来源:common_metadata.py


注:本文中的pyarrow.BufferReader方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。