本文整理汇总了Python中pyarrow.parquet.read_table方法的典型用法代码示例。如果您正苦于以下问题:Python parquet.read_table方法的具体用法?Python parquet.read_table怎么用?Python parquet.read_table使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块pyarrow.parquet的用法示例。
在下文中一共展示了parquet.read_table方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _get_parquet_dmatrix_file_mode
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _get_parquet_dmatrix_file_mode(files_path):
    """Build an xgb.DMatrix from parquet-formatted training data in file mode.

    :param files_path: File path where parquet formatted training data resides,
        either directory or file
    :return: xgb.DMatrix
    :raises exc.UserError: wrapping any failure while reading or converting
    """
    try:
        payload = pq.read_table(files_path).to_pandas()
        # pyarrow.Table.to_pandas may produce NumPy array or pandas DataFrame
        if type(payload) is pd.DataFrame:
            payload = payload.to_numpy()
        # Column 0 holds the label; remaining columns are features.
        labels = payload[:, 0]
        features = payload[:, 1:]
        return xgb.DMatrix(features, label=labels)
    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))
示例2: dataframe_from_parquet_bytes
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def dataframe_from_parquet_bytes(buf: bytes) -> pd.DataFrame:
    """
    Deserialize bytes holding a parquet table into a pandas dataframe.

    Parameters
    ----------
    buf: bytes
        Bytes representing a parquet table. Can be the direct result from
        `func`::gordo.server.utils.dataframe_into_parquet_bytes

    Returns
    -------
    pandas.DataFrame
    """
    stream = io.BytesIO(buf)
    return pq.read_table(stream).to_pandas()
示例3: read_parquet
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def read_parquet(
    path, columns: Optional[List[str]] = None, continuous: bool = False
) -> pd.DataFrame:
    """
    Load a parquet object from the file path, returning a DataFrame backed by
    fletcher columns.

    Parameters
    ----------
    path : str or file-like
    columns : list of str, optional
        Subset of columns to read; all columns when omitted.
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray

    Returns
    -------
    pd.DataFrame
    """
    arrow_table = pq.read_table(path, columns=columns)
    return pandas_from_arrow(arrow_table, continuous=continuous)
示例4: _parquet_bytes_to_dict
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    """Deserialize a parquet-encoded index buffer.

    Returns a tuple ``(index_dct, column_type)`` where ``index_dct`` maps each
    value of ``column`` to the list of partition labels it refers to.
    """
    # This could be done much more efficiently, but would take a lot more
    # time to implement, so it will only be optimized on request.
    table = pq.read_table(pa.BufferReader(index_buffer))

    schema = table.schema
    if ARROW_LARGER_EQ_0150:
        column_type = schema.field(column).type
    else:
        column_type = schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyways, we do not care about this
    # and load the column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out
    keys = df[column].values
    partition_lists = (list(x) for x in df[_PARTITION_COLUMN_NAME].values)
    return dict(zip(keys, partition_lists)), column_type
示例5: df_from_bytes_parquet_
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to DataFrame fails,
    `df_from_bytes_msgpack_` (the formerly used method) is tried instead,
    which is in particular useful for databases that still employ the old
    format. In case errors occur here, it may be necessary to use a pandas
    version prior to 0.25.0.
    """
    buffer = BytesIO(bytes_)
    try:
        return parquet.read_table(buffer).to_pandas()
    except pyarrow.lib.ArrowIOError:
        # Not parquet data — fall back to the legacy msgpack format.
        return df_from_bytes_msgpack_(bytes_)
示例6: _get_parquet_dmatrix_pipe_mode
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def _get_parquet_dmatrix_pipe_mode(pipe_path):
    """Build an xgb.DMatrix from parquet-formatted training data in pipe mode.

    :param pipe_path: SageMaker pipe path where parquet formatted training data is piped
    :return: xgb.DMatrix, or None when the pipe yields no records
    :raises exc.UserError: wrapping any failure while reading or converting
    """
    try:
        pipe = mlio.SageMakerPipe(pipe_path)
        chunks = []
        with pipe.open_read() as strm:
            for record in mlio.ParquetRecordReader(strm):
                piece = pq.read_table(as_arrow_file(record)).to_pandas()
                # to_pandas may produce a NumPy array or pandas DataFrame
                if type(piece) is pd.DataFrame:
                    piece = piece.to_numpy()
                chunks.append(piece)

        if not chunks:
            return None
        data = np.vstack(chunks)
        del chunks
        # Column 0 holds the label; remaining columns are features.
        return xgb.DMatrix(data[:, 1:], label=data[:, 0])
    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))
示例7: parquet_read_table
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def parquet_read_table(op, client, scope, **kwargs):
    """Execute a parquet table-read op: resolve its path via the client's
    dictionary and return the table as a pandas DataFrame."""
    source = client.dictionary[op.name]
    return pq.read_table(str(source)).to_pandas()
示例8: test_write
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_write(self):
    """Round-trip test: write self.data through Writer, read it back with
    pyarrow, and verify values, schema, and int96 timestamp handling."""
    # Write out test file
    with UncloseableBytesIO() as write_buffer:
        with Writer(write_buffer, self.table) as writer:
            writer.write_row_group(self.data)
        file_bytes = write_buffer.getvalue()

    # Read in test file
    read_buffer = BytesIO(file_bytes)
    with pa.PythonFile(read_buffer, mode='r') as infile:
        # Verify data (floats compared approximately, 5 places)
        parq_table = pq.read_table(infile)
        written_data = list(parq_table.to_pydict().values())

        tuples_by_data_type = zip(self.data, written_data)
        for i in tuples_by_data_type:
            tuples_by_order = zip(i[0], i[1])
            for j in tuples_by_order:
                # assertAlmostEquals was a deprecated alias, removed in
                # Python 3.12 — use the canonical assertAlmostEqual.
                self.assertAlmostEqual(j[0], j[1], places=5)

        # Verify parquet file schema
        for i, field in enumerate(parq_table.schema):
            self.assertEqual(field.type.id, self.expected_datatypes[i].id)

        # Ensure timestamp column was written with int96; right now
        # there is no way to see except to check that the unit on
        # the timestamp type is 'ns'
        # NOTE(review): schema.field_by_name is deprecated in newer pyarrow
        # in favor of schema.field — confirm the minimum supported pyarrow
        # version before switching.
        ts_col = parq_table.schema.field_by_name('timestamp_col')
        self.assertEqual(ts_col.type.unit, 'ns')
示例9: read_parquet
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def read_parquet(filename):
    """Read a parquet file and return ``(dataframe, meta)``.

    :param filename: Path to a parquet file.
    :return: Tuple of the pandas DataFrame and the decoded
        ``hydrofunctions_meta`` dict, or None when that metadata key is absent.
    """
    pa_table = pq.read_table(filename)
    dataframe = pa_table.to_pandas()
    meta_dict = pa_table.schema.metadata
    # schema.metadata is None when the file carries no metadata at all;
    # guard against it before the membership test.
    if meta_dict and b"hydrofunctions_meta" in meta_dict:
        meta_string = meta_dict[b"hydrofunctions_meta"].decode()
        # json.loads() no longer accepts an `encoding` argument (deprecated
        # in Python 3.1, removed in 3.9); the input is already a str.
        meta = json.loads(meta_string)
    else:
        meta = None
    return dataframe, meta
示例10: test_parquet_roundtrip
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_parquet_roundtrip(array_type):
    """A dataframe built on `array_type` must survive a parquet round trip
    unchanged."""
    original = pd.DataFrame({"col": array_type(["A", "B"])})
    sink = pa.BufferOutputStream()
    pq.write_table(pa.Table.from_pandas(original), sink)
    restored = pq.read_table(pa.BufferReader(sink.getvalue().to_pybytes()))
    pdt.assert_frame_equal(original, restored.to_pandas())
示例11: test_pq_read
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_read(self):
    """Summing a parquet column under jit must match pure Python, with no
    replicated arrays or parfors."""
    def test_impl():
        table = pq.read_table('kde.parquet')
        frame = table.to_pandas()
        points = frame['points']
        return points.sum()

    jitted = self.jit(test_impl)
    np.testing.assert_almost_equal(jitted(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例12: test_pq_str
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str(self):
    """Counting string-equality matches on a parquet column under jit must
    match pure Python, with no replicated arrays or parfors."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        matches = frame.two.values == 'foo'
        return matches.sum()

    jitted = self.jit(test_impl)
    np.testing.assert_almost_equal(jitted(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例13: test_pq_str_with_nan_seq
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_seq(self):
    """String comparison on a NaN-containing parquet column (sequential):
    the jitted boolean mask must match pure Python."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        return frame.five.values == 'foo'

    jitted = self.jit(test_impl)
    np.testing.assert_almost_equal(jitted(), test_impl())
示例14: test_pq_str_with_nan_par
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_par(self):
    """String comparison on a NaN-containing parquet column (parallel sum):
    jitted result must match pure Python, with no replicated arrays/parfors."""
    def test_impl():
        frame = pq.read_table('example.parquet').to_pandas()
        matches = frame.five.values == 'foo'
        return matches.sum()

    jitted = self.jit(test_impl)
    np.testing.assert_almost_equal(jitted(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
示例15: test_pq_str_with_nan_par_multigroup
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import read_table [as 别名]
def test_pq_str_with_nan_par_multigroup(self):
    """Same as the parallel NaN-string test, but against a multi-row-group
    parquet file (example2.parquet)."""
    def test_impl():
        frame = pq.read_table('example2.parquet').to_pandas()
        matches = frame.five.values == 'foo'
        return matches.sum()

    jitted = self.jit(test_impl)
    np.testing.assert_almost_equal(jitted(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)