当前位置: 首页>>代码示例>>Python>>正文


Python ParquetFile.to_pandas方法代码示例

本文整理汇总了Python中fastparquet.ParquetFile.to_pandas方法的典型用法代码示例。如果您正苦于以下问题:Python ParquetFile.to_pandas方法的具体用法?Python ParquetFile.to_pandas怎么用?Python ParquetFile.to_pandas使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类fastparquet.ParquetFile的用法示例。


在下文中一共展示了ParquetFile.to_pandas方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_groups_roundtrip

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_groups_roundtrip(tempdir):
    """Write a hive dataset partitioned on ``a`` and ``c`` and read it back.

    The dataset is written twice into ``tempdir``: once with default row
    groups and once with ``row_group_offsets=[0, 50]``. After each write,
    every row that is read back must carry a ``b`` value that the source
    frame holds for the same ``(a, c)`` combination.
    """
    frame = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )

    def check_rows(loaded):
        # A loaded row is valid when its ``b`` occurs among the source rows
        # sharing the same partition values.
        for _, row in loaded.iterrows():
            same_partition = frame[(frame.a == row.a) & (frame.c == row.c)]
            assert row.b in list(same_partition.b)

    writer.write(tempdir, frame, partition_on=["a", "c"], file_scheme="hive")

    reader = ParquetFile(tempdir)
    # The partition columns are not expected among the stored columns.
    assert reader.columns == ["b"]
    check_rows(reader.to_pandas())

    writer.write(tempdir, frame, row_group_offsets=[0, 50], partition_on=["a", "c"], file_scheme="hive")

    reader = ParquetFile(tempdir)
    # The expected row count is the number of source rows with a non-null ``a``.
    assert reader.count == sum(~frame.a.isnull())
    assert len(reader.row_groups) == 8
    check_rows(reader.to_pandas())
开发者ID:martindurant,项目名称:parquet-python,代码行数:28,代码来源:test_output.py

示例2: test_text_convert

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_text_convert(tempdir):
    """Check the schema types produced with and without ``fixed_text``.

    Three writes of the same frame are verified: both text columns fixed,
    no fixed columns, and only column ``a`` fixed. After each write the
    ``max`` statistic of ``a`` and the round-tripped frame are compared
    against the source data.
    """
    frame = pd.DataFrame({"a": ["a"] * 100, "b": [b"a"] * 100})
    path = os.path.join(tempdir, "tmp.parq")
    fixed_len = parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    byte_array = parquet_thrift.Type.BYTE_ARRAY

    def check_stats_and_roundtrip(pf):
        # Checks shared by every variant: statistics and frame equality.
        assert pf.statistics["max"]["a"] == ["a"]
        tm.assert_frame_equal(frame, pf.to_pandas(), check_categorical=False)

    # Both columns written with explicit fixed lengths.
    write(path, frame, fixed_text={"a": 1, "b": 2})
    pf = ParquetFile(path)
    col_a, col_b = pf.schema[1], pf.schema[2]
    assert col_a.type == fixed_len
    assert col_a.type_length == 1
    assert col_b.type == fixed_len
    assert col_b.type_length == 2
    check_stats_and_roundtrip(pf)

    # No fixed lengths given: both columns are plain byte arrays.
    write(path, frame)
    pf = ParquetFile(path)
    assert pf.schema[1].type == byte_array
    assert pf.schema[2].type == byte_array
    check_stats_and_roundtrip(pf)

    # Only ``a`` is fixed; ``b`` stays a plain byte array.
    write(path, frame, fixed_text={"a": 1})
    pf = ParquetFile(path)
    assert pf.schema[1].type == fixed_len
    assert pf.schema[2].type == byte_array
    check_stats_and_roundtrip(pf)
开发者ID:martindurant,项目名称:parquet-python,代码行数:31,代码来源:test_output.py

示例3: time_text

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def time_text():
    """Time writing and reading of single text columns.

    For a str column and a bytes column, and for ``fixed`` set to ``None``
    and ``6``, one write and one read are timed with ``measure``.

    Returns the ``result`` dict that is passed to every ``measure`` call.
    """
    with tmpdir() as tempdir:
        result = {}
        path = join_path(tempdir, 'temp.parq')
        size = 10 ** 6
        source = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=size),
            'b': np.random.choice([b'hi', b'you', b'people'], size=size)})

        for col in source.columns:
            # The single-column frame and its encoding label depend only on
            # the column, not on the ``fixed`` setting.
            single = source[[col]]
            kind = "bytes" if isinstance(single.iloc[0, 0], bytes) else 'utf8'

            for fixed in (None, 6):
                # Untimed write before the measured one.
                write(path, single)
                with measure('%s: write, fixed: %s' % (kind, fixed), result):
                    write(path, single, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=kind)

                reader = ParquetFile(path)
                reader.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (kind, fixed), result):
                    reader.to_pandas()
        return result
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:29,代码来源:columns.py

示例4: test_auto_null

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_auto_null(tempdir):
    """Check schema repetition types for the different ``has_nulls`` settings.

    * ``has_nulls=False`` is expected to raise ``TypeError`` for this frame.
    * ``has_nulls=True``: schema entry 1 is REQUIRED, entries from index 2
      onwards are OPTIONAL.
    * ``has_nulls=None``: schema entries 1 and 2 are REQUIRED and entry 4 is
      OPTIONAL.

    In both successful cases the frame must round-trip unchanged.
    """
    frame = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    frame["e"] = frame["d"].astype("category")
    path = os.path.join(str(tempdir), "test.parq")
    required = parquet_thrift.FieldRepetitionType.REQUIRED
    optional = parquet_thrift.FieldRepetitionType.OPTIONAL

    with pytest.raises(TypeError):
        # TODO: this should be a nicer error?
        write(path, frame, has_nulls=False)

    write(path, frame, has_nulls=True)
    reader = ParquetFile(path)
    assert all(col.repetition_type == optional for col in reader.schema[2:])
    assert reader.schema[1].repetition_type == required
    loaded = reader.to_pandas(categories=["e"])
    tm.assert_frame_equal(frame, loaded, check_categorical=False)

    write(path, frame, has_nulls=None)
    reader = ParquetFile(path)
    assert all(col.repetition_type == required for col in reader.schema[1:3])
    assert reader.schema[4].repetition_type == optional
    loaded = reader.to_pandas(categories=["e"])
    tm.assert_frame_equal(frame, loaded, check_categorical=False)
开发者ID:martindurant,项目名称:parquet-python,代码行数:34,代码来源:test_output.py

示例5: test_bad_file_paths

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_bad_file_paths(tempdir):
    """Read lists of files whose paths do not form a consistent partitioning.

    The first pair is expected to be detected with ``file_scheme == 'other'``
    and to yield no ``dir0`` column. The second pair places same-named files
    in two sibling directories. In both cases the data of both files must be
    read back in order.
    """
    frame = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})

    def write_in_new_dir(dirname, basename):
        # Create ``tempdir/dirname`` and write the frame into ``basename`` there.
        directory = os.path.join(tempdir, dirname)
        target = os.path.join(directory, basename)
        os.makedirs(directory)
        write(target, frame)
        return target

    first = write_in_new_dir('x=0', 'part.=.parquet')
    second = write_in_new_dir('y/z', 'part.0.parquet')

    reader = ParquetFile([first, second])
    assert reader.file_scheme == 'other'
    loaded = reader.to_pandas()
    assert loaded.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in loaded

    first = write_in_new_dir('data', 'out.parq')
    second = write_in_new_dir('data2', 'out.parq')
    reader = ParquetFile([first, second])
    loaded = reader.to_pandas()
    assert loaded.a.tolist() == ['x', 'y', 'z'] * 2
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:30,代码来源:test_api.py

示例6: test_index_not_in_columns

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_index_not_in_columns(tempdir):
    """Check index handling when only a non-index column is requested.

    Selecting ``columns=['b']`` is expected to still return the stored
    index ``a``; passing ``index=False`` is expected to give a 0..2 index.
    """
    frame = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    frame = frame.set_index('a')
    write(tempdir, frame, file_scheme='hive')

    reader = ParquetFile(tempdir)
    with_index = reader.to_pandas(columns=['b'])
    assert with_index.index.tolist() == ['x', 'y', 'z']

    without_index = reader.to_pandas(columns=['b'], index=False)
    assert without_index.index.tolist() == [0, 1, 2]
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:10,代码来源:test_api.py

示例7: test_request_nonexistent_column

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_request_nonexistent_column(tempdir):
    """Requesting a column that was never written must raise ``ValueError``."""
    path = os.path.join(tempdir, 'foo.parquet')
    write(path, pd.DataFrame({'x': [1, 2, 3]}))

    reader = ParquetFile(path)
    with pytest.raises(ValueError):
        # Only column 'x' was written.
        reader.to_pandas(columns=['y'])
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:11,代码来源:test_api.py

示例8: test_in_filter_numbers

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_in_filter_numbers(tempdir):
    """Apply an ``in`` filter on a numeric partition column.

    The filter values are given once as strings and once as integers; both
    spellings are expected to select the same partitions.
    """
    frame = pd.DataFrame(data={
        'symbols': ['a', 'a', 'b', 'c', 'c', 'd'],
        'values': [1, 2, 3, 4, 5, 6],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['values'])
    reader = ParquetFile(tempdir)

    for wanted in (['1', '4'], [1, 4]):
        loaded = reader.to_pandas(filters=[('values', 'in', wanted)])
        assert set(loaded.symbols) == {'a', 'c'}
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:12,代码来源:test_api.py

示例9: test_filter_without_paths

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_filter_without_paths(tempdir):
    """Apply filters to a single file that has no partition paths.

    With ``x > 3`` the test expects the complete frame back; with ``x > 30``
    it expects an empty result.
    NOTE(review): presumably the filter only excludes whole row groups rather
    than individual rows, which is why ``x > 3`` returns everything — confirm
    against the fastparquet documentation.
    """
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    out = pf.to_pandas(filters=[['x', '>', 3]])
    # pd.util.testing is deprecated and was removed in pandas 2.0;
    # pd.testing is the supported public location of assert_frame_equal.
    pd.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:15,代码来源:test_api.py

示例10: test_multi_cat_fail

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_multi_cat_fail(tempdir):
    """Reading a multi-row-group file with a two-level index must fail.

    The frame is indexed by ``['a', 'b']`` and written with
    ``row_group_offsets=25``; ``to_pandas`` is expected to raise
    ``RuntimeError``.
    """
    N = 200
    path = os.path.join(tempdir, 'test.parq')
    columns = {
        'a': np.random.randint(10, size=N),
        'b': np.random.choice(['a', 'b', 'c'], size=N),
        'c': np.arange(N),
    }
    frame = pd.DataFrame(columns).set_index(['a', 'b'])
    write(path, frame, row_group_offsets=25)

    reader = ParquetFile(path)
    with pytest.raises(RuntimeError):
        reader.to_pandas()
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:15,代码来源:test_api.py

示例11: test_filelike

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_filelike(tempdir):
    """Open a parquet file from file-like objects instead of a path.

    The written file is read back twice: through an open binary file handle
    and through an in-memory ``io.BytesIO`` copy of its bytes. Both reads
    must reproduce the source frame.
    """
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    with open(fn, 'rb') as f:
        pf = ParquetFile(f, open_with=open)
        d2 = pf.to_pandas()
        # pd.util.testing is deprecated and was removed in pandas 2.0;
        # pd.testing is the supported public location of assert_frame_equal.
        pd.testing.assert_frame_equal(d2, df)

    # Read the bytes inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fn, 'rb') as f:
        b = io.BytesIO(f.read())
    pf = ParquetFile(b, open_with=open)
    d2 = pf.to_pandas()
    pd.testing.assert_frame_equal(d2, df)
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:17,代码来源:test_api.py

示例12: test_multi_list

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_multi_list(tempdir):
    """Build a ``ParquetFile`` from a list of hive dataset directories.

    The same frame is written into three directories, one of them nested
    one level deeper. Reading the first two, and then all three, must
    return the data of each directory in order.
    """
    frame = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dirs = [
        os.path.join(tempdir, 'x'),
        os.path.join(tempdir, 'y'),
        os.path.join(tempdir, 'z', 'deep'),
    ]
    for directory in dirs:
        write(directory, frame, file_scheme='hive')

    # The two-directory version may have an extra column!
    for count in (2, 3):
        reader = ParquetFile(dirs[:count])
        loaded = reader.to_pandas()
        assert loaded.a.tolist() == ['x', 'y', 'z'] * count
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:17,代码来源:test_api.py

示例13: test_single_upper_directory

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_single_upper_directory(tempdir):
    """Read a dataset partitioned on a column that has a single value.

    The dataset is read once through its directory, then again from an
    explicit file list with ``root=tempdir`` after both metadata files have
    been deleted. The partition column ``y`` must be ``'aa'`` in every row
    both times.
    """
    import glob

    frame = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, frame, file_scheme='hive', partition_on='y')
    loaded = ParquetFile(tempdir).to_pandas()
    assert (loaded.y == 'aa').all()

    # Remove the metadata files so the dataset has to be built from the
    # data files alone.
    for meta_name in ('_metadata', '_common_metadata'):
        os.unlink(os.path.join(tempdir, meta_name))

    data_files = sorted(glob.glob(os.path.join(tempdir, '*/*')))
    reader = ParquetFile(data_files, root=tempdir)
    assert reader.fn == join_path(os.path.join(tempdir, '_metadata'))
    loaded = reader.to_pandas()
    assert (loaded.y == 'aa').all()
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:17,代码来源:test_api.py

示例14: test_roundtrip_complex

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_roundtrip_complex(tempdir, scheme):
    """Round-trip a frame with less common column types.

    The frame holds small unsigned and signed integers, float16, dicts,
    datetimes (one of them set to None), timedeltas and booleans. Every
    column read back must equal the source wherever the source is non-null.
    """
    import datetime

    row_count = 1000
    expected = pd.DataFrame(
        {
            "ui32": np.arange(row_count, dtype=np.uint32),
            "i16": np.arange(row_count, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(row_count, dtype=np.float16),
            "dicts": [{"oi": "you"}] * row_count,
            "t": [datetime.datetime.now()] * row_count,
            "td": [datetime.timedelta(seconds=1)] * row_count,
            "bool": np.random.choice([True, False], size=row_count),
        }
    )
    # Put a single missing value into the datetime column.
    expected.loc[100, "t"] = None

    path = os.path.join(tempdir, "test.parquet")
    write(path, expected, file_scheme=scheme)

    reader = ParquetFile(path)
    loaded = reader.to_pandas()

    for name in reader.columns:
        # Compare only the positions where the source value is present.
        present = ~expected[name].isnull()
        assert (loaded[name] == expected[name])[present].all()
开发者ID:martindurant,项目名称:parquet-python,代码行数:27,代码来源:test_output.py

示例15: test_roundtrip

# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_roundtrip(tempdir, scheme, row_groups, comp):
    """Round-trip numeric, bytes, text and categorical columns.

    The frame is written with the given file scheme, row-group offsets and
    compression. Every column read back must equal the source column.
    """
    row_count = 1000
    expected = pd.DataFrame(
        {
            "i32": np.arange(row_count, dtype=np.int32),
            "i64": np.arange(row_count, dtype=np.int64),
            "f": np.arange(row_count, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"], size=row_count).astype("O"),
        }
    )
    # Fixed-width bytes columns.
    expected["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    expected["aa"] = expected["a"].map(lambda value: 2 * value).astype("S2")
    # Text and categorical columns derived from ``bhello``.
    expected["hello"] = expected.bhello.str.decode("utf8")
    expected["bcat"] = expected.bhello.astype("category")
    expected["cat"] = expected.hello.astype("category")

    path = os.path.join(tempdir, "test.parquet")
    write(path, expected, file_scheme=scheme, row_group_offsets=row_groups, compression=comp)

    reader = ParquetFile(path)
    loaded = reader.to_pandas()

    # The source frame's categorical column must still be categorical.
    assert expected.cat.dtype == "category"

    for name in reader.columns:
        assert (loaded[name] == expected[name]).all()
开发者ID:martindurant,项目名称:parquet-python,代码行数:27,代码来源:test_output.py


注:本文中的fastparquet.ParquetFile.to_pandas方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。