本文整理汇总了Python中fastparquet.ParquetFile.to_pandas方法的典型用法代码示例。如果您正苦于以下问题：Python ParquetFile.to_pandas方法的具体用法？Python ParquetFile.to_pandas怎么用？Python ParquetFile.to_pandas使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类fastparquet.ParquetFile的用法示例。
在下文中一共展示了ParquetFile.to_pandas方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_groups_roundtrip
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_groups_roundtrip(tempdir):
    """Write a hive dataset partitioned on ``a`` and ``c`` and read it back.

    The frame is written into ``tempdir`` twice: first with default row
    groups, then with ``row_group_offsets=[0, 50]``.  After each write,
    every row read back must carry a ``b`` value that the source frame
    holds for the same ``(a, c)`` pair.  The second pass also asserts that
    the reported row count equals the number of rows whose ``a`` is
    non-null, and that eight row groups exist.

    ``tempdir`` is presumably a pytest fixture providing a scratch
    directory -- its definition is not visible in this excerpt.
    """
    n_rows = 1000
    # Columns are drawn in a, b, c order, matching the dict-literal order.
    col_a = np.random.choice(["a", "b", None], size=n_rows)
    col_b = np.random.randint(0, 64000, size=n_rows)
    col_c = np.random.choice([True, False], size=n_rows)
    df = pd.DataFrame({"a": col_a, "b": col_b, "c": col_c})

    def _assert_rows_match(frame):
        # Each loaded row's ``b`` must occur among source rows with the same (a, c).
        for _, rec in frame.iterrows():
            same_partition = (df.a == rec.a) & (df.c == rec.c)
            assert rec.b in df[same_partition].b.tolist()

    # Pass 1: default row groups.
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")
    reader = ParquetFile(tempdir)
    assert reader.columns == ["b"]
    _assert_rows_match(reader.to_pandas())

    # Pass 2: explicit row-group offsets.
    writer.write(
        tempdir,
        df,
        row_group_offsets=[0, 50],
        partition_on=["a", "c"],
        file_scheme="hive",
    )
    reader = ParquetFile(tempdir)
    assert reader.count == df.a.notnull().sum()
    assert len(reader.row_groups) == 8
    _assert_rows_match(reader.to_pandas())
示例2: test_text_convert
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_text_convert(tempdir):
    """Check the physical types chosen for text columns under ``fixed_text``.

    Three writes of the same str/bytes frame are verified in turn:

    1. both columns fixed-width  -> both FIXED_LEN_BYTE_ARRAY (lengths 1 and 2)
    2. no ``fixed_text``         -> both BYTE_ARRAY
    3. only ``a`` fixed-width    -> ``a`` FIXED_LEN_BYTE_ARRAY, ``b`` BYTE_ARRAY

    After every write the max statistic of ``a`` is asserted to be
    ``["a"]`` and the frame read back must equal the source frame.
    """
    physical = parquet_thrift.Type
    source = pd.DataFrame({"a": ["a"] * 100, "b": [b"a"] * 100})
    path = os.path.join(tempdir, "tmp.parq")

    # 1. Both columns written as fixed-length text.
    write(path, source, fixed_text={"a": 1, "b": 2})
    parq = ParquetFile(path)
    assert parq.schema[1].type == physical.FIXED_LEN_BYTE_ARRAY
    assert parq.schema[1].type_length == 1
    assert parq.schema[2].type == physical.FIXED_LEN_BYTE_ARRAY
    assert parq.schema[2].type_length == 2
    assert parq.statistics["max"]["a"] == ["a"]
    loaded = parq.to_pandas()
    tm.assert_frame_equal(source, loaded, check_categorical=False)

    # 2. Default encoding: variable-length byte arrays.
    write(path, source)
    parq = ParquetFile(path)
    assert parq.schema[1].type == physical.BYTE_ARRAY
    assert parq.schema[2].type == physical.BYTE_ARRAY
    assert parq.statistics["max"]["a"] == ["a"]
    loaded = parq.to_pandas()
    tm.assert_frame_equal(source, loaded, check_categorical=False)

    # 3. Mixed: only ``a`` is fixed-length.
    write(path, source, fixed_text={"a": 1})
    parq = ParquetFile(path)
    assert parq.schema[1].type == physical.FIXED_LEN_BYTE_ARRAY
    assert parq.schema[2].type == physical.BYTE_ARRAY
    assert parq.statistics["max"]["a"] == ["a"]
    loaded = parq.to_pandas()
    tm.assert_frame_equal(source, loaded, check_categorical=False)
示例3: time_text
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def time_text():
    """Benchmark writing and reading one-column text frames.

    For each of the two columns (str ``a`` and bytes ``b``) and each
    ``fixed`` setting (``None`` and ``6``), one write and one read are
    timed inside ``measure(...)`` blocks that record into ``result``.
    An untimed write and an untimed read precede the timed ones.

    Returns:
        The ``result`` dict handed to every ``measure`` call; its exact
        contents depend on ``measure``, which is defined outside this
        excerpt.
    """
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        str_values = np.random.choice(['hi', 'you', 'people'], size=n)
        bytes_values = np.random.choice([b'hi', b'you', b'people'], size=n)
        d = pd.DataFrame({'a': str_values, 'b': bytes_values})

        for col in d.columns:
            for fixed in (None, 6):
                single = d[[col]]
                # Pick the object encoding from the first cell's Python type.
                encoding = "bytes" if isinstance(single.iloc[0, 0], bytes) else 'utf8'

                write(fn, single)  # untimed write before the measured one
                with measure('%s: write, fixed: %s' % (encoding, fixed), result):
                    write(fn, single, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=encoding)

                reader = ParquetFile(fn)
                reader.to_pandas()  # warm-up
                with measure('%s: read, fixed: %s' % (encoding, fixed), result):
                    reader.to_pandas()
        return result
示例4: test_auto_null
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_auto_null(tempdir):
    """Exercise the ``has_nulls`` option of ``write``.

    * ``has_nulls=False`` must raise ``TypeError`` for this frame.
    * ``has_nulls=True``: schema entries from index 2 onward are OPTIONAL
      and entry 1 is REQUIRED.
    * ``has_nulls=None``: schema entries 1 and 2 are REQUIRED and entry 4
      is OPTIONAL.

    In the two successful cases the frame read back (with ``e`` loaded as
    a categorical) must equal the source frame.
    """
    repetition = parquet_thrift.FieldRepetitionType
    base_dir = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(base_dir, "test.parq")

    with pytest.raises(TypeError):
        # TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    # Explicitly nullable.
    write(fn, df, has_nulls=True)
    reader = ParquetFile(fn)
    for element in reader.schema[2:]:
        assert element.repetition_type == repetition.OPTIONAL
    assert reader.schema[1].repetition_type == repetition.REQUIRED
    loaded = reader.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, loaded, check_categorical=False)

    # Nullability left for the writer to decide.
    write(fn, df, has_nulls=None)
    reader = ParquetFile(fn)
    for element in reader.schema[1:3]:
        assert element.repetition_type == repetition.REQUIRED
    assert reader.schema[4].repetition_type == repetition.OPTIONAL
    loaded = reader.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, loaded, check_categorical=False)
示例5: test_bad_file_paths
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_bad_file_paths(tempdir):
    """Open explicit file lists whose paths are irregular.

    The first pair of files sits under ``x=0/part.=.parquet`` and
    ``y/z/part.0.parquet``; the combined dataset must report file scheme
    ``'other'``, return both copies of the data, and expose no ``dir0``
    column.  The second pair uses the same file name in two sibling
    directories and must likewise return both copies.
    """
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    doubled = ['x', 'y', 'z'] * 2

    # Layouts containing '=' and nested directories.
    odd_files = []
    for folder_name, file_name in (('x=0', 'part.=.parquet'),
                                   ('y/z', 'part.0.parquet')):
        folder = os.path.join(tempdir, folder_name)
        target = os.path.join(folder, file_name)
        os.makedirs(folder)
        write(target, df)
        odd_files.append(target)

    pf = ParquetFile(odd_files)
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == doubled
    assert 'dir0' not in out

    # Identical file names living in two different directories.
    same_name_files = []
    for folder_name in ('data', 'data2'):
        folder = os.path.join(tempdir, folder_name)
        target = os.path.join(folder, 'out.parq')
        os.makedirs(folder)
        write(target, df)
        same_name_files.append(target)

    pf = ParquetFile(same_name_files)
    out = pf.to_pandas()
    assert out.a.tolist() == doubled
示例6: test_index_not_in_columns
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_index_not_in_columns(tempdir):
    """Read a column subset that omits the index column.

    When only ``b`` is requested the stored index ``a`` must still come
    back as the index; with ``index=False`` the result must instead have
    the positional index ``[0, 1, 2]``.
    """
    frame = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    frame = frame.set_index('a')
    write(tempdir, frame, file_scheme='hive')
    dataset = ParquetFile(tempdir)

    with_index = dataset.to_pandas(columns=['b'])
    assert with_index.index.tolist() == ['x', 'y', 'z']

    without_index = dataset.to_pandas(columns=['b'], index=False)
    assert without_index.index.tolist() == [0, 1, 2]
示例7: test_request_nonexistent_column
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_request_nonexistent_column(tempdir):
    """Requesting a column absent from the file must raise ``ValueError``."""
    path = os.path.join(tempdir, 'foo.parquet')
    frame = pd.DataFrame({'x': [1, 2, 3]})
    write(path, frame)
    reader = ParquetFile(path)

    # 'y' was never written, so the read is expected to be rejected.
    with pytest.raises(ValueError):
        reader.to_pandas(columns=['y'])
示例8: test_in_filter_numbers
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_in_filter_numbers(tempdir):
    """``in`` filters on a numeric partition column.

    The dataset is partitioned on the integer column ``values``.  The
    filter is applied once with string members and once with integer
    members; both must select exactly the symbols ``{'a', 'c'}``.
    """
    frame = pd.DataFrame(data={
        'symbols': ['a', 'a', 'b', 'c', 'c', 'd'],
        'values': [1, 2, 3, 4, 5, 6],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['values'])
    dataset = ParquetFile(tempdir)

    # String members first, then integer members, as two separate reads.
    for wanted in (['1', '4'], [1, 4]):
        selected = dataset.to_pandas(filters=[('values', 'in', wanted)])
        assert set(selected.symbols) == {'a', 'c'}
示例9: test_filter_without_paths
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_filter_without_paths(tempdir):
    """Apply filters to a single, non-partitioned parquet file.

    A ``x > 3`` filter is expected to hand back the complete frame, while
    ``x > 30`` is expected to yield an empty result.

    Uses ``pd.testing.assert_frame_equal``; the former
    ``pd.util.testing`` alias is deprecated and absent from pandas 2.x.
    """
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    # NOTE(review): the whole frame is expected back although only some rows
    # satisfy x > 3 -- presumably filters prune whole row groups rather than
    # individual rows; confirm against the fastparquet documentation.
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.testing.assert_frame_equal(out, df)

    # No value of x exceeds 30, so nothing should be returned.
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
示例10: test_multi_cat_fail
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_multi_cat_fail(tempdir):
    """Reading a multi-index frame written in several row groups must fail.

    The frame is indexed on ``('a', 'b')`` and written with
    ``row_group_offsets=25``; ``to_pandas()`` is expected to raise
    ``RuntimeError``.

    All three columns are sized from ``N`` so the frame stays consistent
    if ``N`` is changed (column ``c`` previously hard-coded 200).
    """
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(N)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()
示例11: test_filelike
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_filelike(tempdir):
    """Open a parquet file from file-like objects instead of a path.

    Two sources are tried: a real binary file handle and an in-memory
    ``io.BytesIO`` copy of the same bytes.  In both cases the frame read
    back must equal the frame that was written.

    Every file handle is opened in a ``with`` block so none is left
    unclosed, and ``pd.testing`` replaces the removed ``pd.util.testing``.
    """
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])

    # Real file handle: the read happens while the handle is still open.
    with open(fn, 'rb') as f:
        pf = ParquetFile(f, open_with=open)
        d2 = pf.to_pandas()
        pd.testing.assert_frame_equal(d2, df)

    # In-memory buffer: copy the bytes, then release the handle immediately.
    with open(fn, 'rb') as f:
        b = io.BytesIO(f.read())
    pf = ParquetFile(b, open_with=open)
    d2 = pf.to_pandas()
    pd.testing.assert_frame_equal(d2, df)
示例12: test_multi_list
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_multi_list(tempdir):
    """Combine several hive dataset directories in one ``ParquetFile``.

    The same frame is written to three directories (one of them nested
    one level deeper).  Opening the first two, then all three, must
    return the ``a`` values repeated two and three times respectively.
    """
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    letters = ['x', 'y', 'z']
    roots = [
        os.path.join(tempdir, 'x'),
        os.path.join(tempdir, 'y'),
        os.path.join(tempdir, 'z', 'deep'),
    ]
    for root in roots:
        write(root, df, file_scheme='hive')

    two_dirs = ParquetFile(roots[:2]).to_pandas()  # this version may have extra column!
    assert two_dirs.a.tolist() == letters * 2

    three_dirs = ParquetFile(roots[:]).to_pandas()
    assert three_dirs.a.tolist() == letters * 3
示例13: test_single_upper_directory
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_single_upper_directory(tempdir):
    """Read a hive dataset whose only partition value is ``y == 'aa'``.

    The dataset is read once through its directory and once -- after the
    ``_metadata`` and ``_common_metadata`` files have been deleted --
    through an explicit, sorted list of data files with ``root=tempdir``.
    Both reads must return ``y == 'aa'`` for every row, and the second
    ``ParquetFile`` must report its ``fn`` as the ``_metadata`` path under
    ``tempdir``.
    """
    import glob  # kept function-local, but no longer buried mid-body

    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    # Remove the metadata files so the dataset must be rebuilt from a file list.
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))

    # sorted() already returns a list; no extra list() wrapper needed.
    flist = sorted(glob.glob(os.path.join(tempdir, '*/*')))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()
示例14: test_roundtrip_complex
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_roundtrip_complex(tempdir, scheme):
    """Round-trip a frame holding less common column types.

    Columns cover unsigned and narrow integers, float16, dict objects,
    datetimes (one entry set to ``None``), timedeltas and booleans.  After
    writing with ``file_scheme=scheme`` and reading back, every column
    must equal the source wherever the source value is non-null.

    ``scheme`` is presumably supplied by a pytest parametrization that is
    not visible in this excerpt.
    """
    import datetime

    n_rows = 1000
    data = pd.DataFrame(
        {
            "ui32": np.arange(n_rows, dtype=np.uint32),
            "i16": np.arange(n_rows, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(n_rows, dtype=np.float16),
            "dicts": [{"oi": "you"}] * n_rows,
            "t": [datetime.datetime.now()] * n_rows,
            "td": [datetime.timedelta(seconds=1)] * n_rows,
            "bool": np.random.choice([True, False], size=n_rows),
        }
    )
    # Inject a single missing timestamp.
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)
    reader = ParquetFile(fname)
    loaded = reader.to_pandas()

    for name in reader.columns:
        present = data[name].notnull()
        # Only positions that are non-null in the source are compared.
        assert (loaded[name] == data[name])[present].all()
示例15: test_roundtrip
# 需要导入模块: from fastparquet import ParquetFile [as 别名]
# 或者: from fastparquet.ParquetFile import to_pandas [as 别名]
def test_roundtrip(tempdir, scheme, row_groups, comp):
    """Round-trip a frame of common column types through a parquet file.

    Columns cover int32/int64/float64, object bytes, fixed-width ``S1`` /
    ``S2`` bytes, decoded text, and categoricals built from both the bytes
    and the text column.  The frame is written with the given file scheme,
    row-group offsets and compression, and every column read back must
    equal the source column element-wise.

    ``scheme``, ``row_groups`` and ``comp`` are presumably supplied by
    pytest parametrization that is not visible in this excerpt.
    """
    n_rows = 1000
    words = np.random.choice([b"hello", b"you", b"people"], size=n_rows)
    data = pd.DataFrame(
        {
            "i32": np.arange(n_rows, dtype=np.int32),
            "i64": np.arange(n_rows, dtype=np.int64),
            "f": np.arange(n_rows, dtype=np.float64),
            "bhello": words.astype("O"),
        }
    )
    # Derived columns are added in the same order as before: a, aa, hello, bcat, cat.
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    doubled = data["a"].map(lambda x: 2 * x)
    data["aa"] = doubled.astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(
        fname,
        data,
        file_scheme=scheme,
        row_group_offsets=row_groups,
        compression=comp,
    )
    reader = ParquetFile(fname)
    loaded = reader.to_pandas()

    assert data.cat.dtype == "category"
    for name in reader.columns:
        assert (loaded[name] == data[name]).all()