This article collects typical usage examples of the fastparquet.ParquetFile class in Python. If you have been wondering what ParquetFile is for, or how to use it, the curated examples below should help.
The following presents 15 code examples of the ParquetFile class, sorted by popularity by default; most are test functions excerpted from larger suites.
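Because the snippets are excerpts, they reference names defined elsewhere. As a hedged reconstruction, the imports below cover what the examples assume; arguments such as tempdir, scheme, row_groups, comp, and compression are pytest fixtures or parametrizations supplied by the surrounding test modules, not things you define yourself.

# Hedged reconstruction of the imports the snippets below assume.
import os
import datetime

import numpy as np
import pandas as pd
import pytest
import pandas.util.testing as tm   # on newer pandas: import pandas.testing as tm

# write/writer/parquet_thrift are part of fastparquet's public surface in the
# versions these tests target.
from fastparquet import ParquetFile, write, writer, parquet_thrift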
Example 1: test_roundtrip_complex
def test_roundtrip_complex(tempdir, scheme):
    import datetime
    data = pd.DataFrame(
        {
            "ui32": np.arange(1000, dtype=np.uint32),
            "i16": np.arange(1000, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(1000, dtype=np.float16),
            "dicts": [{"oi": "you"}] * 1000,
            "t": [datetime.datetime.now()] * 1000,
            "td": [datetime.timedelta(seconds=1)] * 1000,
            "bool": np.random.choice([True, False], size=1000),
        }
    )
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    r = ParquetFile(fname)
    df = r.to_pandas()

    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
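Stripped of the test scaffolding, the pattern this example exercises is a plain write-then-read round trip. A minimal, self-contained sketch (the file name and data are illustrative):

import os
import tempfile

import pandas as pd
from fastparquet import ParquetFile, write

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "roundtrip.parquet")
    write(path, pd.DataFrame({"x": [1, 2, 3]}))   # write a DataFrame to Parquet
    print(ParquetFile(path).to_pandas())          # read it back as a DataFrame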
Example 2: test_input_column_list_not_mutated
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
Example 3: test_roundtrip
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"], size=1000).astype("O"),
        }
    )
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups, compression=comp)

    r = ParquetFile(fname)
    df = r.to_pandas()

    assert data.cat.dtype == "category"
    for col in r.columns:
        assert (df[col] == data[col]).all()
Example 4: test_floating_point_partition_name
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
Example 5: test_groups_roundtrip
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]  # partition columns a and c are not data columns
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    # rows where a is null cannot be assigned to a partition and are dropped
    assert r.count == sum(~df.a.isnull())
    # 2 values of a x 2 values of c x 2 row-group slices = 8 row groups
    assert len(r.row_groups) == 8
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
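Note that r.columns lists only "b": with the hive file scheme, the partition columns "a" and "c" are encoded in the directory names and surface through the cats attribute rather than as data columns, then get re-attached when to_pandas() runs. A hedged sketch of inspecting such a dataset (names assume the test above):

r = ParquetFile(tempdir)
print(r.columns)      # data columns only, e.g. ['b']
print(r.cats)         # partition column -> values observed in directory names
out = r.to_pandas()   # partition columns are restored alongside 'b'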
Example 6: test_numerical_partition_name
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
Example 7: test_auto_null
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(TypeError):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf.schema[2:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    assert pf.schema[1].repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf.schema[1:3]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    assert pf.schema[4].repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)
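Reading the assertions above: has_nulls maps onto the Parquet repetition type. has_nulls=False promises there are no nulls and fails here because column "d" contains None; has_nulls=True writes the data columns as OPTIONAL (the REQUIRED entry asserted at schema index 1 is plausibly the stored index); has_nulls=None asks the writer to infer nullability per column. A hedged sketch for inspecting the outcome, assuming pf.schema indexes thrift SchemaElement objects as in the fastparquet version these tests target:

pf = ParquetFile(fn)
for se in pf.schema[1:]:  # entry 0 is the schema root
    nullable = se.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    print(se.name, 'OPTIONAL' if nullable else 'REQUIRED')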
Example 8: _read_pf_simple
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile

    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)

    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
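The interesting ParquetFile feature here is open_with: any callable that opens a path and returns a file-like object can stand in for the local filesystem, which is how dask points fastparquet at remote storage (fs.open above). A minimal local sketch, with an explicit wrapper so the file is always opened in binary mode (the callable is expected to hand back bytes):

# Sketch: a local stand-in for fs.open that forces binary mode.
def open_binary(path, mode='rb'):
    return open(path, mode)

pf = ParquetFile(path, open_with=open_binary)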
Example 9: time_text
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up
                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()

        return result
Example 10: test_bad_file_paths
def test_bad_file_paths(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})

    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.=.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y/z')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    assert pf.file_scheme == 'other'  # explicit file list, not a hive dataset
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out  # the 'x=0' directory is not treated as a partition

    path1 = os.path.join(tempdir, 'data')
    fn1 = os.path.join(path1, 'out.parq')
    os.makedirs(path1)
    write(fn1, df)
    path2 = os.path.join(tempdir, 'data2')
    fn2 = os.path.join(path2, 'out.parq')
    os.makedirs(path2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
Example 11: test_index_not_in_columns
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
Example 12: test_filter_stats
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
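Each filter tuple (column, op, value) is compared against per-row-group min/max statistics, so whole row groups are skipped before any pages are decoded; the row_group_offsets=[0, 4] split is what gives the filter a boundary to prune on, and here the second group ([5, 6, 7]) happens to be exactly the surviving data. Historically this pruning works at row-group granularity, so a group whose min/max range merely overlaps the predicate is read in full. A hedged sketch of the statistics involved, assuming the thrift attribute layout from parquet-format:

pf = ParquetFile(tempdir)
for rg in pf.row_groups:
    stats = rg.columns[0].meta_data.statistics
    print(rg.num_rows, stats.min, stats.max)  # min/max may be raw bytes for some types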
Example 13: test_in_filter
def test_in_filter(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbols', 'in', ['a', 'c'])])
    assert set(out.symbols) == {'a', 'c'}
Example 14: test_to_pandas
def test_to_pandas():
    fname = TEST_DATA + '/airlines_parquet/4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    pf = ParquetFile(fname)
    out = pf.to_pandas()
    assert len(out.columns) == 29
    # test for bad integer conversion
    assert (out.dep_time < 0).sum() == 0
    assert out.dep_time.dtype == 'float64'
Example 15: test_write_compression_dict
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)
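The compression argument accepts a single codec name applied to every column or, as this parametrized test's name suggests, a per-column mapping. A hedged sketch (codec names are illustrative; availability depends on which optional compression libraries are installed):

# Sketch: per-column compression; GZIP is built in, others need extra packages.
write(fn, df, compression={'x': 'GZIP', 'y': 'UNCOMPRESSED'})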