当前位置: 首页>>代码示例>>Python>>正文


Python fastparquet.ParquetFile类代码示例

本文整理汇总了Python中fastparquet.ParquetFile的典型用法代码示例。如果您正苦于以下问题：Python ParquetFile类的具体用法？Python ParquetFile怎么用？Python ParquetFile使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了ParquetFile类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_roundtrip_complex

def test_roundtrip_complex(tempdir, scheme):
    """Write a frame of mixed dtypes and check that it reads back unchanged.

    The frame holds unsigned/signed integers, float16 values, dicts,
    datetimes (one of them set to ``None``), timedeltas and booleans. It is
    written with ``file_scheme=scheme`` and loaded through ``ParquetFile``;
    every column must then equal the original wherever the original value
    is non-null.
    """
    import datetime

    columns = {
        "ui32": np.arange(1000, dtype=np.uint32),
        "i16": np.arange(1000, dtype=np.int16),
        "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
        "f16": np.arange(1000, dtype=np.float16),
        "dicts": [{"oi": "you"}] * 1000,
        "t": [datetime.datetime.now()] * 1000,
        "td": [datetime.timedelta(seconds=1)] * 1000,
        "bool": np.random.choice([True, False], size=1000),
    }
    data = pd.DataFrame(columns)
    # Blank out one timestamp so the frame contains a null entry.
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    parquet_file = ParquetFile(fname)
    restored = parquet_file.to_pandas()
    for name in parquet_file.columns:
        expected = data[name]
        matches = restored[name] == expected
        # Rows that are null in the original are left out of the comparison.
        assert matches[~expected.isnull()].all()
开发者ID:martindurant,项目名称:parquet-python,代码行数:25,代码来源:test_output.py

示例2: test_input_column_list_not_mutated

def test_input_column_list_not_mutated(tempdir):
    """Check that ``to_pandas`` leaves the caller's ``columns`` list intact."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, frame, file_scheme='hive')

    requested = ['a']
    # The returned frame is irrelevant here; only the argument is inspected.
    ParquetFile(tempdir).to_pandas(columns=requested)
    assert requested == ['a']
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:7,代码来源:test_api.py

示例3: test_roundtrip

def test_roundtrip(tempdir, scheme, row_groups, comp):
    """Round-trip numeric, bytes, text and categorical columns.

    The frame is written with the given ``file_scheme``,
    ``row_group_offsets`` and ``compression`` values, read back through
    ``ParquetFile``, and every column reported by the reader is compared
    element-wise with the original.
    """
    words = np.random.choice([b"hello", b"you", b"people"], size=1000)
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": words.astype("O"),
        }
    )
    # Fixed-width bytes columns: single letters and their doubled form.
    letters = [b"a", b"b", b"c", b"d", b"e"] * 200
    data["a"] = np.array(letters, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    # Text and categorical views derived from the random bytes column.
    data["hello"] = data["bhello"].str.decode("utf8")
    data["bcat"] = data["bhello"].astype("category")
    data["cat"] = data["hello"].astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(
        fname,
        data,
        file_scheme=scheme,
        row_group_offsets=row_groups,
        compression=comp,
    )

    parquet_file = ParquetFile(fname)
    restored = parquet_file.to_pandas()

    # The source frame must still carry its categorical dtype after writing.
    assert data["cat"].dtype == "category"

    for name in parquet_file.columns:
        assert (restored[name] == data[name]).all()
开发者ID:martindurant,项目名称:parquet-python,代码行数:25,代码来源:test_output.py

示例4: test_floating_point_partition_name

def test_floating_point_partition_name(tempdir):
    """Partition float data on ``y1`` and check each partition's ``x`` values."""
    frame = pd.DataFrame({
        'x': [1e99, 5e-10, 2e+2, -0.1],
        'y1': ['aa', 'aa', 'bb', 'aa'],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['y1'])

    out = ParquetFile(tempdir).to_pandas()

    aa_rows = out[out.y1 == 'aa']
    assert aa_rows.x.tolist() == [1e99, 5e-10, -0.1]

    bb_rows = out[out.y1 == 'bb']
    assert bb_rows.x.tolist() == [200.0]
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:7,代码来源:test_api.py

示例5: test_groups_roundtrip

def test_groups_roundtrip(tempdir):
    """Write a frame partitioned on ``a`` and ``c`` and verify what reads back.

    The dataset is written twice to the same directory: once with default
    row groups and once with ``row_group_offsets=[0, 50]``. After each write
    every ``b`` value read back must be present in the original frame under
    the same ``(a, c)`` combination.
    """
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )

    def assert_rows_in_source(frame):
        # Each row's ``b`` must occur in ``df`` among rows sharing its a/c pair.
        for _, row in frame.iterrows():
            same_group = (df.a == row.a) & (df.c == row.c)
            assert row.b in list(df[same_group].b)

    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    # Only the non-partition column is listed in ``columns``.
    assert r.columns == ["b"]
    assert_rows_in_source(r.to_pandas())

    writer.write(
        tempdir,
        df,
        row_group_offsets=[0, 50],
        partition_on=["a", "c"],
        file_scheme="hive",
    )

    r = ParquetFile(tempdir)
    non_null_a = ~df.a.isnull()
    assert r.count == sum(non_null_a)
    assert len(r.row_groups) == 8
    assert_rows_in_source(r.to_pandas())
开发者ID:martindurant,项目名称:parquet-python,代码行数:26,代码来源:test_output.py

示例6: test_numerical_partition_name

def test_numerical_partition_name(tempdir):
    """Partition integer data on ``y1`` and check each partition's ``x`` values."""
    frame = pd.DataFrame({
        'x': [1, 5, 2, 5],
        'y1': ['aa', 'aa', 'bb', 'aa'],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['y1'])

    out = ParquetFile(tempdir).to_pandas()

    aa_rows = out[out.y1 == 'aa']
    assert aa_rows.x.tolist() == [1, 5, 5]

    bb_rows = out[out.y1 == 'bb']
    assert bb_rows.x.tolist() == [2]
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:7,代码来源:test_api.py

示例7: test_auto_null

def test_auto_null(tempdir):
    """Exercise the ``has_nulls`` option of ``write`` in its three settings.

    * ``has_nulls=False`` on data that contains nulls must raise ``TypeError``.
    * ``has_nulls=True`` must leave schema element 1 REQUIRED and mark every
      schema element from index 2 onwards OPTIONAL.
    * ``has_nulls=None`` must leave schema elements 1 and 2 REQUIRED and mark
      element 4 OPTIONAL.

    In the two successful cases the data must read back equal to the input.
    """
    tmp = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(TypeError):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    required = parquet_thrift.FieldRepetitionType.REQUIRED
    optional = parquet_thrift.FieldRepetitionType.OPTIONAL

    # Explicitly declaring nulls.
    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for element in pf.schema[2:]:
        assert element.repetition_type == optional
    assert pf.schema[1].repetition_type == required
    roundtripped = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, roundtripped, check_categorical=False)

    # Leaving the decision to the writer.
    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for element in pf.schema[1:3]:
        assert element.repetition_type == required
    assert pf.schema[4].repetition_type == optional
    roundtripped = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, roundtripped, check_categorical=False)
开发者ID:martindurant,项目名称:parquet-python,代码行数:32,代码来源:test_output.py

示例8: _read_pf_simple

def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery

    Parameters
    ----------
    fs : object exposing ``open``
        Its ``open`` attribute is handed to ``ParquetFile`` as ``open_with``.
    path : str
        Location of the file to read.
    base : str
        Root of the dataset. It is stripped from the front of ``path`` to
        obtain the relative file path and is stored on the reader as ``fn``.
    index_names : list or None
        Columns to use as the index; passed to ``to_pandas`` as ``index``.
    all_columns : list
        Columns to load, including any index columns.
    is_series : bool
        When true, only the first column of the result is returned.
    categories
        Forwarded unchanged to ``to_pandas``.
    cats
        Assigned to the reader's ``cats`` attribute.
    scheme
        Assigned to the reader's ``file_scheme`` attribute.
    storage_name_mapping : dict
        Maps stored column names to the names to expose; names missing from
        the mapping are kept as they are.

    Returns
    -------
    The loaded frame, or its first column when ``is_series`` is true.
    """
    from fastparquet import ParquetFile
    pf = ParquetFile(path, open_with=fs.open)

    # Strip ``base`` only as a leading prefix. ``str.replace`` removes every
    # occurrence, which corrupts paths that happen to repeat the base string
    # further down (e.g. base 'data', path 'data/data/part.parquet').
    if path.startswith(base):
        relpath = path[len(base):]
    else:
        # Not a prefix: keep the historical behaviour.
        relpath = path.replace(base, '')
    relpath = relpath.lstrip('/')

    # Point every column chunk at the path relative to ``base``.
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base

    df = pf.to_pandas(all_columns, categories, index=index_names)

    # Rename the index level(s) to their user-facing names.
    if index_names:
        if df.index.nlevels == 1:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
        else:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]

    # Rename the remaining (non-index) columns the same way.
    index_columns = index_names or []
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in index_columns]

    if is_series:
        return df[df.columns[0]]
    return df
开发者ID:yliapis,项目名称:dask,代码行数:29,代码来源:parquet.py

示例9: time_text

def time_text():
    """Benchmark writing and reading one-column text frames.

    A million-row frame with one str column and one bytes column is built.
    For each column, and for ``fixed`` values of ``None`` and ``6``, the
    single-column frame is written and read inside ``measure`` blocks
    labelled with the encoding and the ``fixed`` value.

    Returns the ``result`` dict that was passed to every ``measure`` call
    (presumably filled in by ``measure`` -- confirm against its definition).
    """
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n),
        })

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                # Pick the object encoding from the type of the first value.
                t = "bytes" if isinstance(df.iloc[0, 0], bytes) else 'utf8'

                # Unmeasured write before the timed one.
                write(fn, df)
                write_label = '%s: write, fixed: %s' % (t, fixed)
                with measure(write_label, result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                read_label = '%s: read, fixed: %s' % (t, fixed)
                with measure(read_label, result):
                    pf.to_pandas()
        return result
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:27,代码来源:columns.py

示例10: test_bad_file_paths

def test_bad_file_paths(tempdir):
    """Read file lists whose directory names do not form a partition scheme.

    Two pairs of files are written into unrelated sub-directories of
    ``tempdir``. Each pair is opened as a list; the combined frame must
    simply hold the rows of both files. For the first pair the scheme must
    also be reported as ``'other'`` and no ``dir0`` column may appear.
    """
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})

    def write_into(subdir, filename):
        # Create ``subdir`` under tempdir, write ``df`` there, return the path.
        directory = os.path.join(tempdir, subdir)
        target = os.path.join(directory, filename)
        os.makedirs(directory)
        write(target, df)
        return target

    first = write_into('x=0', 'part.=.parquet')
    second = write_into('y/z', 'part.0.parquet')

    pf = ParquetFile([first, second])
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out

    first = write_into('data', 'out.parq')
    second = write_into('data2', 'out.parq')

    pf = ParquetFile([first, second])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:28,代码来源:test_api.py

示例11: test_index_not_in_columns

def test_index_not_in_columns(tempdir):
    """Check index handling when the index column is not in ``columns``.

    With only ``'b'`` requested the stored index ``'a'`` must still be
    restored; with ``index=False`` the result must carry the positions
    0, 1, 2 as its index instead.
    """
    source = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    source = source.set_index('a')
    write(tempdir, source, file_scheme='hive')
    pf = ParquetFile(tempdir)

    with_index = pf.to_pandas(columns=['b'])
    assert with_index.index.tolist() == ['x', 'y', 'z']

    without_index = pf.to_pandas(columns=['b'], index=False)
    assert without_index.index.tolist() == [0, 1, 2]
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:8,代码来源:test_api.py

示例12: test_filter_stats

def test_filter_stats(tempdir):
    """Check that an ``x >= 5`` filter returns exactly the rows 5, 6 and 7.

    The seven values are written with ``row_group_offsets=[0, 4]``.
    """
    frame = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6, 7]})
    write(tempdir, frame, file_scheme='hive', row_group_offsets=[0, 4])

    at_least_five = [('x', '>=', 5)]
    out = ParquetFile(tempdir).to_pandas(filters=at_least_five)
    assert out.x.tolist() == [5, 6, 7]
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:8,代码来源:test_api.py

示例13: test_in_filter

def test_in_filter(tempdir):
    """Check that an ``in`` filter on the partition column keeps only its members."""
    frame = pd.DataFrame(data={
        'symbols': ['a', 'a', 'b', 'c', 'c', 'd'],
        'values': [1, 2, 3, 4, 5, 6],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['symbols'])

    wanted = [('symbols', 'in', ['a', 'c'])]
    out = ParquetFile(tempdir).to_pandas(filters=wanted)
    assert set(out.symbols) == {'a', 'c'}
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:8,代码来源:test_api.py

示例14: test_to_pandas

def test_to_pandas():
    """Load the airlines sample file and sanity-check ``dep_time``.

    The frame must have 29 columns, and ``dep_time`` must be float64 with no
    negative entries.
    """
    fname = TEST_DATA + (
        '/airlines_parquet/'
        '4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    )
    out = ParquetFile(fname).to_pandas()
    assert len(out.columns) == 29

    # test for bad integer conversion
    negative = out.dep_time < 0
    assert negative.sum() == 0
    assert out.dep_time.dtype == 'float64'
开发者ID:klahnakoski,项目名称:fastparquet,代码行数:8,代码来源:test_with_n.py

示例15: test_write_compression_dict

def test_write_compression_dict(tempdir, compression):
    """Write with the given ``compression`` setting and compare the round trip."""
    original = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, original, compression=compression)

    roundtripped = ParquetFile(fn).to_pandas()

    tm.assert_frame_equal(original, roundtripped, check_categorical=False)
开发者ID:martindurant,项目名称:parquet-python,代码行数:8,代码来源:test_output.py


注:本文中的fastparquet.ParquetFile类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。