当前位置: 首页>>代码示例>>Python>>正文


Python parquet.ParquetFile方法代码示例

本文整理汇总了Python中pyarrow.parquet.ParquetFile方法的典型用法代码示例。如果您正苦于以下问题:Python parquet.ParquetFile方法的具体用法?Python parquet.ParquetFile怎么用?Python parquet.ParquetFile使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyarrow.parquet的用法示例。


在下文中一共展示了parquet.ParquetFile方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_write_from_csv

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_from_csv():
    """Convert a simple CSV and verify the resulting Parquet file.

    Checks that the converter produces a single row group, string-typed
    columns named 'a' and 'b', and preserves all three rows' values.
    """
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 1
    schema = pqf.schema
    assert schema.names == ['a', 'b']
    # CSV cells default to UTF-8 strings when no --type mapping is given.
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'
    # The original read the same row group twice and repeated the identical
    # assertion; one read is sufficient.
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['a', 'b', 'c']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:18,代码来源:test_write.py

示例2: table

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def table(self, name: str, path: Optional[str] = None) -> ir.TableExpr:
    """Build a table expression backed by the parquet file ``name``.

    Raises AttributeError if the name is not listed under ``path``
    (or under the client root when ``path`` is None). The resolved
    file path is remembered in ``self.dictionary``.
    """
    if name not in self.list_tables(path):
        raise AttributeError(name)

    base = self.root if path is None else path

    # Infer an ibis schema from the parquet file's own metadata.
    parquet_path = base / "{}.parquet".format(name)
    inferred = sch.infer(pq.ParquetFile(str(parquet_path)).schema)

    expr = self.table_class(name, inferred, self).to_expr()
    self.dictionary[name] = parquet_path

    return expr
开发者ID:ibis-project,项目名称:ibis,代码行数:19,代码来源:parquet.py

示例3: test_predicate_accept_in

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_predicate_accept_in(store, predicate_value, expected):
    """Check `_predicate_accepts` for an 'in' predicate against row-group stats."""
    frame = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    key = ParquetSerializer(chunk_size=None).store(store, "prefix", frame)

    pq_file = ParquetFile(store.open(key))
    outcome = _predicate_accepts(
        ("A", "in", predicate_value),
        row_meta=pq_file.metadata.row_group(0),
        arrow_schema=pq_file.schema.to_arrow_schema(),
        parquet_reader=pq_file.reader,
    )
    assert outcome == expected
开发者ID:JDASoftwareGroup,项目名称:kartothek,代码行数:21,代码来源:test_parquet.py

示例4: test_convert_json

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_convert_json():
    """Convert a JSON fixture to Parquet and compare it to the expected file."""
    schema = pa.schema(
        [pa.field("foo", pa.int32()), pa.field("bar", pa.int64())]
    )

    cwd = os.getcwd()
    input_path = "{}/tests/fixtures/simple_json.txt".format(cwd)
    expected_file = "{}/tests/fixtures/simple.parquet".format(cwd)

    with tempfile.NamedTemporaryFile() as tmp:
        client.convert_json(input_path, tmp.name, schema)
        output = pq.ParquetFile(tmp.name)
        expected = pq.ParquetFile(expected_file)
        # Metadata, schema, and actual row contents must all agree.
        assert output.metadata.num_columns == expected.metadata.num_columns
        assert output.metadata.num_rows == expected.metadata.num_rows
        assert output.schema.equals(expected.schema)
        assert output.read_row_group(0).to_pydict() == expected.read_row_group(0).to_pydict()
开发者ID:andrewgross,项目名称:json2parquet,代码行数:22,代码来源:test_client.py

示例5: test_write_from_tsv

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_from_tsv():
    """Convert a TSV input and verify schema, types, and the single data row."""
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple2.tsv'])
    pqf = pq.ParquetFile('csvs/simple2.parquet')
    assert pqf.num_row_groups == 1

    schema = pqf.schema
    assert schema.names == ['a', 'b']
    # Both columns default to string typing.
    for idx in (0, 1):
        assert schema.column(idx).logical_type.type == 'STRING'

    rg = pqf.read_row_group(0)
    assert rg.num_rows == 1
    assert rg.column(0).to_pylist() == ['1']
    assert rg.column(1).to_pylist() == ['b']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:16,代码来源:test_write.py

示例6: test_write_rename

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_rename():
    """--rename must accept both index-based (0=) and name-based (b=) mappings."""
    args = ['csvs/simple.csv', '--rename', '0=alpha', 'b=bee']
    csv2parquet.main_with_args(csv2parquet.convert, args)
    result = pq.ParquetFile('csvs/simple.parquet')
    assert result.schema.names == ['alpha', 'bee']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:8,代码来源:test_write.py

示例7: test_write_row_group_size

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_row_group_size():
    """A row-group size of 1 splits the 3-row CSV into 3 row groups."""
    args = ['csvs/simple.csv', '--row-group-size', '1']
    csv2parquet.main_with_args(csv2parquet.convert, args)
    assert pq.ParquetFile('csvs/simple.parquet').num_row_groups == 3
开发者ID:cldellow,项目名称:csv2parquet,代码行数:6,代码来源:test_write.py

示例8: test_write_limit

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_limit():
    """--rows 1 truncates the converted output to a single row."""
    args = ['csvs/simple.csv', '--rows', '1']
    csv2parquet.main_with_args(csv2parquet.convert, args)
    first_group = pq.ParquetFile('csvs/simple.parquet').read_row_group(0)
    assert first_group.num_rows == 1
开发者ID:cldellow,项目名称:csv2parquet,代码行数:7,代码来源:test_write.py

示例9: test_write_include_by_index

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_include_by_index():
    """--include with a column index keeps only that column ('a')."""
    args = ['csvs/simple.csv', '--include', '0']
    csv2parquet.main_with_args(csv2parquet.convert, args)

    result = pq.ParquetFile('csvs/simple.parquet')
    assert result.schema.names == ['a']

    rg = result.read_row_group(0)
    assert rg.num_rows == 3
    assert rg.column(0).to_pylist() == ['1', '2', '3']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:11,代码来源:test_write.py

示例10: test_write_exclude_by_name

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_exclude_by_name():
    """--exclude by column name drops 'a', leaving only 'b'."""
    args = ['csvs/simple.csv', '--exclude', 'a']
    csv2parquet.main_with_args(csv2parquet.convert, args)

    result = pq.ParquetFile('csvs/simple.parquet')
    assert result.schema.names == ['b']

    rg = result.read_row_group(0)
    assert rg.num_rows == 3
    assert rg.column(0).to_pylist() == ['a', 'b', 'c']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:11,代码来源:test_write.py

示例11: test_write_exclude_by_index

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_write_exclude_by_index():
    """--exclude by column index drops column 0, leaving only 'b'."""
    args = ['csvs/simple.csv', '--exclude', '0']
    csv2parquet.main_with_args(csv2parquet.convert, args)

    result = pq.ParquetFile('csvs/simple.parquet')
    assert result.schema.names == ['b']

    rg = result.read_row_group(0)
    assert rg.num_rows == 3
    assert rg.column(0).to_pylist() == ['a', 'b', 'c']
开发者ID:cldellow,项目名称:csv2parquet,代码行数:11,代码来源:test_write.py

示例12: test_required_types

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_required_types():
    """Every supported --type mapping round-trips its values through Parquet."""
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/types.csv', '--type',
                                'bool=bool', 'float32=float32', 'float64=float64', 'int8=int8',
                                'int16=int16', 'int32=int32', 'int64=int64', 'string=string',
                                'timestamp=timestamp'])
    pqf = pq.ParquetFile('csvs/types.parquet')

    names = ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
             'string', 'timestamp']
    assert pqf.schema.names == names

    rg = pqf.read_row_group(0)
    assert rg.num_rows == 2

    # Map column name -> python values for readable per-type checks.
    cols = {name: rg.column(i).to_pylist() for i, name in enumerate(names)}
    assert cols['bool'] == [True, False]
    # float32 loses precision, so compare approximately.
    assert cols['float32'] == pytest.approx([0.5, 0.6])
    assert cols['float64'] == [0.75, 1.75]
    assert cols['int8'] == [12, 13]
    assert cols['int16'] == [400, 401]
    assert cols['int32'] == [132000, 132001]
    assert cols['int64'] == [6000000000, 6000000001]
    assert cols['string'] == ['string', 'string']
    assert cols['timestamp'] == [datetime(2018, 7, 9, 0, 0), datetime(2018, 7, 10, 0, 0)]
开发者ID:cldellow,项目名称:csv2parquet,代码行数:32,代码来源:test_write.py

示例13: test_opt_invalid_types

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_opt_invalid_types():
    """Optional ('?') types turn unparseable cells into nulls instead of failing."""
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/invalid-types.csv', '--type',
                                'bool=bool?', 'float32=float32?', 'float64=float64?', 'int8=int8?',
                                'int16=int16?', 'int32=int32?', 'int64=int64?', 'string=string?',
                                'timestamp=timestamp?'])
    pqf = pq.ParquetFile('csvs/invalid-types.parquet')

    names = ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
             'string', 'timestamp']
    assert pqf.schema.names == names

    rg = pqf.read_row_group(0)
    assert rg.num_rows == 2

    cols = {name: rg.column(i).to_pylist() for i, name in enumerate(names)}
    # Row 0 parses cleanly; row 1's invalid cell becomes None for each
    # optional type (except 'string', where any text is valid).
    assert cols['bool'] == [True, None]
    float32 = cols['float32']
    assert len(float32) == 2
    assert float32[0] == pytest.approx(0.5)
    assert float32[1] is None
    assert cols['float64'] == [0.75, None]
    assert cols['int8'] == [12, None]
    assert cols['int16'] == [400, None]
    assert cols['int32'] == [132000, None]
    assert cols['int64'] == [6000000000, None]
    assert cols['string'] == ['string', 'blah']
    assert cols['timestamp'] == [datetime(2018, 7, 9, 0, 0), None]
开发者ID:cldellow,项目名称:csv2parquet,代码行数:34,代码来源:test_write.py

示例14: handle

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def handle(self):
    """Return the ParquetFile for ``self.path``, opening it lazily on first use."""
    cached = self._handle
    if cached is None:
        cached = pq.ParquetFile(self.path)
        self._handle = cached
    return cached
开发者ID:LSSTDESC,项目名称:gcr-catalogs,代码行数:6,代码来源:parquet.py

示例15: test_rowgroup_writing

# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetFile [as 别名]
def test_rowgroup_writing(store, use_categorical, chunk_size):
    """A chunk_size of 2 must split a 4-row frame into exactly 2 row groups."""
    frame = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)

    # NOTE: Arrow 0.9.0 could not write categorical columns across multiple
    # row groups ("ArrowIOError: Column 2 had 2 while previous column had 4");
    # the pandas serialiser works around this until the 0.10.0 upgrade, so the
    # categorical variant is exercised here as well.
    payload = frame.astype({"string": "category"}) if use_categorical else frame
    key = serialiser.store(store, "prefix", payload)

    assert ParquetFile(store.open(key)).num_row_groups == 2
开发者ID:JDASoftwareGroup,项目名称:kartothek,代码行数:17,代码来源:test_parquet.py


注:本文中的pyarrow.parquet.ParquetFile方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。