This article collects typical usage examples of the Python method pyarrow.parquet.ParquetFile. If you are wondering what parquet.ParquetFile does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples from its containing module, pyarrow.parquet.
Below are 15 code examples of parquet.ParquetFile, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help recommend better Python code examples.
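Before diving into the examples, here is a minimal sketch of the ParquetFile calls they all build on. The file name and data are illustrative, not taken from any example below:

import pyarrow as pa
import pyarrow.parquet as pq

# Write a tiny table so there is a file to open.
table = pa.table({"a": ["1", "2", "3"], "b": ["a", "b", "c"]})
pq.write_table(table, "example.parquet")

pf = pq.ParquetFile("example.parquet")
print(pf.num_row_groups)          # number of row groups in the file
print(pf.schema)                  # Parquet schema (column names and types)
row_group = pf.read_row_group(0)  # one row group, materialised as a pyarrow.Table
print(row_group.num_rows)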
Example 1: test_write_from_csv
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_from_csv():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 1
    schema = pqf.schema
    assert schema.names == ['a', 'b']
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['a', 'b', 'c']
Example 2: table
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def table(self, name: str, path: Optional[str] = None) -> ir.TableExpr:
    if name not in self.list_tables(path):
        raise AttributeError(name)
    if path is None:
        path = self.root
    # get the schema
    f = path / "{}.parquet".format(name)
    parquet_file = pq.ParquetFile(str(f))
    schema = sch.infer(parquet_file.schema)
    table = self.table_class(name, schema, self).to_expr()
    self.dictionary[name] = f
    return table
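The ibis-specific pieces above (sch.infer, table_class) aside, obtaining an Arrow schema from a Parquet file is plain pyarrow. A short sketch, reusing the illustrative example.parquet from the introduction:

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")
arrow_schema = pf.schema.to_arrow_schema()  # pyarrow.Schema with field names and types
print(arrow_schema)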
Example 3: test_predicate_accept_in
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_predicate_accept_in(store, predicate_value, expected):
    df = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
    predicate = ("A", "in", predicate_value)
    serialiser = ParquetSerializer(chunk_size=None)
    key = serialiser.store(store, "prefix", df)
    parquet_file = ParquetFile(store.open(key))
    row_meta = parquet_file.metadata.row_group(0)
    arrow_schema = parquet_file.schema.to_arrow_schema()
    parquet_reader = parquet_file.reader
    assert (
        _predicate_accepts(
            predicate,
            row_meta=row_meta,
            arrow_schema=arrow_schema,
            parquet_reader=parquet_reader,
        )
        == expected
    )
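ParquetSerializer and _predicate_accepts belong to the host project's serialiser layer, not to pyarrow, but the row-group statistics such a predicate check relies on are exposed by pyarrow itself. A sketch, again using the illustrative example.parquet:

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")
stats = pf.metadata.row_group(0).column(0).statistics
if stats is not None and stats.has_min_max:
    # An "in" predicate can only match this row group if at least one
    # candidate value lies within [stats.min, stats.max].
    print(stats.min, stats.max)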
Example 4: test_convert_json
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_convert_json():
    """
    Test converting a JSON file to Parquet
    """
    schema = pa.schema([
        pa.field("foo", pa.int32()),
        pa.field("bar", pa.int64())
    ])
    input_path = "{}/tests/fixtures/simple_json.txt".format(os.getcwd())
    expected_file = "{}/tests/fixtures/simple.parquet".format(os.getcwd())
    with tempfile.NamedTemporaryFile() as f:
        output_file = f.name
        client.convert_json(input_path, output_file, schema)
        output = pq.ParquetFile(output_file)
        expected = pq.ParquetFile(expected_file)
        assert output.metadata.num_columns == expected.metadata.num_columns
        assert output.metadata.num_rows == expected.metadata.num_rows
        assert output.schema.equals(expected.schema)
        assert output.read_row_group(0).to_pydict() == expected.read_row_group(0).to_pydict()
Example 5: test_write_from_tsv
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_from_tsv():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple2.tsv'])
    pqf = pq.ParquetFile('csvs/simple2.parquet')
    assert pqf.num_row_groups == 1
    schema = pqf.schema
    assert schema.names == ['a', 'b']
    assert schema.column(0).logical_type.type == 'STRING'
    assert schema.column(1).logical_type.type == 'STRING'
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 1
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1']
    col_b = row_group.column(1).to_pylist()
    assert col_b == ['b']
Example 6: test_write_rename
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_rename():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/simple.csv', '--rename', '0=alpha', 'b=bee'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['alpha', 'bee']
Example 7: test_write_row_group_size
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_row_group_size():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--row-group-size', '1'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    assert pqf.num_row_groups == 3
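pyarrow's own writer accepts a row_group_size argument that achieves the same layout directly. A sketch with an illustrative file name:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"a": ["1", "2", "3"]})
pq.write_table(table, "grouped.parquet", row_group_size=1)  # one row per row group
assert pq.ParquetFile("grouped.parquet").num_row_groups == 3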
Example 8: test_write_limit
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_limit():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--rows', '1'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 1
Example 9: test_write_include_by_index
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_include_by_index():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--include', '0'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['a']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_a = row_group.column(0).to_pylist()
    assert col_a == ['1', '2', '3']
Example 10: test_write_exclude_by_name
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_exclude_by_name():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--exclude', 'a'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['b']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_b = row_group.column(0).to_pylist()
    assert col_b == ['a', 'b', 'c']
Example 11: test_write_exclude_by_index
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_write_exclude_by_index():
    csv2parquet.main_with_args(csv2parquet.convert, ['csvs/simple.csv', '--exclude', '0'])
    pqf = pq.ParquetFile('csvs/simple.parquet')
    schema = pqf.schema
    assert schema.names == ['b']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 3
    col_b = row_group.column(0).to_pylist()
    assert col_b == ['a', 'b', 'c']
Example 12: test_required_types
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_required_types():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/types.csv', '--type',
                                'bool=bool', 'float32=float32', 'float64=float64', 'int8=int8',
                                'int16=int16', 'int32=int32', 'int64=int64', 'string=string',
                                'timestamp=timestamp'])
    pqf = pq.ParquetFile('csvs/types.parquet')
    schema = pqf.schema
    assert schema.names == ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
                            'string', 'timestamp']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2
    bools = row_group.column(0).to_pylist()
    assert bools == [True, False]
    float32 = row_group.column(1).to_pylist()
    assert float32 == pytest.approx([0.5, 0.6])
    float64 = row_group.column(2).to_pylist()
    assert float64 == [0.75, 1.75]
    int8 = row_group.column(3).to_pylist()
    assert int8 == [12, 13]
    int16 = row_group.column(4).to_pylist()
    assert int16 == [400, 401]
    int32 = row_group.column(5).to_pylist()
    assert int32 == [132000, 132001]
    int64 = row_group.column(6).to_pylist()
    assert int64 == [6000000000, 6000000001]
    string = row_group.column(7).to_pylist()
    assert string == ['string', 'string']
    timestamp = row_group.column(8).to_pylist()
    assert timestamp == [datetime(2018, 7, 9, 0, 0), datetime(2018, 7, 10, 0, 0)]
Example 13: test_opt_invalid_types
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_opt_invalid_types():
    csv2parquet.main_with_args(csv2parquet.convert,
                               ['csvs/invalid-types.csv', '--type',
                                'bool=bool?', 'float32=float32?', 'float64=float64?', 'int8=int8?',
                                'int16=int16?', 'int32=int32?', 'int64=int64?', 'string=string?',
                                'timestamp=timestamp?'])
    pqf = pq.ParquetFile('csvs/invalid-types.parquet')
    schema = pqf.schema
    assert schema.names == ['bool', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64',
                            'string', 'timestamp']
    row_group = pqf.read_row_group(0)
    assert row_group.num_rows == 2
    bools = row_group.column(0).to_pylist()
    assert bools == [True, None]
    float32 = row_group.column(1).to_pylist()
    assert len(float32) == 2
    assert float32[0] == pytest.approx(0.5)
    assert float32[1] is None
    float64 = row_group.column(2).to_pylist()
    assert float64 == [0.75, None]
    int8 = row_group.column(3).to_pylist()
    assert int8 == [12, None]
    int16 = row_group.column(4).to_pylist()
    assert int16 == [400, None]
    int32 = row_group.column(5).to_pylist()
    assert int32 == [132000, None]
    int64 = row_group.column(6).to_pylist()
    assert int64 == [6000000000, None]
    string = row_group.column(7).to_pylist()
    assert string == ['string', 'blah']
    timestamp = row_group.column(8).to_pylist()
    assert timestamp == [datetime(2018, 7, 9, 0, 0), None]
Example 14: handle
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def handle(self):
    if self._handle is None:
        self._handle = pq.ParquetFile(self.path)
    return self._handle
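In context, handle is a lazily-initialised accessor: the ParquetFile is opened on first use and the handle is cached for later calls. A self-contained sketch of the same pattern, with illustrative class and attribute names:

import pyarrow.parquet as pq

class ParquetSource:
    """Illustrative wrapper that opens its Parquet file lazily."""

    def __init__(self, path):
        self.path = path
        self._handle = None

    @property
    def handle(self):
        # Open the file on first access, then reuse the cached handle.
        if self._handle is None:
            self._handle = pq.ParquetFile(self.path)
        return self._handle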
Example 15: test_rowgroup_writing
# Required import: from pyarrow import parquet as pq
# Or: from pyarrow.parquet import ParquetFile
def test_rowgroup_writing(store, use_categorical, chunk_size):
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)
    # Arrow 0.9.0 has a bug in writing categorical columns to more than a single
    # RowGroup: "ArrowIOError: Column 2 had 2 while previous column had 4".
    # We have special handling for that in pandas-serialiser that should be
    # removed once we switch to 0.10.0
    if use_categorical:
        df_write = df.astype({"string": "category"})
    else:
        df_write = df
    key = serialiser.store(store, "prefix", df_write)
    parquet_file = ParquetFile(store.open(key))
    assert parquet_file.num_row_groups == 2
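Reading such a chunked file back one row group at a time keeps memory bounded. A sketch, assuming the grouped.parquet written in the sketch under Example 7:

import pyarrow.parquet as pq

pf = pq.ParquetFile("grouped.parquet")
for i in range(pf.num_row_groups):
    batch = pf.read_row_group(i)  # each row group arrives as its own pyarrow.Table
    print(i, batch.num_rows)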