本文整理汇总了Python中pyarrow.parquet.ParquetDataset类的典型用法代码示例。如果您正苦于以下问题：Python parquet.ParquetDataset的具体用法？Python parquet.ParquetDataset怎么用？Python parquet.ParquetDataset使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pyarrow.parquet的用法示例。
在下文中一共展示了parquet.ParquetDataset类的14个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_schema_from_dataset_url
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def get_schema_from_dataset_url(dataset_url_or_urls, hdfs_driver='libhdfs3'):
    """Load the :class:`petastorm.unischema.Unischema` stored with a dataset that is addressed by url.

    :param dataset_url_or_urls: a url to a parquet directory or a url list (with the same scheme) to parquet files.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    filesystem, dataset_location = get_filesystem_and_path_or_paths(dataset_url_or_urls, hdfs_driver)

    # Open the dataset with schema validation disabled.
    parquet_dataset = pq.ParquetDataset(
        dataset_location,
        filesystem=filesystem,
        validate_schema=False,
        metadata_nthreads=10
    )

    # The unischema is read back from the dataset metadata.
    return get_schema(parquet_dataset)
示例2: test_asymetric_parquet_pieces
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Verify that every row can be read from a dataset whose parquet files have differing numbers of
    row groups."""
    expected_row_count = 1000
    dataset_dir = tmpdir.strpath
    url = 'file://' + dataset_dir

    # Partitioning by id_div_700 forces an asymmetric split between partitions, which hopefully yields
    # files with different numbers of row groups.
    create_test_scalar_dataset(url, expected_row_count, partition_by=['id_div_700'])

    # Precondition: the pieces really do differ in their row-group counts.
    dataset = pq.ParquetDataset(dataset_dir)
    distinct_row_group_counts = {
        compat_get_metadata(piece, dataset.fs.open).num_row_groups
        for piece in dataset.pieces
    }
    assert len(distinct_row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        id_batches = [batch.id for batch in reader]
        all_row_ids = list(itertools.chain.from_iterable(id_batches))

    assert expected_row_count == len(all_row_ids)
示例3: test_regenerate_metadata
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def test_regenerate_metadata(synthetic_dataset, tmpdir):
    """Check that a dataset copy whose common metadata file was removed is readable again once the
    metadata has been regenerated."""
    moved_dataset_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, moved_dataset_path)

    # Sanity check: the copy is readable before anything is removed.
    _check_reader(moved_dataset_path)

    # Remove the common metadata file of the copy.
    common_metadata_file = pq.ParquetDataset(moved_dataset_path).common_metadata_path
    os.remove(common_metadata_file)

    # make_reader should not be able to read a dataset without Petastorm metadata.
    with pytest.raises(RuntimeError, match='make_reader supports reading only Petastorm datasets'):
        _check_reader(moved_dataset_path)

    # Regenerate all metadata including unischema information
    regenerate_args = [
        '--dataset_url', 'file://{}'.format(moved_dataset_path),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ]
    petastorm_generate_metadata._main(regenerate_args)

    # Reader should now work again (row group selector will not since we removed all metadata)
    _check_reader(moved_dataset_path)
示例4: test_partition_cols_supported
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def test_partition_cols_supported(self, pa, df_full):
    """Write ``df_full`` partitioned by two columns and check the partition names pyarrow reports."""
    # GH #23283
    expected_partition_cols = ['bool', 'int']
    with tm.ensure_clean_dir() as path:
        df_full.to_parquet(path, partition_cols=expected_partition_cols, compression=None)

        import pyarrow.parquet as pq
        written = pq.ParquetDataset(path, validate_schema=False)
        reported_names = written.partitions.partition_names
        assert len(reported_names) == 2
        assert reported_names == set(expected_partition_cols)
示例5: read_parquet
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 use_threads=True, use_pandas_metadata=False):
    """
    Read Parquet data from a path in this file system.

    The path may refer to a single file or to a directory of files.

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    use_threads : boolean, default True
        Perform multi-threaded column reads
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset

    # This object itself is handed to ParquetDataset as the filesystem.
    dataset_options = dict(schema=schema, metadata=metadata, filesystem=self)
    read_options = dict(columns=columns, use_threads=use_threads,
                        use_pandas_metadata=use_pandas_metadata)
    return ParquetDataset(path, **dataset_options).read(**read_options)
示例6: _filter_row_groups
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def _filter_row_groups(self, dataset, row_groups, predicate, rowgroup_selector, cur_shard,
                       shard_count):
    """Calculates which rowgroups will be read.

    The following filters are applied, in this order:

    - predicates;
    - row-group selector (our indexing mechanism);
    - training partition

    A warning is issued when no row group is left after filtering.

    :param dataset: ParquetDataset instance
    :param row_groups: a list of row groups (a list of ParquetDatasetPiece objects)
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param cur_shard: An int denoting the current shard number used. Each node should
        pass in a unique partition number in the range [0, shard_count).
    :param shard_count: An int denoting the number of reader shards
    :return: (filtered_row_group_indexes, worker_predicate): filtered_row_group_indexes an integer index into
        row_groups array. worker_predicate contains only predicates that could not be resolved on the
        partitioned fields and need to be evaluated by workers.
    """
    selected_indexes, worker_predicate = self._apply_predicate_to_row_groups(dataset, row_groups, predicate)

    if rowgroup_selector:
        selected_indexes = self._apply_row_group_selector(dataset, rowgroup_selector, selected_indexes)

    # Sharding is applied as soon as either of the two sharding arguments is given.
    sharding_requested = not (cur_shard is None and shard_count is None)
    if sharding_requested:
        selected_indexes = self._partition_row_groups(dataset, row_groups, shard_count, cur_shard,
                                                      selected_indexes)

    if not selected_indexes:
        warnings.warn('No matching data is available for loading after rowgroup '
                      'selector were applied and the data was sharded.')

    return selected_indexes, worker_predicate
示例7: _generate_unischema_metadata
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def _generate_unischema_metadata(dataset, schema):
    """
    Pickles the unischema and stores it in the dataset parquet metadata under ``UNISCHEMA_KEY`` so it can be
    used upon reading.

    :param dataset: (ParquetDataset) Dataset to attach schema
    :param schema: (Unischema) Schema to attach to dataset; must be truthy
    :return: None
    """
    # TODO(robbieg): Simply pickling unischema will break if the UnischemaField class is changed,
    # or the codec classes are changed. We likely need something more robust.
    assert schema
    utils.add_to_dataset_metadata(dataset, UNISCHEMA_KEY, pickle.dumps(schema))
示例8: _generate_num_row_groups_per_file
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def _generate_num_row_groups_per_file(dataset, spark_context, filesystem_factory):
    """
    Generates the metadata file containing the number of row groups in each file
    for the parquet dataset located at the dataset_url. It does this in spark by
    opening all parquet files in the dataset on the executors and collecting the
    number of row groups in each file back on the driver.

    :param dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param spark_context: spark context to use for retrieving the number of row groups
        in each parquet file in parallel
    :param filesystem_factory: a zero-argument callable returning the filesystem object that is used to
        open the parquet files inside the spark tasks
    :return: None, upon successful completion the metadata file will exist.
    :raises ValueError: if ``dataset.paths`` is not a single string path
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    # Needed pieces from the dataset must be extracted for spark because the dataset object is not serializable
    base_path = dataset.paths
    paths = [piece.path for piece in dataset.pieces]

    def get_row_group_info(path):
        fs = filesystem_factory()
        # Files are keyed by their path relative to the dataset base path.
        relative_path = os.path.relpath(path, base_path)
        pq_file = fs.open(path)
        try:
            num_row_groups = pq.read_metadata(pq_file).num_row_groups
        finally:
            # Release the file handle even when reading the parquet footer fails.
            pq_file.close()
        return relative_path, num_row_groups

    # One spark partition per parquet file.
    row_groups = spark_context.parallelize(paths, len(paths)) \
        .map(get_row_group_info) \
        .collect()
    num_row_groups_str = json.dumps(dict(row_groups))

    # Add the dict for the number of row groups in each file to the parquet file metadata footer
    utils.add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, num_row_groups_str)
示例9: test_normalize_shuffle_partitions
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def test_normalize_shuffle_partitions(synthetic_dataset):
    """Check what ``Reader._normalize_shuffle_options`` returns for a small and for an oversized request."""
    parquet_dataset = pq.ParquetDataset(synthetic_dataset.path)

    # A request of 2 comes back unchanged.
    small_request = Reader._normalize_shuffle_options(2, parquet_dataset)
    assert small_request == 2

    # A request of 1000 comes back as 10.
    # NOTE(review): presumably capped by a property of the synthetic dataset -- confirm.
    oversized_request = Reader._normalize_shuffle_options(1000, parquet_dataset)
    assert oversized_request == 10
示例10: parquet_file_schema
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def parquet_file_schema(file_name):
    """Return the column names of a parquet dataset together with their numba types.

    :param file_name: path that is handed to ``pyarrow.parquet.ParquetDataset``
    :return: tuple ``(col_names, col_types)``; ``col_types[i]`` is the result of
        ``_get_numba_typ_from_pa_typ`` applied to the arrow type of column ``col_names[i]``
    """
    import pyarrow.parquet as pq

    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    # Column types are looked up by name in the arrow form of the schema.
    pa_schema = pq_dataset.schema.to_arrow_schema()
    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types
示例11: _read_parquet_columns
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.
        kwargs: Dict of extra keyword arguments forwarded to ``pyarrow.parquet.ParquetDataset``.

    Returns:
        A list containing the split Pandas DataFrames and the Index as the last
        element. If there is not `index_col` set, then we just return the length.
        This is used to determine the total length of the DataFrame to build a
        default Index.
    """
    import pyarrow.parquet as pq

    parquet_dataset = pq.ParquetDataset(path, **kwargs)
    arrow_table = parquet_dataset.read(columns=columns, use_pandas_metadata=True)
    # Select the requested columns again on the converted frame.
    frame = arrow_table.to_pandas()[columns]

    split_frames = _split_result_for_readers(0, num_splits, frame)
    # Append the length of the index here to build it externally
    return split_frames + [len(frame.index)]
示例12: process
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and publishes all rows matching the predicate from a rowgroup.

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate criteria
    the rest of the columns are not loaded.

    :param piece_index: index of the requested piece in ``self._split_pieces``
    :param worker_predicate: predicate evaluated by the worker; when falsy, rows are loaded through the
        local cache instead
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
        of partitions.
    :return: None; non-empty results are handed to ``self.publish_func``
    :raises RuntimeError: if a local cache other than ``NullCache`` is combined with a worker predicate or
        with more than one shuffle row drop partition
    """
    # The dataset is created on first use.
    if not self._dataset:
        self._dataset = pq.ParquetDataset(self._dataset_path_or_paths,
                                          filesystem=self._filesystem,
                                          validate_schema=False)

    if self._dataset.partitions is None:
        # When read from parquet file list, the `dataset.partitions` will be None.
        # But other petastorm code require at least an empty `ParquetPartitions` object.
        self._dataset.partitions = pq.ParquetPartitions()

    piece = self._split_pieces[piece_index]

    # Open the parquet file of the piece through the dataset's filesystem.
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    caching_enabled = not isinstance(self._local_cache, NullCache)
    if caching_enabled and worker_predicate:
        raise RuntimeError('Local cache is not supported together with predicates, '
                           'unless the dataset is partitioned by the column the predicate operates on.')
    if caching_enabled and shuffle_row_drop_partition[1] != 1:
        raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate,
                                                  shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible with
        #     some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        dataset_paths = self._dataset_path_or_paths
        path_str = ','.join(dataset_paths) if isinstance(dataset_paths, list) else dataset_paths
        path_digest = hashlib.md5(path_str.encode('utf-8')).hexdigest()
        cache_key = '{}:{}:{}'.format(path_digest, piece.path, piece_index)

        def load_rows():
            return self._load_rows(parquet_file, piece, shuffle_row_drop_partition)

        all_cols = self._local_cache.get(cache_key, load_rows)

    if all_cols:
        self.publish_func(all_cols)
示例13: process
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and publishes all rows matching the predicate from a rowgroup.

    Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
    columns needed by the predicate are loaded first. If no rows in the rowgroup matches the predicate criteria
    the rest of the columns are not loaded.

    :param piece_index: index of the requested piece in ``self._split_pieces``
    :param worker_predicate: predicate evaluated by the worker; when falsy, rows are loaded through the
        local cache instead
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
        of partitions.
    :return: None; non-empty results are handed to ``self.publish_func``
    :raises RuntimeError: if a local cache other than ``NullCache`` is combined with a worker predicate or
        with more than one shuffle row drop partition
    """
    # The dataset is created on first use.
    if not self._dataset:
        self._dataset = pq.ParquetDataset(self._dataset_path,
                                          filesystem=self._filesystem,
                                          validate_schema=False)

    piece = self._split_pieces[piece_index]

    # Open the parquet file of the piece through the dataset's filesystem.
    parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

    caching_enabled = not isinstance(self._local_cache, NullCache)
    if caching_enabled and worker_predicate:
        raise RuntimeError('Local cache is not supported together with predicates, '
                           'unless the dataset is partitioned by the column the predicate operates on.')
    if caching_enabled and shuffle_row_drop_partition[1] != 1:
        raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

    if worker_predicate:
        all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate,
                                                  shuffle_row_drop_partition)
    else:
        # Using hash of the dataset path with the relative path in order to:
        #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
        #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible with
        #     some cache implementations
        #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
        path_digest = hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest()
        cache_key = '{}:{}:{}'.format(path_digest, piece.path, piece_index)

        def load_rows():
            return self._load_rows(parquet_file, piece, shuffle_row_drop_partition)

        all_cols = self._local_cache.get(cache_key, load_rows)

    # When an ngram is configured, the loaded rows are passed through it before publishing.
    if self._ngram:
        all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

    if all_cols:
        self.publish_func(all_cols)
示例14: build_rowgroup_index
# 需要导入模块: from pyarrow import parquet [as 别名]
# 或者: from pyarrow.parquet import ParquetDataset [as 别名]
def build_rowgroup_index(dataset_url, spark_context, indexers, hdfs_driver='libhdfs3'):
    """
    Build index for given list of fields to use for fast rowgroup selection

    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects to build row groups indexes. Should support RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None, upon successful completion the rowgroup predicates will be saved to _metadata file
    """
    # Drop a single trailing slash from the url.
    if dataset_url and dataset_url.endswith('/'):
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url, spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need direct reference on partitions object
    partitions = dataset.partitions

    # indexes relies on the ordering of the split dataset pieces.
    # This relies on how the dataset pieces are split and sorted which although should not change,
    # still might and we should make sure not to forget that could break this.
    piece_info_list = [
        PieceInfo(position, piece.path, piece.row_group, piece.partition_keys)
        for position, piece in enumerate(split_pieces)
    ]

    start_time = time.time()

    def index_piece(piece_info):
        return _index_columns(piece_info, dataset_url, partitions, indexers, schema, hdfs_driver=hdfs_driver)

    slice_count = min(len(piece_info_list), PARALLEL_SLICE_NUM)
    indexer_list = spark_context.parallelize(piece_info_list, slice_count) \
        .map(index_piece) \
        .reduce(_combine_indexers)

    # The combined indexers are stored keyed by their index name.
    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, serialized_indexers)

    logger.info("Elapsed time of index creation: %f s", (time.time() - start_time))