

Python parquet.ParquetDataset Method Code Examples

This article collects typical usage examples of the pyarrow.parquet.ParquetDataset method in Python. If you are struggling with questions such as how exactly parquet.ParquetDataset is used, or what working examples of it look like, the curated examples below may help. You can also explore further usage examples from the pyarrow.parquet module it belongs to.


The following shows 14 code examples of the parquet.ParquetDataset method; by default they are ordered by popularity.
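Before the individual examples, here is a minimal sketch of the basic pattern most of them build on: open a dataset from a Parquet file or a directory of Parquet files, read it into a pyarrow.Table, and optionally convert it to pandas. The path below is only illustrative; note also that several examples on this page pass legacy ParquetDataset keyword arguments (such as validate_schema and metadata_nthreads) that newer pyarrow releases have removed in favor of the pyarrow.dataset-backed implementation.

import pyarrow.parquet as pq

# '/tmp/example_dataset' is a placeholder for a Parquet file or a directory of Parquet files.
dataset = pq.ParquetDataset('/tmp/example_dataset')
table = dataset.read()        # returns a pyarrow.Table; pass columns=[...] to read a subset of columns
df = table.to_pandas()        # convert to a pandas DataFrame
print(len(df))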

Example 1: get_schema_from_dataset_url

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def get_schema_from_dataset_url(dataset_url_or_urls, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url_or_urls: a url to a parquet directory or a url list (with the same scheme) to parquet files.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    fs, path_or_paths = get_filesystem_and_path_or_paths(dataset_url_or_urls, hdfs_driver)

    dataset = pq.ParquetDataset(path_or_paths, filesystem=fs, validate_schema=False, metadata_nthreads=10)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema 
Developer: uber, Project: petastorm, Lines of code: 18, Source file: dataset_metadata.py
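A hypothetical call to the helper above might look like the following; the dataset URL is illustrative, and the module path is assumed to match where this source file lives in the petastorm project.

from petastorm.etl.dataset_metadata import get_schema_from_dataset_url

# Assumes a Petastorm-generated dataset exists at this (illustrative) location.
schema = get_schema_from_dataset_url('file:///tmp/my_petastorm_dataset')
print(list(schema.fields.keys()))   # field names of the loaded Unischema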

Example 2: test_asymetric_parquet_pieces

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that datasets with parquet files that all rows in datasets that have different number of rowgroups can
    be fully read """
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # id_div_700 forces an asymmetric split between partitions and should give us files with different
    # numbers of row groups
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that we have pieces with different numbers of row groups
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
        actual_row_ids = list(itertools.chain(*row_ids_batched))

    assert ROWS_COUNT == len(actual_row_ids) 
Developer: uber, Project: petastorm, Lines of code: 23, Source file: test_parquet_reader.py

Example 3: test_regenerate_metadata

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def test_regenerate_metadata(synthetic_dataset, tmpdir):
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    # Make sure we can read the dataset before the metadata is removed
    _check_reader(a_moved_path)

    # Delete the common metadata file
    dataset = pq.ParquetDataset(a_moved_path)
    os.remove(dataset.common_metadata_path)

    # make_reader should not be able to read a dataset without Petastorm metadata.
    with pytest.raises(RuntimeError, match='make_reader supports reading only Petastorm datasets'):
        _check_reader(a_moved_path)

    # Regenerate all metadata including unischema information
    petastorm_generate_metadata._main([
        '--dataset_url', 'file://{}'.format(a_moved_path),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ])

    # Reader should now work again (row group selector will not since we removed all metadata)
    _check_reader(a_moved_path) 
Developer: uber, Project: petastorm, Lines of code: 25, Source file: test_generate_metadata.py

Example 4: test_partition_cols_supported

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ['bool', 'int']
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols,
                          compression=None)
            import pyarrow.parquet as pq
            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols) 
Developer: Frank-qlu, Project: recruit, Lines of code: 13, Source file: test_parquet.py
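For context, a minimal end-to-end sketch of the pattern exercised by the test above: write a partitioned dataset with pandas and read it back with pyarrow. The path and column names are illustrative.

import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({'bool': [True, False], 'int': [1, 2], 'value': [0.1, 0.2]})
df.to_parquet('/tmp/partitioned_example', partition_cols=['bool', 'int'], compression=None)

# Reading the directory back restores the partition columns from the directory structure.
table = pq.read_table('/tmp/partitioned_example')
print(table.column_names)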

Example 5: read_parquet

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                         use_threads=True, use_pandas_metadata=False):
            """
            Read Parquet data from path in file system. Can read from a single file
            or a directory of files

            Parameters
            ----------
            path : str
                Single file path or directory
            columns : List[str], optional
                Subset of columns to read
            metadata : pyarrow.parquet.FileMetaData
                Known metadata to validate files against
            schema : pyarrow.parquet.Schema
                Known schema to validate files against. Alternative to metadata
                argument
            use_threads : boolean, default True
                Perform multi-threaded column reads
            use_pandas_metadata : boolean, default False
                If True and file has custom pandas schema metadata, ensure that
                index columns are also loaded

            Returns
            -------
            table : pyarrow.Table
            """
            from pyarrow.parquet import ParquetDataset
            dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                                     filesystem=self)
            return dataset.read(columns=columns, use_threads=use_threads,
                                use_pandas_metadata=use_pandas_metadata) 
Developer: mars-project, Project: mars, Lines of code: 34, Source file: filesystem.py

Example 6: _filter_row_groups

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def _filter_row_groups(self, dataset, row_groups, predicate, rowgroup_selector, cur_shard,
                           shard_count):
        """Calculates which rowgroups will be read during.

        The following filters are applied:
        - predicates;
        - row-group selector (our indexing mechanism);
        - training partition

        :param dataset: ParquetDataset instance
        :param row_groups: a list of row groups (a list of ParquetDatasetPiece objects)
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param cur_shard: An int denoting the current shard number used. Each node should
                       pass in a unique partition number in the range [0, shard_count).
        :param shard_count: An int denoting the number of reader shards
        :return: (filtered_row_group_indexes, worker_predicate): filtered_row_group_indexes is a list of integer
            indexes into the row_groups array. worker_predicate contains only the predicates that could not be
            resolved on the partitioned fields and need to be evaluated by workers.
        """

        filtered_row_group_indexes, worker_predicate = \
            self._apply_predicate_to_row_groups(dataset, row_groups, predicate)

        if rowgroup_selector:
            filtered_row_group_indexes = self._apply_row_group_selector(dataset, rowgroup_selector,
                                                                        filtered_row_group_indexes)

        if cur_shard is not None or shard_count is not None:
            filtered_row_group_indexes = self._partition_row_groups(dataset, row_groups, shard_count,
                                                                    cur_shard,
                                                                    filtered_row_group_indexes)

        if not filtered_row_group_indexes:
            warnings.warn('No matching data is available for loading after the rowgroup '
                          'selector was applied and the data was sharded.')

        return filtered_row_group_indexes, worker_predicate 
Developer: uber, Project: petastorm, Lines of code: 40, Source file: reader.py

Example 7: _generate_unischema_metadata

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def _generate_unischema_metadata(dataset, schema):
    """
    Generates the serialized unischema and adds it to the dataset parquet metadata to be used upon reading.
    :param dataset: (ParquetDataset) Dataset to attach schema
    :param schema:  (Unischema) Schema to attach to dataset
    :return: None
    """
    # TODO(robbieg): Simply pickling unischema will break if the UnischemaField class is changed,
    #  or the codec classes are changed. We likely need something more robust.
    assert schema
    serialized_schema = pickle.dumps(schema)
    utils.add_to_dataset_metadata(dataset, UNISCHEMA_KEY, serialized_schema) 
Developer: uber, Project: petastorm, Lines of code: 14, Source file: dataset_metadata.py

Example 8: _generate_num_row_groups_per_file

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def _generate_num_row_groups_per_file(dataset, spark_context, filesystem_factory):
    """
    Generates the metadata file containing the number of row groups in each file
    for the parquet dataset located at the dataset_url. It does this in spark by
    opening all parquet files in the dataset on the executors and collecting the
    number of row groups in each file back on the driver.
    :param dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param spark_context: spark context to use for retrieving the number of row groups
        in each parquet file in parallel
    :param filesystem_factory: a callable that returns the filesystem object used to open parquet files on the executors
    :return: None; upon successful completion the metadata file will exist.
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    # Collect the path of every piece; relative paths are later computed against the dataset base path
    paths = [piece.path for piece in dataset.pieces]

    # Needed pieces from the dataset must be extracted for spark because the dataset object is not serializable
    base_path = dataset.paths

    def get_row_group_info(path):
        fs = filesystem_factory()
        relative_path = os.path.relpath(path, base_path)
        pq_file = fs.open(path)
        num_row_groups = pq.read_metadata(pq_file).num_row_groups
        pq_file.close()
        return relative_path, num_row_groups

    row_groups = spark_context.parallelize(paths, len(paths)) \
        .map(get_row_group_info) \
        .collect()
    num_row_groups_str = json.dumps(dict(row_groups))
    # Add the dict for the number of row groups in each file to the parquet file metadata footer
    utils.add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, num_row_groups_str) 
Developer: uber, Project: petastorm, Lines of code: 36, Source file: dataset_metadata.py
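What the Spark job above collects for each file is simply the row-group count stored in the Parquet footer. A minimal local (non-Spark) sketch of the same computation, with an illustrative dataset path:

import glob
import pyarrow.parquet as pq

row_group_counts = {}
for path in glob.glob('/tmp/my_dataset/**/*.parquet', recursive=True):
    # ParquetFile exposes the footer metadata, including the number of row groups.
    row_group_counts[path] = pq.ParquetFile(path).metadata.num_row_groups
print(row_group_counts)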

Example 9: test_normalize_shuffle_partitions

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)
    row_drop_partitions = Reader._normalize_shuffle_options(2, dataset)
    assert row_drop_partitions == 2

    row_drop_partitions = Reader._normalize_shuffle_options(1000, dataset)
    assert row_drop_partitions == 10 
Developer: uber, Project: petastorm, Lines of code: 9, Source file: test_reader.py

Example 10: parquet_file_schema

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def parquet_file_schema(file_name):
    import pyarrow.parquet as pq
    col_names = []
    col_types = []

    pq_dataset = pq.ParquetDataset(file_name)
    col_names = pq_dataset.schema.names
    pa_schema = pq_dataset.schema.to_arrow_schema()

    col_types = [_get_numba_typ_from_pa_typ(pa_schema.field_by_name(c).type)
                 for c in col_names]
    # TODO: close file?
    return col_names, col_types 
Developer: IntelPython, Project: sdc, Lines of code: 15, Source file: parquet_pio.py
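A short sketch of the same schema-access pattern outside the SDC compiler (illustrative path). The extra hasattr check hedges against pyarrow versions where dataset.schema is already a pyarrow.Schema rather than the legacy ParquetSchema used above.

import pyarrow.parquet as pq

pq_dataset = pq.ParquetDataset('/tmp/example_dataset')   # illustrative path
schema = pq_dataset.schema
if hasattr(schema, 'to_arrow_schema'):   # legacy ParquetDataset returns a ParquetSchema
    schema = schema.to_arrow_schema()
for field in schema:                     # iterate the pyarrow.Schema fields
    print(field.name, field.type)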

Example 11: _read_parquet_columns

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def _read_parquet_columns(path, columns, num_splits, kwargs):  # pragma: no cover
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.
        kwargs: Additional keyword arguments passed through to pyarrow.parquet.ParquetDataset.

    Returns:
            A list containing the split Pandas DataFrames and the Index as the last
            element. If no `index_col` is set, then we just return the length.
            This is used to determine the total length of the DataFrame to build a
            default Index.
    """
    import pyarrow.parquet as pq

    df = (
        pq.ParquetDataset(path, **kwargs)
        .read(columns=columns, use_pandas_metadata=True)
        .to_pandas()
    )
    df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [len(df.index)] 
Developer: modin-project, Project: modin, Lines of code: 28, Source file: io_exp.py

Example 12: process

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index: index of the piece (a single row group) to load
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path_or_paths,
                filesystem=self._filesystem,
                validate_schema=False)

        if self._dataset.partitions is None:
            # When read from parquet file list, the `dataset.partitions` will be None.
            # But other petastorm code require at least an empty `ParquetPartitions` object.
            self._dataset.partitions = pq.ParquetPartitions()

        piece = self._split_pieces[piece_index]

        # Create pyarrow file system
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            if isinstance(self._dataset_path_or_paths, list):
                path_str = ','.join(self._dataset_path_or_paths)
            else:
                path_str = self._dataset_path_or_paths
            cache_key = '{}:{}:{}'.format(hashlib.md5(path_str.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if all_cols:
            self.publish_func(all_cols) 
Developer: uber, Project: petastorm, Lines of code: 57, Source file: arrow_reader_worker.py

Example 13: process

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index: index of the piece (a single row group) to load
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(
                self._dataset_path,
                filesystem=self._filesystem,
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Create pyarrow file system
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset path is hashed, to make sure we don't create too long keys, which may be incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(hashlib.md5(self._dataset_path.encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if self._ngram:
            all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

        if all_cols:
            self.publish_func(all_cols) 
Developer: uber, Project: petastorm, Lines of code: 51, Source file: py_dict_reader_worker.py

Example 14: build_rowgroup_index

# Required module import: from pyarrow import parquet [as alias]
# Or: from pyarrow.parquet import ParquetDataset [as alias]
def build_rowgroup_index(dataset_url, spark_context, indexers, hdfs_driver='libhdfs3'):
    """
    Build an index for the given list of fields to use for fast rowgroup selection.
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects to build row groups indexes. Should support RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
    libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None; upon successful completion the rowgroup indexes will be saved to the _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url, spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexes rely on the ordering of the split dataset pieces. This depends on how the dataset pieces
        # are split and sorted, which should not change but still might, so we should make sure
        # not to forget that such a change could break this.
        piece = split_pieces[piece_index]
        piece_info_list.append(PieceInfo(piece_index, piece.path, piece.row_group, piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(piece_info, dataset_url, partitions,
                                                                       indexers, schema, hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, serialized_indexers)
    logger.info("Elapsed time of index creation: %f s", (time.time() - start_time)) 
Developer: uber, Project: petastorm, Lines of code: 46, Source file: rowgroup_indexing.py


Note: The pyarrow.parquet.ParquetDataset examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors; for distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.