

Java BlockMetaData Class Code Examples

This article collects typical usage examples of the Java class parquet.hadoop.metadata.BlockMetaData. If you are wondering what the BlockMetaData class does, how to use it, or are looking for usage examples, the selected code examples below may help.


The BlockMetaData class belongs to the parquet.hadoop.metadata package. The sections below present 15 code examples of the BlockMetaData class, sorted by popularity by default.
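Before the examples, a minimal self-contained sketch (not taken from any of the projects below; the file path /tmp/data.parquet is a placeholder) shows how BlockMetaData objects are typically obtained: ParquetFileReader.readFooter reads the file footer into a ParquetMetadata, and getBlocks() returns one BlockMetaData per row group.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class BlockMetaDataSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/data.parquet"); // placeholder path
        // read only the footer; each row group of the file becomes one BlockMetaData
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("rows=" + block.getRowCount()
                    + ", startingPos=" + block.getStartingPos()
                    + ", compressedSize=" + block.getCompressedSize()
                    + ", totalByteSize=" + block.getTotalByteSize());
        }
    }
}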

Example 1: getRowGroupNumbersFromFileSplit

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the list of row group numbers for a given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
 
Developer: skhalifa, Project: QDrill, Lines: 25, Source: HiveDrillNativeScanBatchCreator.java

Example 2: initialize

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public void initialize(MessageType fileSchema,
                       Map<String, String> fileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Developer: grokcoder, Project: pbase, Lines: 22, Source: InternalParquetRecordReader.java

Example 3: initReader

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (footersIterator.hasNext()) {
        Footer footer = footersIterator.next();

        List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();

        MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();

        List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
                filter, blocks, fileSchema);

        fileInfo.setBlockMetaDatas(blocks);
        fileInfo.setFileSchema(fileSchema);
        fileInfo.setFilteredBlocks(filteredBlocks);

        reader = new InternalParquetRecordReader<T>(readSupport, filter);
        reader.initialize(fileSchema,
                footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
                footer.getFile(), filteredBlocks, conf);
    }
}
 
Developer: grokcoder, Project: pbase, Lines: 26, Source: ParquetReader.java

Example 4: add

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private static void add(ParquetMetadata footer) {
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++blockCount;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += blockMetaData.getRowCount();
        List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
        for (ColumnChunkMetaData columnMetaData : columns) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(
                    desc,
                    columnMetaData.getValueCount(),
                    columnMetaData.getTotalSize(),
                    columnMetaData.getTotalUncompressedSize(),
                    columnMetaData.getEncodings(),
                    columnMetaData.getStatistics());
        }
    }
}
 
Developer: grokcoder, Project: pbase, Lines: 19, Source: PrintFooter.java

Example 5: mergeFooters

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
 
Developer: grokcoder, Project: pbase, Lines: 22, Source: ParquetFileWriter.java

Example 6: checkBelongingToANewHDFSBlock

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * @param rowGroupMetadata metadata of the row group to check
 * @return true if the midpoint of the row group falls in a new HDFS block (and also advances the current HDFS block pointer to the index of the block that contains the row group);
 * false if the midpoint of the row group is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
    boolean isNewHdfsBlock = false;
    long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

    //if mid point is not in the current HDFS block any more, return true
    while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
        isNewHdfsBlock = true;
        currentMidPointHDFSBlockIndex++;
        if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length)
            throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
                    + rowGroupMidPoint
                    + ", the end of the hdfs block is "
                    + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
    }

    while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
        currentStartHdfsBlockIndex++;
        if (currentStartHdfsBlockIndex >= hdfsBlocks.length)
            throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
                    + rowGroupMetadata.getStartingPos()
                    + " but the end of hdfs blocks of file is "
                    + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
    }
    return isNewHdfsBlock;
}
 
Developer: grokcoder, Project: pbase, Lines: 31, Source: ParquetInputFormat.java
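To make the midpoint rule concrete, here is a tiny self-contained sketch with made-up offsets (not taken from the pbase project): a row group is assigned to whichever HDFS block contains the midpoint of its compressed byte range.

public class MidpointRuleSketch {
    public static void main(String[] args) {
        long hdfsBlockSize = 128L * 1024 * 1024;          // assume 128 MB HDFS blocks
        long rowGroupStart = 120L * 1024 * 1024;          // row group starts at 120 MB
        long rowGroupCompressedSize = 20L * 1024 * 1024;  // 20 MB compressed
        long midPoint = rowGroupStart + rowGroupCompressedSize / 2; // 130 MB
        // the midpoint falls into HDFS block index 1 (bytes 128 MB .. 256 MB),
        // so this row group is assigned to the second HDFS block
        int hdfsBlockIndex = (int) (midPoint / hdfsBlockSize);
        System.out.println("row group assigned to HDFS block #" + hdfsBlockIndex);
    }
}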

Example 7: generateSplits

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * groups together all the data blocks for the same HDFS block
 *
 * @param rowGroupBlocks      data blocks (row groups)
 * @param hdfsBlocksArray     hdfs blocks
 * @param fileStatus          the containing file
 * @param requestedSchema     the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize        the mapred.min.split.size
 * @param maxSplitSize        the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException If hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
        List<BlockMetaData> rowGroupBlocks,
        BlockLocation[] hdfsBlocksArray,
        FileStatus fileStatus,
        String requestedSchema,
        Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {

    List<SplitInfo> splitRowGroups =
            generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

    //generate splits from rowGroups of each split
    List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
    for (SplitInfo splitInfo : splitRowGroups) {
        ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
        resultSplits.add(split);
    }
    return resultSplits;
}
 
Developer: grokcoder, Project: pbase, Lines: 32, Source: ParquetInputFormat.java

Example 8: visit

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
    FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();

    // check that the schema of the filter matches the schema of the file
    SchemaCompatibilityValidator.validate(filterPredicate, schema);

    List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();

    for (BlockMetaData block : blocks) {
        if (!StatisticsFilter.canDrop(filterPredicate, block.getColumns())) {
            filteredBlocks.add(block);
        }
    }

    return filteredBlocks;
}
 
Developer: grokcoder, Project: pbase, Lines: 18, Source: RowGroupFilter.java
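For context, a hedged usage sketch of how the visitor above is typically reached (the column name "x" and the value 42 are assumptions, not taken from the pbase project): a FilterPredicate is wrapped via FilterCompat.get and passed to RowGroupFilter.filterRowGroups, which keeps only the row groups whose column statistics cannot rule out a match.

import static parquet.filter2.predicate.FilterApi.eq;
import static parquet.filter2.predicate.FilterApi.intColumn;

import java.util.List;

import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.RowGroupFilter;
import parquet.filter2.predicate.FilterPredicate;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.schema.MessageType;

public class RowGroupFilterSketch {
    // keep only the row groups that might contain rows where the assumed INT32 column "x" equals 42
    static List<BlockMetaData> keepMatchingRowGroups(List<BlockMetaData> blocks, MessageType fileSchema) {
        FilterPredicate predicate = eq(intColumn("x"), 42);
        FilterCompat.Filter filter = FilterCompat.get(predicate);
        return RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);
    }
}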

Example 9: toParquetMetadata

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    int numRows = 0;
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(
            currentVersion,
            toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
            numRows,
            rowGroups);

    Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }

    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    return fileMetaData;
}
 
Developer: grokcoder, Project: pbase, Lines: 23, Source: ParquetMetadataConverter.java

Example 10: generateSplitByDeprecatedConstructor

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
private List<ParquetInputSplit> generateSplitByDeprecatedConstructor(long min, long max) throws
        IOException {
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    List<ClientSideMetadataSplitStrategy.SplitInfo> splitInfos = ClientSideMetadataSplitStrategy
            .generateSplitInfo(blocks, hdfsBlocks, min, max);

    for (ClientSideMetadataSplitStrategy.SplitInfo splitInfo : splitInfos) {
        BlockMetaData lastRowGroup = splitInfo.getRowGroups().get(splitInfo.getRowGroupCount() - 1);
        long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

        ParquetInputSplit split = new ParquetInputSplit(fileStatus.getPath(),
                splitInfo.hdfsBlock.getOffset(), end, splitInfo.hdfsBlock.getHosts(),
                splitInfo.rowGroups, schema.toString(), null, null, extramd);
        splits.add(split);
    }

    return splits;
}
 
Developer: grokcoder, Project: pbase, Lines: 19, Source: TestInputFormat.java

Example 11: ParquetReader

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public ParquetReader(MessageType fileSchema,
        Map<String, String> extraMetadata,
        MessageType requestedSchema,
        Path file,
        List<BlockMetaData> blocks,
        Configuration configuration)
        throws IOException
{
    this.fileSchema = fileSchema;
    this.extraMetadata = extraMetadata;
    this.requestedSchema = requestedSchema;
    this.file = file;
    this.blocks = blocks;
    this.configuration = configuration;
    this.fileReader = new ParquetFileReader(configuration, file, blocks, requestedSchema.getColumns());
    for (BlockMetaData block : blocks) {
        fileRowCount += block.getRowCount();
    }
}
 
Developer: y-lan, Project: presto, Lines: 20, Source: ParquetReader.java

Example 12: ParquetFileReader

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
public ParquetFileReader(
        Configuration configuration,
        Path file,
        List<BlockMetaData> blocks,
        List<ColumnDescriptor> columns)
        throws IOException
{
    this.file = file;
    this.inputStream = file.getFileSystem(configuration).open(file);
    this.blocks = blocks;
    if (!blocks.isEmpty()) {
        for (ColumnDescriptor columnDescriptor : columns) {
            for (ColumnChunkMetaData metadata : blocks.get(0).getColumns()) {
                if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
                    columnMetadata.put(columnDescriptor, metadata);
                }
            }
        }
    }
    this.codecFactory = new ParquetCodecFactory(configuration);
}
 
Developer: y-lan, Project: presto, Lines: 22, Source: ParquetFileReader.java

Example 13: getBlocks

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the blocks (row groups) of the Parquet files.
 * @return the list of BlockMetaData of all the files
 */
public List<BlockMetaData> getBlocks ()
{
    if (this.blockMetaDataList == null || this.blockMetaDataList.size() == 0)
    {
        this.blockMetaDataList = new ArrayList<BlockMetaData>();
        for (ParquetFileMetadata meta : this.fileMetaDataList)
        {
            this.blockMetaDataList.addAll(meta.getBlocks());
        }
    }
    return this.blockMetaDataList;
}
 
Developer: dbiir, Project: rainbow, Lines: 17, Source: ParquetMetadataStat.java

Example 14: getTotalSize

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
/**
 * Get the total compressed size of the Parquet files.
 * @return the total compressed size in bytes
 */
@Override
public long getTotalSize ()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks())
    {
        size += meta.getCompressedSize();
    }
    return size;
}
 
Developer: dbiir, Project: rainbow, Lines: 15, Source: ParquetMetadataStat.java

Example 15: testGetStat

import parquet.hadoop.metadata.BlockMetaData; // import the required package/class
@Test
public void testGetStat () throws IOException, MetadataException
{
    //ParquetMetadataStat stat = new ParquetMetadataStat("10.172.96.77", 9000, "/tmp/hive-root/hive_2015-02-04_10-57-36_131_1404874572956637570-1/_tmp.-ext-10002");
    ParquetMetadataStat stat = new ParquetMetadataStat("192.168.124.15", 9000, "/msra/parquet_test");
    double[] columnSize = stat.getAvgColumnChunkSize();
    List<String> names = stat.getFieldNames();
    int i = 0;
    double total = 0;
    for (double size: columnSize)
    {
        //System.out.println(names.get(i) + "\t" + size);
        total += size;
        i++;
    }
    System.out.println(total/1024/1024 + "\t" + stat.getRowGroupCount() + "\t" + stat.getFileCount());

    for (BlockMetaData bm : stat.getBlocks())
    {
        System.out.println(bm.getCompressedSize() + ", " + bm.getTotalByteSize() + ", " + bm.getRowCount());
    }

    List<ParquetFileMetadata> metaDatas = stat.getFileMetaData();
    System.out.println(metaDatas.get(0).getFileMetaData().getCreatedBy());
    Map<String, String> keyValueMetaData = metaDatas.get(0).getFileMetaData().getKeyValueMetaData();
    for (String key : keyValueMetaData.keySet())
    {
        System.out.println(key + "=" + keyValueMetaData.get(key));
    }

    for (int j = 0; j < names.size(); ++j)
    {
        System.out.println(names.get(j) + "\t" + columnSize[j] + "\n");
    }
}
 
Developer: dbiir, Project: rainbow, Lines: 36, Source: TestParquetMetadataStat.java


Note: The parquet.hadoop.metadata.BlockMetaData class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For redistribution and use, please refer to the license of the corresponding project; do not republish without permission.