

Java ParquetMetadata.getBlocks Method Code Examples

This article collects typical usage examples of the Java method org.apache.parquet.hadoop.metadata.ParquetMetadata.getBlocks. If you are wondering how ParquetMetadata.getBlocks is used in practice and what real code that calls it looks like, the curated examples below should help. You can also explore further usage examples of the enclosing class, org.apache.parquet.hadoop.metadata.ParquetMetadata.


The following shows 15 code examples of the ParquetMetadata.getBlocks method, sorted by popularity by default.
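Before the individual examples, a minimal self-contained sketch of the pattern most of them share may be useful: read the file footer with ParquetFileReader.readFooter, then iterate over footer.getBlocks() to inspect each row group (BlockMetaData). The class name GetBlocksSketch and the path /tmp/example.parquet are placeholder assumptions for illustration; they do not come from the examples themselves.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class GetBlocksSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    Configuration conf = new Configuration();

    // Read only the footer (no data pages), then walk the row groups it describes.
    ParquetMetadata footer = ParquetFileReader.readFooter(
        conf, path, ParquetMetadataConverter.NO_FILTER);

    long totalRows = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      totalRows += block.getRowCount();
      System.out.println("row group at offset " + block.getStartingPos()
          + ": rows=" + block.getRowCount()
          + ", columns=" + block.getColumns().size());
    }
    System.out.println("total rows: " + totalRows);
  }
}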

Example 1: test

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 19, Source: TestStatistics.java

Example 2: getRowGroupNumbersFromFileSplit

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
 
Developer: dremio, Project: dremio-oss, Lines: 25, Source: FileSplitParquetRecordReader.java

Example 3: read

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void read(String fileName) throws IOException
{
    Path path = new Path(fileName);
    Configuration conf = new Configuration();
    conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());

    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
    PageReadStore pageReadStore;
    PageReader pageReader;
    DataPage page;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor cd: metadata.getFileMetaData().getSchema().getColumns()) {
            pageReader = pageReadStore.getPageReader(cd);
            page = pageReader.readPage();
        }
    }
}
 
Developer: dbiir, Project: RealtimeAnalysis, Lines: 20, Source: ParquetFileReaderTest.java

Example 4: getRowGroupNumbersFromFileSplit

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
 
Developer: axbaretto, Project: drill, Lines: 25, Source: HiveDrillNativeScanBatchCreator.java

Example 5: checkCompatibility

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static void checkCompatibility(ParquetMetadata metadata) {
  // make sure we can map Parquet blocks to Chunks
  for (BlockMetaData block : metadata.getBlocks()) {
    if (block.getRowCount() > Integer.MAX_VALUE) {
      IcedHashMapGeneric.IcedHashMapStringObject dbg = new IcedHashMapGeneric.IcedHashMapStringObject();
      dbg.put("startingPos", block.getStartingPos());
      dbg.put("rowCount", block.getRowCount());
      throw new H2OUnsupportedDataFileException("Unsupported Parquet file (technical limitation).",
              "Current implementation doesn't support Parquet files with blocks larger than " +
              Integer.MAX_VALUE + " rows.", dbg); // because we map each block to a single H2O Chunk
    }
  }
  // check that file doesn't have nested structures
  MessageType schema = metadata.getFileMetaData().getSchema();
  for (String[] path : schema.getPaths())
    if (path.length != 1) {
      throw new H2OUnsupportedDataFileException("Parquet files with nested structures are not supported.",
              "Detected a column with a nested structure " + Arrays.asList(path));
    }
}
 
Developer: h2oai, Project: h2o-3, Lines: 21, Source: ParquetParser.java

Example 6: add

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 19, Source: PrintFooter.java

Example 7: toParquetMetadata

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Developer: apache, Project: parquet-mr, Lines: 26, Source: ParquetMetadataConverter.java

Example 8: run

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path:  {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = "  %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
 
Developer: apache, Project: parquet-mr, Lines: 39, Source: ParquetMetadataCommand.java

Example 9: getParquetFileMetadata

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private ParquetFileMetadata getParquetFileMetadata(FileStatus file) throws IOException {
  final ParquetMetadata metadata;

  metadata = SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);

  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if(logger.isDebugEnabled()){
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name,null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }

  return new ParquetFileMetadata(file, file.getLen(), rowGroupMetadataList, columnTypeInfo);
}
 
Developer: dremio, Project: dremio-oss, Lines: 68, Source: Metadata.java

Example 10: getReaders

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
public List<RecordReader> getReaders(final UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  long counter = 0;
  for(final BlockMetaData rowGroup : footer.getBlocks()) {
    counter += rowGroup.getRowCount();
  }
  final long rowCount = counter;
  final RecordReader reader = new AbstractRecordReader(unifiedReader.context, Collections.<SchemaPath>emptyList()) {
    private long remainingRowCount = rowCount;

    @Override
    public void setup(OutputMutator output) throws ExecutionSetupException {

    }

    @Override
    public int next() {
      if (numRowsPerBatch > remainingRowCount) {
        int toReturn = (int) remainingRowCount;
        remainingRowCount = 0;
        return toReturn;
      }

      remainingRowCount -= numRowsPerBatch;
      return (int)numRowsPerBatch;
    }

    @Override
    public void close() throws Exception {

    }
  };
  return Collections.singletonList(reader);
}
 
Developer: dremio, Project: dremio-oss, Lines: 36, Source: UnifiedParquetReader.java

Example 11: ParquetRowReader

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
    this.filePath = filePath;

    ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();

    FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
    this.fileSchema = fileMetadata.getSchema();
    Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(keyValueMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());

    this.requestedSchema = readContext.getRequestedSchema();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);

    List<ColumnDescriptor> columns = requestedSchema.getColumns();

    reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);

    long total = 0;
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    this.total = total;

    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}
 
Developer: CyberAgent, Project: embulk-input-parquet_hadoop, Lines: 32, Source: ParquetRowReader.java

Example 12: findFirstBlock

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static BlockMetaData findFirstBlock(ParquetMetadata metadata) {
  BlockMetaData firstBlockMeta = metadata.getBlocks().get(0);
  for (BlockMetaData meta : metadata.getBlocks()) {
    if (meta.getStartingPos() < firstBlockMeta.getStartingPos()) {
      firstBlockMeta = meta;
    }
  }
  return firstBlockMeta;
}
 
Developer: h2oai, Project: h2o-3, Lines: 10, Source: ParquetParser.java

Example 13: showDetails

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 10, Source: MetadataUtils.java

Example 14: testMergedMetadata

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void testMergedMetadata() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  ParquetMetadata combinedFooter = ParquetFileReader.readFooter(
      CONF, combinedFile, NO_FILTER);
  ParquetMetadata f1Footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  ParquetMetadata f2Footer = ParquetFileReader.readFooter(
      CONF, file2, NO_FILTER);

  LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>();
  expectedRowGroups.addAll(f1Footer.getBlocks());
  expectedRowGroups.addAll(f2Footer.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      expectedRowGroups.size(),
      combinedFooter.getBlocks().size());

  long nextStart = 4;
  for (BlockMetaData rowGroup : combinedFooter.getBlocks()) {
    BlockMetaData expected = expectedRowGroups.removeFirst();
    Assert.assertEquals("Row count should match",
        expected.getRowCount(), rowGroup.getRowCount());
    Assert.assertEquals("Compressed size should match",
        expected.getCompressedSize(), rowGroup.getCompressedSize());
    Assert.assertEquals("Total size should match",
        expected.getTotalByteSize(), rowGroup.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        nextStart, rowGroup.getStartingPos());
    assertColumnsEquivalent(expected.getColumns(), rowGroup.getColumns());
    nextStart = rowGroup.getStartingPos() + rowGroup.getTotalByteSize();
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 41, Source: TestParquetWriterAppendBlocks.java

Example 15: testAllowDroppingColumns

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Developer: apache, Project: parquet-mr, Lines: 39, Source: TestParquetWriterAppendBlocks.java


Note: The org.apache.parquet.hadoop.metadata.ParquetMetadata.getBlocks method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please refer to the corresponding project's License for distribution and use; do not reproduce without permission.