

Java ColumnChunkMetaData Class Code Examples

This article collects typical usage examples of the Java class org.apache.parquet.hadoop.metadata.ColumnChunkMetaData. If you are wondering what exactly the ColumnChunkMetaData class does, how to use it, or what real-world examples look like, the curated class code examples below should help.


The ColumnChunkMetaData class belongs to the org.apache.parquet.hadoop.metadata package. Fifteen code examples of the class are shown below, ordered by popularity.
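
Before the examples, a quick orientation: application code rarely constructs ColumnChunkMetaData directly. Instances are normally obtained from a parquet file's footer via ParquetFileReader.readFooter, whose per-row-group BlockMetaData objects expose the column chunks. The following minimal sketch (the file path is a placeholder supplied by the caller) iterates over every column chunk and prints a few of the accessors that the examples on this page rely on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataTour {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to a parquet file, supplied by the caller

    // read only the footer; NO_FILTER keeps the metadata of all row groups
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

    for (BlockMetaData rowGroup : footer.getBlocks()) {
      for (ColumnChunkMetaData column : rowGroup.getColumns()) {
        System.out.printf("column=%s type=%s codec=%s values=%d compressed=%d uncompressed=%d%n",
            column.getPath(), column.getType(), column.getCodec(),
            column.getValueCount(), column.getTotalSize(), column.getTotalUncompressedSize());
      }
    }
  }
}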

Example 1: addColumn

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
  final FSDataInputStream in;
  if (useSingleStream) {
    if (streams.isEmpty()) {
      in = fs.open(path);
      streams.add(in);
    } else {
      in = streams.get(0);
    }
    in.seek(metaData.getStartingPos());
    columns.put(descriptor, new SingleStreamColumnChunkIncPageReader(metaData, descriptor, in));
  } else {
    // create new stream per column
    in = fs.open(path);
    streams.add(in);
    in.seek(metaData.getStartingPos());
    columns.put(descriptor, new ColumnChunkIncPageReader(metaData, descriptor, in));
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 20, Source: ColumnChunkIncReadStore.java

Example 2: readDictionaries

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
 * Returns the dictionaries for all binary columns in the given parquet file, which must contain a single row group.
 * @param fs filesystem object
 * @param filePath parquet file to scan
 * @param codecFactory factory used to obtain decompressors for the column chunks' codecs
 * @return pair of the dictionaries found for binary fields and the set of binary fields which are not dictionary encoded
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 40, Source: LocalDictionariesReader.java
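
A hedged usage sketch for the readDictionaries helper above, assuming (per the source attribution) that it is a static method of LocalDictionariesReader and that fs, filePath and codecFactory are already set up. The left side of the returned pair maps each dictionary-encoded binary column to its Dictionary; the right side lists the binary columns that have no dictionary.

// Hypothetical caller for Example 2; the surrounding setup (fs, filePath, codecFactory)
// and the LocalDictionariesReader class name are assumptions, not verified API.
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
    LocalDictionariesReader.readDictionaries(fs, filePath, codecFactory);

for (Map.Entry<ColumnDescriptor, Dictionary> entry : result.getLeft().entrySet()) {
  System.out.println("dictionary for " + entry.getKey() + " holds "
      + (entry.getValue().getMaxId() + 1) + " values");
}
for (ColumnDescriptor skipped : result.getRight()) {
  System.out.println("no dictionary for " + skipped);
}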

Example 3: PageReader

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
PageReader(ColumnReader<?> parentStatus, FSDataInputStream inputStream, Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  long start = columnChunkMetaData.getFirstDataPageOffset();
  this.inputStream = inputStream;
  try {
    this.dataReader = new ColumnDataReader(inputStream, start, columnChunkMetaData.getTotalSize());
    loadDictionaryIfExists(parentStatus, columnChunkMetaData, inputStream);
  } catch (IOException e) {
    throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
      + path.getName(), e);
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 16, Source: PageReader.java

Example 4: loadDictionaryIfExists

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private void loadDictionaryIfExists(final ColumnReader<?> parentStatus,
    final ColumnChunkMetaData columnChunkMetaData, final FSDataInputStream f) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
    f.seek(columnChunkMetaData.getDictionaryPageOffset());
    long start = f.getPos();
    timer.start();
    final PageHeader pageHeader = Util.readPageHeader(f);
    long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    long pageHeaderBytes = f.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    assert pageHeader.type == PageType.DICTIONARY_PAGE;
    assert isDictionaryEncoded(columnChunkMetaData.getEncodings()) :
      format("Missing dictionary encoding for dictionary page %s, in column chunk %s", pageHeader, columnChunkMetaData);
    readDictionaryPage(pageHeader, parentStatus);
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 18, Source: PageReader.java

Example 5: ColumnReader

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                       ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  this.parentReader = parentReader;
  this.columnDescriptor = descriptor;
  this.columnChunkMetaData = columnChunkMetaData;
  this.isFixedLength = fixedLength;
  this.schemaElement = schemaElement;
  this.valueVec = v;
  this.pageReader = (parentReader.getSingleStream() != null) ?
    new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getHadoopPath(), columnChunkMetaData) :
    new PageReader(this, parentReader.getFileSystem(), parentReader.getHadoopPath(), columnChunkMetaData);

  if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
    if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
    } else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
      && (valueVec instanceof TimeStampMilliVector || valueVec instanceof NullableTimeStampMilliVector)) {
      // if int 96 column is being read as a Timestamp, this truncates the time format used by Impala
      // dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
      // needs to be set to the bit width of the resulting Arrow type, usually this matches the input length
      dataTypeLengthInBits = 64;
    } else {
      dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
    }
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 27, Source: ColumnReader.java

Example 6: showDetails

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> ccmeta) {
  Map<String,Object> chunks = new LinkedHashMap<String,Object>();
  for (ColumnChunkMetaData cmeta : ccmeta) {
    String[] path = cmeta.getPath().toArray();

    Map<String,Object> current = chunks;
    for (int i = 0; i < path.length - 1; ++i) {
      String next = path[i];
      if (!current.containsKey(next)) {
        current.put(next, new LinkedHashMap<String,Object>());
      }

      current = (Map<String,Object>)current.get(next);
    }

    current.put(path[path.length - 1], cmeta);
  }

  showColumnChunkDetails(out, chunks, 0);
}
 
Developer ID: apache, Project: parquet-mr, Lines: 21, Source: MetadataUtils.java

Example 7: add

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines: 19, Source: PrintFooter.java

Example 8: readDictionary

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
 
Developer ID: apache, Project: parquet-mr, Lines: 34, Source: ParquetFileReader.java
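
Once a DictionaryPage has been obtained (for example from a method like the one above), it is typically decoded into an org.apache.parquet.column.Dictionary through the page's encoding. A brief sketch, assuming a ColumnDescriptor named column and the uncompressed page returned above are in scope:

// sketch: turn an uncompressed DictionaryPage into a Dictionary usable for value lookup;
// "column" (a ColumnDescriptor) and "dictionaryPage" are assumed to be in scope
if (dictionaryPage != null) {
  Dictionary dictionary = dictionaryPage.getEncoding().initDictionary(column, dictionaryPage);
  // dictionary ids read from data pages can then be resolved, e.g. dictionary.decodeToBinary(id)
}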

Example 9: endColumn

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
 * Ends a column chunk (once all repetition levels, definition levels and data pages have been written).
 * @throws IOException
 */
public void endColumn() throws IOException {
  state = state.endColumn();
  LOG.debug("{}: end column", out.getPos());
  currentBlock.addColumn(ColumnChunkMetaData.get(
      currentChunkPath,
      currentChunkType,
      currentChunkCodec,
      encodingStatsBuilder.build(),
      currentEncodings,
      currentStatistics,
      currentChunkFirstDataPage,
      currentChunkDictionaryPageOffset,
      currentChunkValueCount,
      compressedLength,
      uncompressedLength));
  this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
  this.uncompressedLength = 0;
  this.compressedLength = 0;
}
 
Developer ID: apache, Project: parquet-mr, Lines: 24, Source: ParquetFileWriter.java

Example 10: checkDeltaByteArrayProblem

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines: 17, Source: ParquetRecordReader.java

Example 11: assertColumnsEquivalent

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
                                    List<ColumnChunkMetaData> actual) {
  Assert.assertEquals("Should have the expected columns",
      expected.size(), actual.size());
  for (int i = 0; i < actual.size(); i += 1) {
    ColumnChunkMetaData current = actual.get(i);
    if (i != 0) {
      ColumnChunkMetaData previous = actual.get(i - 1);
      long expectedStart = previous.getStartingPos() + previous.getTotalSize();
      Assert.assertEquals("Should start after the previous column",
          expectedStart, current.getStartingPos());
    }

    assertColumnMetadataEquivalent(expected.get(i), current);
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines: 17, Source: TestParquetWriterAppendBlocks.java

Example 12: assertColumnMetadataEquivalent

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void assertColumnMetadataEquivalent(ColumnChunkMetaData expected,
                                           ColumnChunkMetaData actual) {
  Assert.assertEquals("Should be the expected column",
      expected.getPath(), actual.getPath());
  Assert.assertEquals("Primitive type should not change",
      expected.getType(), actual.getType());
  Assert.assertEquals("Compression codec should not change",
      expected.getCodec(), actual.getCodec());
  Assert.assertEquals("Data encodings should not change",
      expected.getEncodings(), actual.getEncodings());
  Assert.assertEquals("Statistics should not change",
      expected.getStatistics(), actual.getStatistics());
  Assert.assertEquals("Uncompressed size should not change",
      expected.getTotalUncompressedSize(), actual.getTotalUncompressedSize());
  Assert.assertEquals("Compressed size should not change",
      expected.getTotalSize(), actual.getTotalSize());
  Assert.assertEquals("Number of values should not change",
      expected.getValueCount(), actual.getValueCount());

}
 
Developer ID: apache, Project: parquet-mr, Lines: 21, Source: TestParquetWriterAppendBlocks.java

Example 13: testClearExceptionForNots

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
@Test
public void testClearExceptionForNots() {
  List<ColumnChunkMetaData> columnMetas = Arrays.asList(
      getDoubleColumnMeta(new DoubleStatistics(), 0L),
      getIntColumnMeta(new IntStatistics(), 0L));

  FilterPredicate pred = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));

  try {
    canDrop(pred, columnMetas);
    fail("This should throw");
  } catch (IllegalArgumentException e) {
    assertEquals("This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
        + " not(eq(double.column, 12.0))", e.getMessage());
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines: 17, Source: TestStatisticsFilter.java

Example 14: ColumnChunkIncPageReader

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, FSDataInputStream in) throws IOException {
  this.metaData = metaData;
  this.columnDescriptor = columnDescriptor;
  this.size = metaData.getTotalSize();
  this.fileOffset = metaData.getStartingPos();
  this.in = in;
  this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 9, Source: ColumnChunkIncReadStore.java

Example 15: getParquetFileMetadata

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private ParquetFileMetadata getParquetFileMetadata(FileStatus file) throws IOException {
  final ParquetMetadata metadata;

  metadata = SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);

  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }

  return new ParquetFileMetadata(file, file.getLen(), rowGroupMetadataList, columnTypeInfo);
}
 
Developer ID: dremio, Project: dremio-oss, Lines: 68, Source: Metadata.java


Note: The org.apache.parquet.hadoop.metadata.ColumnChunkMetaData class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their authors; the source code is copyrighted by the original authors, and distribution or use should follow the corresponding project's license. Do not reproduce without permission.