This article collects typical usage examples of the Java class org.apache.parquet.hadoop.metadata.ColumnChunkMetaData. If you are wondering what ColumnChunkMetaData is for and how to use it, the curated examples below should help.
The ColumnChunkMetaData class belongs to the org.apache.parquet.hadoop.metadata package. 15 code examples of the class are shown below, ordered by popularity by default.
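Before the examples: ColumnChunkMetaData instances are normally obtained from a Parquet file footer rather than constructed directly. The following is a minimal, self-contained sketch of that pattern; the file path is a placeholder and not taken from the examples below.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ColumnChunkMetaDataTour {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at a real Parquet file.
    Path file = new Path("/tmp/example.parquet");
    ParquetMetadata footer =
        ParquetFileReader.readFooter(new Configuration(), file, ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData rowGroup : footer.getBlocks()) {
      // Each row group exposes one ColumnChunkMetaData per column.
      for (ColumnChunkMetaData column : rowGroup.getColumns()) {
        System.out.printf("%s codec=%s values=%d compressedBytes=%d%n",
            column.getPath(), column.getCodec(), column.getValueCount(), column.getTotalSize());
      }
    }
  }
}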
Example 1: addColumn
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
final FSDataInputStream in;
if (useSingleStream) {
if (streams.isEmpty()) {
in = fs.open(path);
streams.add(in);
} else {
in = streams.get(0);
}
in.seek(metaData.getStartingPos());
columns.put(descriptor, new SingleStreamColumnChunkIncPageReader(metaData, descriptor, in));
} else {
// create new stream per column
in = fs.open(path);
streams.add(in);
in.seek(metaData.getStartingPos());
columns.put(descriptor, new ColumnChunkIncPageReader(metaData, descriptor, in));
}
}
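A hedged usage sketch for addColumn: assuming a MessageType schema and a BlockMetaData rowGroup obtained from the file footer (as in Example 7 further down), every column chunk of the row group could be registered with the enclosing reader like this.
// Hypothetical call site: 'schema' and 'rowGroup' come from the footer; 'reader' is the enclosing instance.
for (ColumnChunkMetaData md : rowGroup.getColumns()) {
  ColumnDescriptor desc = schema.getColumnDescription(md.getPath().toArray());
  reader.addColumn(desc, md); // registers an incremental page reader for this column chunk
}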
Example 2: readDictionaries
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
* Return dictionaries for all binary columns in the given parquet file; the file must contain a single row group.
* @param fs filesystem object.
* @param filePath parquet file to scan
* @return pair of dictionaries found for binary fields and the set of binary fields which are not dictionary encoded.
* @throws IOException
*/
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
if (parquetMetadata.getBlocks().size() > 1) {
throw new IOException(
format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
parquetMetadata.getBlocks().size(), filePath));
}
final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
}
final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
try(final FSDataInputStream in = fs.open(filePath)) {
for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
if (isBinaryType(columnChunkMetaData.getType())) {
final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
// if first page is dictionary encoded then load dictionary, otherwise skip this column.
final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
} else {
columnsToSkip.add(column);
}
}
}
}
return new ImmutablePair<>(dictionaries, columnsToSkip);
}
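One hedged way to consume the returned pair, assuming fs, filePath and codecFactory are already set up by the caller:
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
    readDictionaries(fs, filePath, codecFactory);
for (Map.Entry<ColumnDescriptor, Dictionary> entry : result.getLeft().entrySet()) {
  // getMaxId() is the largest dictionary id, so the dictionary holds getMaxId() + 1 entries
  System.out.println(entry.getKey() + " -> " + (entry.getValue().getMaxId() + 1) + " dictionary entries");
}
for (ColumnDescriptor skipped : result.getRight()) {
  System.out.println(skipped + " is binary but not dictionary encoded");
}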
Example 3: PageReader
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
PageReader(ColumnReader<?> parentStatus, FSDataInputStream inputStream, Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
this.parentColumnReader = parentStatus;
allocatedDictionaryBuffers = new ArrayList<ByteBuf>();
codecFactory = parentColumnReader.parentReader.getCodecFactory();
this.stats = parentColumnReader.parentReader.parquetReaderStats;
long start = columnChunkMetaData.getFirstDataPageOffset();
this.inputStream = inputStream;
try {
this.dataReader = new ColumnDataReader(inputStream, start, columnChunkMetaData.getTotalSize());
loadDictionaryIfExists(parentStatus, columnChunkMetaData, inputStream);
} catch (IOException e) {
throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
+ path.getName(), e);
}
}
Example 4: loadDictionaryIfExists
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private void loadDictionaryIfExists(final ColumnReader<?> parentStatus,
final ColumnChunkMetaData columnChunkMetaData, final FSDataInputStream f) throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
f.seek(columnChunkMetaData.getDictionaryPageOffset());
long start=f.getPos();
timer.start();
final PageHeader pageHeader = Util.readPageHeader(f);
long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
long pageHeaderBytes=f.getPos()-start;
this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
assert pageHeader.type == PageType.DICTIONARY_PAGE;
assert isDictionaryEncoded(columnChunkMetaData.getEncodings()) :
format("Missing dictionary encoding for dictionary page %s, in column chunk %s", pageHeader, columnChunkMetaData);
readDictionaryPage(pageHeader, parentStatus);
}
}
Example 5: ColumnReader
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
this.parentReader = parentReader;
this.columnDescriptor = descriptor;
this.columnChunkMetaData = columnChunkMetaData;
this.isFixedLength = fixedLength;
this.schemaElement = schemaElement;
this.valueVec = v;
this.pageReader = (parentReader.getSingleStream() != null)?
new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getHadoopPath(), columnChunkMetaData) :
new PageReader(this, parentReader.getFileSystem(), parentReader.getHadoopPath(), columnChunkMetaData);
if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
} else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
&& (valueVec instanceof TimeStampMilliVector || valueVec instanceof NullableTimeStampMilliVector)) {
// if int 96 column is being read as a Timestamp, this truncates the time format used by Impala
// dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
// needs to be set to the bit width of the resulting Arrow type, usually this matches the input length
dataTypeLengthInBits = 64;
} else {
dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
}
}
}
Example 6: showDetails
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> ccmeta) {
Map<String,Object> chunks = new LinkedHashMap<String,Object>();
for (ColumnChunkMetaData cmeta : ccmeta) {
String[] path = cmeta.getPath().toArray();
Map<String,Object> current = chunks;
for (int i = 0; i < path.length - 1; ++i) {
String next = path[i];
if (!current.containsKey(next)) {
current.put(next, new LinkedHashMap<String,Object>());
}
current = (Map<String,Object>)current.get(next);
}
current.put(path[path.length - 1], cmeta);
}
showColumnChunkDetails(out, chunks, 0);
}
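A minimal, hypothetical call site: out is an assumed PrettyPrintWriter and footer an already-read ParquetMetadata; this prints the column chunk layout of the first row group.
showDetails(out, footer.getBlocks().get(0).getColumns());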
Example 7: add
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private static void add(ParquetMetadata footer) {
for (BlockMetaData blockMetaData : footer.getBlocks()) {
++ blockCount;
MessageType schema = footer.getFileMetaData().getSchema();
recordCount += blockMetaData.getRowCount();
List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
for (ColumnChunkMetaData columnMetaData : columns) {
ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
add(
desc,
columnMetaData.getValueCount(),
columnMetaData.getTotalSize(),
columnMetaData.getTotalUncompressedSize(),
columnMetaData.getEncodings(),
columnMetaData.getStatistics());
}
}
}
Example 8: readDictionary
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
* Reads and decompresses a dictionary page for the given column chunk.
*
* Returns null if the given column chunk has no dictionary page.
*
* @param meta a column's ColumnChunkMetaData to read the dictionary from
* @return an uncompressed DictionaryPage or null
* @throws IOException
*/
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
!meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
return null;
}
// TODO: this should use getDictionaryPageOffset() but it isn't reliable.
if (f.getPos() != meta.getStartingPos()) {
f.seek(meta.getStartingPos());
}
PageHeader pageHeader = Util.readPageHeader(f);
if (!pageHeader.isSetDictionary_page_header()) {
return null; // TODO: should this complain?
}
DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());
return new DictionaryPage(
decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
compressedPage.getDictionarySize(),
compressedPage.getEncoding());
}
Example 9: endColumn
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
/**
* end a column (once all rep, def and data have been written)
* @throws IOException
*/
public void endColumn() throws IOException {
state = state.endColumn();
LOG.debug("{}: end column", out.getPos());
currentBlock.addColumn(ColumnChunkMetaData.get(
currentChunkPath,
currentChunkType,
currentChunkCodec,
encodingStatsBuilder.build(),
currentEncodings,
currentStatistics,
currentChunkFirstDataPage,
currentChunkDictionaryPageOffset,
currentChunkValueCount,
compressedLength,
uncompressedLength));
this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
this.uncompressedLength = 0;
this.compressedLength = 0;
}
Example 10: checkDeltaByteArrayProblem
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
// splitting files?
if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
// this is okay if not using DELTA_BYTE_ARRAY with the bug
Set<Encoding> encodings = new HashSet<Encoding>();
for (ColumnChunkMetaData column : block.getColumns()) {
encodings.addAll(column.getEncodings());
}
for (Encoding encoding : encodings) {
if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
throw new ParquetDecodingException("Cannot read data due to " +
"PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
}
}
}
}
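The workaround named in the exception message is to disable file splitting before the read, roughly as sketched below; the Configuration is assumed to be the one handed to the input format.
Configuration conf = new Configuration();
// Forces affected files to be read sequentially, sidestepping PARQUET-246.
conf.setBoolean(ParquetInputFormat.SPLIT_FILES, false);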
Example 11: assertColumnsEquivalent
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void assertColumnsEquivalent(List<ColumnChunkMetaData> expected,
List<ColumnChunkMetaData> actual) {
Assert.assertEquals("Should have the expected columns",
expected.size(), actual.size());
for (int i = 0; i < actual.size(); i += 1) {
ColumnChunkMetaData current = actual.get(i);
if (i != 0) {
ColumnChunkMetaData previous = actual.get(i - 1);
long expectedStart = previous.getStartingPos() + previous.getTotalSize();
Assert.assertEquals("Should start after the previous column",
expectedStart, current.getStartingPos());
}
assertColumnMetadataEquivalent(expected.get(i), current);
}
}
Example 12: assertColumnMetadataEquivalent
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public void assertColumnMetadataEquivalent(ColumnChunkMetaData expected,
ColumnChunkMetaData actual) {
Assert.assertEquals("Should be the expected column",
expected.getPath(), actual.getPath());
Assert.assertEquals("Primitive type should not change",
expected.getType(), actual.getType());
Assert.assertEquals("Compression codec should not change",
expected.getCodec(), actual.getCodec());
Assert.assertEquals("Data encodings should not change",
expected.getEncodings(), actual.getEncodings());
Assert.assertEquals("Statistics should not change",
expected.getStatistics(), actual.getStatistics());
Assert.assertEquals("Uncompressed size should not change",
expected.getTotalUncompressedSize(), actual.getTotalUncompressedSize());
Assert.assertEquals("Compressed size should not change",
expected.getTotalSize(), actual.getTotalSize());
Assert.assertEquals("Number of values should not change",
expected.getValueCount(), actual.getValueCount());
}
Example 13: testClearExceptionForNots
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
@Test
public void testClearExceptionForNots() {
List<ColumnChunkMetaData> columnMetas = Arrays.asList(
getDoubleColumnMeta(new DoubleStatistics(), 0L),
getIntColumnMeta(new IntStatistics(), 0L));
FilterPredicate pred = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));
try {
canDrop(pred, columnMetas);
fail("This should throw");
} catch (IllegalArgumentException e) {
assertEquals("This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
+ " not(eq(double.column, 12.0))", e.getMessage());
}
}
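The fix suggested by the exception message is to collapse the not() nodes with LogicalInverseRewriter before evaluating the predicate against the column statistics, roughly like this:
FilterPredicate pred = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));
// rewrite() pushes the not() down, e.g. not(eq(...)) becomes notEq(...)
FilterPredicate rewritten = LogicalInverseRewriter.rewrite(pred);
boolean drop = canDrop(rewritten, columnMetas);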
Example 14: ColumnChunkIncPageReader
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
public ColumnChunkIncPageReader(ColumnChunkMetaData metaData, ColumnDescriptor columnDescriptor, FSDataInputStream in) throws IOException {
this.metaData = metaData;
this.columnDescriptor = columnDescriptor;
this.size = metaData.getTotalSize();
this.fileOffset = metaData.getStartingPos();
this.in = in;
this.decompressor = codecFactory.getDecompressor(metaData.getCodec());
}
Example 15: getParquetFileMetadata
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; // import the required package/class
private ParquetFileMetadata getParquetFileMetadata(FileStatus file) throws IOException {
final ParquetMetadata metadata;
metadata = SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);
MessageType schema = metadata.getFileMetaData().getSchema();
Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
for (String[] path : schema.getPaths()) {
originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
}
List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();
ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
if(logger.isDebugEnabled()){
logger.debug(containsCorruptDates.toString());
}
final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
for (BlockMetaData rowGroup : metadata.getBlocks()) {
List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
long length = 0;
for (ColumnChunkMetaData col : rowGroup.getColumns()) {
ColumnMetadata columnMetadata;
boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
Statistics<?> stats = col.getStatistics();
String[] columnName = col.getPath().toArray();
SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
ColumnTypeMetadata columnTypeMetadata =
new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));
columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
if (statsAvailable) {
// Write stats only if minVal==maxVal. Also, we then store only maxVal
Object mxValue = null;
if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
stats.genericGetMax().equals(stats.genericGetMin())) {
mxValue = stats.genericGetMax();
if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
&& columnTypeMetadata.originalType == OriginalType.DATE) {
mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
}
}
columnMetadata =
new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
} else {
columnMetadata = new ColumnMetadata(columnTypeMetadata.name,null, null);
}
columnMetadataList.add(columnMetadata);
length += col.getTotalSize();
}
RowGroupMetadata rowGroupMeta =
new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
rowGroupMetadataList.add(rowGroupMeta);
}
return new ParquetFileMetadata(file, file.getLen(), rowGroupMetadataList, columnTypeInfo);
}