This article collects typical usage examples of the Java class parquet.hadoop.metadata.ColumnPath. If you are wondering what ColumnPath is for, how to use it, or where to find it in real code, the curated class examples below should help.
The ColumnPath class belongs to the parquet.hadoop.metadata package. Thirteen code examples of the class are shown below, sorted by popularity by default.
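Before the examples, here is a minimal sketch of the ColumnPath surface they rely on (get, toDotString, toArray, and value-based equality). The column names are made up for illustration.

import java.util.Arrays;

import parquet.hadoop.metadata.ColumnPath;

class ColumnPathSketch {
  public static void main(String[] args) {
    // Build a path from its schema segments (a hypothetical nested column "address.zip").
    ColumnPath path = ColumnPath.get("address", "zip");

    // Dotted form, handy for error messages (see Example 10).
    System.out.println(path.toDotString());              // address.zip

    // Raw segments, used to look the column's type up in a MessageType (see Example 11).
    System.out.println(Arrays.toString(path.toArray())); // [address, zip]

    // Paths compare by value, which is why they work as lookup keys (see Examples 3 and 4).
    System.out.println(path.equals(ColumnPath.get("address", "zip"))); // true
  }
}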
Example 1: startColumn
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
/**
 * start a column inside a block
 *
 * @param descriptor the column descriptor
 * @param valueCount the value count in this column
 * @param compressionCodecName the codec used to compress this column's pages
 * @throws IOException
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  state = state.startColumn();
  if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount);
  currentEncodings = new HashSet<parquet.column.Encoding>();
  currentChunkPath = ColumnPath.get(descriptor.getPath());
  currentChunkType = descriptor.getType();
  currentChunkCodec = compressionCodecName;
  currentChunkValueCount = valueCount;
  currentChunkFirstDataPage = out.getPos();
  compressedLength = 0;
  uncompressedLength = 0;
  // need to know what type of stats to initialize to
  // better way to do this?
  currentStatistics = Statistics.getStatsBasedOnType(currentChunkType);
}
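For orientation, here is a minimal sketch of how startColumn is typically paired with the rest of the writer API. It assumes a parquet-mr style ParquetFileWriter (startColumn/endColumn plus page-writing methods); the snippet above may come from a vendored copy, so treat the surrounding calls as assumptions.

import java.io.IOException;

import parquet.column.ColumnDescriptor;
import parquet.hadoop.ParquetFileWriter;
import parquet.hadoop.metadata.CompressionCodecName;

// Hypothetical helper: write one column chunk. startColumn(...) opens the chunk,
// pages are written in between, and endColumn() records its ColumnChunkMetaData
// (including the ColumnPath captured above as currentChunkPath).
class ColumnChunkWriteSketch {
  static void writeOneColumnChunk(ParquetFileWriter writer,
                                  ColumnDescriptor descriptor,
                                  long valueCount,
                                  CompressionCodecName codec) throws IOException {
    writer.startColumn(descriptor, valueCount, codec);
    // ... writer.writeDictionaryPage(...) / writer.writeDataPage(...) calls go here ...
    writer.endColumn();
  }
}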
Example 2: ParquetFileReader
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
public ParquetFileReader(
        Configuration configuration,
        Path file,
        List<BlockMetaData> blocks,
        List<ColumnDescriptor> columns)
        throws IOException
{
    this.file = file;
    this.inputStream = file.getFileSystem(configuration).open(file);
    this.blocks = blocks;
    if (!blocks.isEmpty()) {
        for (ColumnDescriptor columnDescriptor : columns) {
            for (ColumnChunkMetaData metadata : blocks.get(0).getColumns()) {
                if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
                    columnMetadata.put(columnDescriptor, metadata);
                }
            }
        }
    }
    this.codecFactory = new ParquetCodecFactory(configuration);
}
Example 3: ParquetFileReader
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
/**
 * @param configuration the Hadoop configuration used to access the file system
 * @param filePath the Parquet file (will be opened for read in this constructor)
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file cannot be opened
 */
public ParquetFileReader(Configuration configuration, Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.filePath = filePath;
  FileSystem fs = filePath.getFileSystem(configuration);
  this.f = fs.open(filePath);
  this.blocks = blocks;
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.codecFactory = new CodecFactory(configuration);
}
Example 4: readNextRowGroup
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
/**
 * Reads all the columns requested from the row group at the current file position.
 *
 * @return the PageReadStore which can provide PageReaders for each column.
 * @throws IOException if an error occurs while reading
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnChunkPageReadStore columnChunkPageReadStore = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive chunks to read them in one scan
  List<ConsecutiveChunkList> allChunks = new ArrayList<ConsecutiveChunkList>();
  ConsecutiveChunkList currentChunks = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first chunk or not consecutive => new list
      if (currentChunks == null || currentChunks.endPos() != startingPos) {
        currentChunks = new ConsecutiveChunkList(startingPos);
        allChunks.add(currentChunks);
      }
      currentChunks.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int) mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  for (ConsecutiveChunkList consecutiveChunks : allChunks) {
    final List<Chunk> chunks = consecutiveChunks.readAll(f);
    for (Chunk chunk : chunks) {
      columnChunkPageReadStore.addColumn(chunk.descriptor.col, chunk.readAllPages());
    }
  }
  ++currentBlock;
  return columnChunkPageReadStore;
}
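Below is a minimal, hypothetical driver for the reader shown in Examples 3 and 4: it keeps calling readNextRowGroup() until it returns null and asks the resulting PageReadStore for one PageReader per requested column. ParquetFileReader here refers to the example class above (assumed to be in the same package), and the blocks and columns lists are assumed to come from the file footer (compare Example 13).

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.column.page.PageReader;
import parquet.hadoop.metadata.BlockMetaData;

// Hypothetical read loop over all row groups of a file.
class ReadLoopSketch {
  static void readAllRowGroups(Configuration conf, Path file,
                               List<BlockMetaData> blocks,
                               List<ColumnDescriptor> columns) throws IOException {
    ParquetFileReader reader = new ParquetFileReader(conf, file, blocks, columns);
    PageReadStore rowGroup;
    while ((rowGroup = reader.readNextRowGroup()) != null) {
      for (ColumnDescriptor column : columns) {
        PageReader pages = rowGroup.getPageReader(column);
        // consume pages.readDictionaryPage() / pages.readPage() here
      }
    }
  }
}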
Example 5: makeBlockFromStats
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
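A small, hypothetical use of the helper above. It assumes parquet's IntStatistics.updateStats(int) to seed the min/max, that makeBlockFromStats is in scope (same class or static import), and the concrete numbers (10, 100, 177) are made up.

import parquet.column.statistics.IntStatistics;
import parquet.hadoop.metadata.BlockMetaData;

class MakeBlockFromStatsSketch {
  static BlockMetaData exampleBlock() {
    // Seed statistics so that min=10 and max=100.
    IntStatistics stats = new IntStatistics();
    stats.updateStats(10);
    stats.updateStats(100);
    // 177 is an arbitrary value count; the block gets a single "foo" INT32 column
    // whose statistics a row-group filter can then inspect.
    return makeBlockFromStats(stats, 177L);
  }
}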
Example 6: newBlock
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  long uncompressedSize = compressedBlockSize * 2; // assuming the compression ratio is 2
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.BINARY,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      new BinaryStatistics(),
      start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
Example 7: getIntColumnMeta
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}
Example 8: getDoubleColumnMeta
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
      PrimitiveTypeName.DOUBLE,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}
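Helpers like Examples 7 and 8 are usually fed to parquet's filter2 statistics filter in tests. The sketch below is an assumption about that usage, not part of the original code: it relies on FilterApi and StatisticsFilter.canDrop, on getIntColumnMeta being in scope, and on the dotted column name "int.column" matching ColumnPath.get("int", "column").

import static parquet.filter2.predicate.FilterApi.eq;
import static parquet.filter2.predicate.FilterApi.intColumn;

import java.util.Arrays;
import java.util.List;

import parquet.column.statistics.IntStatistics;
import parquet.filter2.statisticslevel.StatisticsFilter;
import parquet.hadoop.metadata.ColumnChunkMetaData;

class StatisticsFilterSketch {
  static boolean canDropRowGroup() {
    IntStatistics stats = new IntStatistics();
    stats.updateStats(10);
    stats.updateStats(100);   // column values lie in [10, 100]
    List<ColumnChunkMetaData> columns = Arrays.asList(getIntColumnMeta(stats, 177L));
    // The predicate asks for the value 9, which is outside [10, 100], so the
    // statistics filter should report that this row group can be dropped.
    return StatisticsFilter.canDrop(eq(intColumn("int.column"), 9), columns);
  }
}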
Example 9: getDictionariesByColumnOrdinal
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private static Map<Integer, ParquetDictionaryDescriptor> getDictionariesByColumnOrdinal(
        BlockMetaData blockMetadata,
        Path path,
        Configuration configuration,
        MessageType requestedSchema,
        TupleDomain<HiveColumnHandle> effectivePredicate)
{
    // todo should we call release?
    ParquetCodecFactory codecFactory = new ParquetCodecFactory(configuration);
    ImmutableMap.Builder<Integer, ParquetDictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) {
        ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
        for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
            ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
            if (isColumnPredicate(columnDescriptor, effectivePredicate) &&
                    columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath())) &&
                    isOnlyDictionaryEncodingPages(columnChunkMetaData.getEncodings())) {
                DictionaryPage dictionaryPage;
                try (FSDataInputStream inputStream = path.getFileSystem(configuration).open(path)) {
                    inputStream.seek(columnChunkMetaData.getStartingPos());
                    int totalSize = Ints.checkedCast(columnChunkMetaData.getTotalSize());
                    byte[] buffer = new byte[totalSize];
                    inputStream.readFully(buffer);
                    dictionaryPage = readDictionaryPage(buffer, codecFactory, columnChunkMetaData.getCodec());
                    dictionaries.put(ordinal, new ParquetDictionaryDescriptor(columnDescriptor, dictionaryPage));
                }
                catch (IOException ignored) {
                    // if the dictionary cannot be read, skip dictionary-based pruning for this column
                }
                break;
            }
        }
    }
    return dictionaries.build();
}
Example 10: getColumnChunk
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) {
    ColumnChunkMetaData c = columns.get(columnPath);
    checkArgument(c != null, "Column " + columnPath.toDotString() + " not found in schema!");
    return c;
}
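A tiny, hypothetical call site for the accessor above (it has to live in the same class, since the method is private). The column name is made up, and checkArgument is assumed to be Guava's Preconditions.checkArgument.

// Look up the chunk for the nested column ["user", "id"]; if it is absent this
// fails with an IllegalArgumentException naming the column as "user.id".
ColumnChunkMetaData chunk = getColumnChunk(ColumnPath.get("user", "id"));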
Example 11: fromParquetMetadata
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        parquet.format.ColumnMetaData metaData = columnChunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName(),
            CompressionCodecName.fromParquet(metaData.codec),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(metaData.statistics, messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
      blocks);
}
Example 12: getPath
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
private ColumnPath getPath(parquet.format.ColumnMetaData metaData) {
  String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
  return ColumnPath.get(path);
}
Example 13: readFooter
import parquet.hadoop.metadata.ColumnPath; // import the required package/class
public static ParquetMetadata readFooter(Configuration configuration, Path file)
        throws IOException
{
    FileSystem fileSystem = file.getFileSystem(configuration);
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        checkArgument(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        checkArgument(Arrays.equals(MAGIC, magic),
                "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
        long metadataIndex = metadataLengthIndex - metadataLength;
        checkArgument(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        checkArgument(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                checkArgument(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    checkArgument((filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
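Finally, a short, hypothetical caller for the readFooter helper above; the file path is made up and readFooter is assumed to be in scope (same class or static import). The footer's schema and block list are exactly what the reader in Examples 3 and 4 expects as input.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

class ReadFooterSketch {
    static void printRowCounts() throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet");   // hypothetical file
        ParquetMetadata footer = readFooter(conf, file);
        MessageType schema = footer.getFileMetaData().getSchema();
        List<BlockMetaData> blocks = footer.getBlocks();
        System.out.println("schema: " + schema);
        for (BlockMetaData block : blocks) {
            System.out.println("row group with " + block.getRowCount() + " rows");
        }
    }
}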