This article collects typical usage examples of the Java class parquet.hadoop.metadata.BlockMetaData. If you are wondering what the BlockMetaData class is for and how to use it, the curated code examples below should help.
The BlockMetaData class belongs to the parquet.hadoop.metadata package. Fifteen code examples of the class are shown below, sorted by popularity by default.
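Before the examples, here is a minimal, hedged sketch of how a list of BlockMetaData is typically obtained: read a file footer with ParquetFileReader.readFooter and ask it for its row groups. The file path is a placeholder, and the snippet assumes the classic (pre-org.apache) parquet-hadoop API used by the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class BlockMetaDataIntro {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/path/to/file.parquet"); // placeholder path

    // Read only the footer; this does not scan the data pages.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);

    // Each BlockMetaData describes one row group of the file.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println(block.getRowCount() + " rows, "
          + block.getCompressedSize() + " compressed bytes, starting at "
          + block.getStartingPos());
    }
  }
}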
Example 1: getRowGroupNumbersFromFileSplit
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
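A hedged usage sketch of the helper above; the conf and split variables and the surrounding reader class are assumed from the calling context and are not part of the original example.

// Read the footer of the split's file and pick the row groups whose first
// data page starts inside this split.
ParquetMetadata footer = ParquetFileReader.readFooter(conf, split.getPath());
List<Integer> rowGroupsToRead = getRowGroupNumbersFromFileSplit(split, footer);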
Example 2: initialize
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
public void initialize(MessageType fileSchema,
    Map<String, String> fileMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 3: initReader
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }
  if (footersIterator.hasNext()) {
    Footer footer = footersIterator.next();
    List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
    MessageType fileSchema = footer.getParquetMetadata().getFileMetaData().getSchema();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
        filter, blocks, fileSchema);
    fileInfo.setBlockMetaDatas(blocks);
    fileInfo.setFileSchema(fileSchema);
    fileInfo.setFilteredBlocks(filteredBlocks);
    reader = new InternalParquetRecordReader<T>(readSupport, filter);
    reader.initialize(fileSchema,
        footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
        footer.getFile(), filteredBlocks, conf);
  }
}
Example 4: add
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
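The same per-column loop can drive simpler aggregations. Below is a minimal, assumed sketch (not part of the original tool) that sums compressed bytes per column path across all row groups of one footer.

import java.util.HashMap;
import java.util.Map;

import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

// Aggregate the compressed size of every column chunk, keyed by dotted column path.
static Map<String, Long> compressedBytesPerColumn(ParquetMetadata footer) {
  Map<String, Long> bytesPerColumn = new HashMap<String, Long>();
  for (BlockMetaData block : footer.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String path = column.getPath().toDotString();
      Long current = bytesPerColumn.get(path);
      bytesPerColumn.put(path, (current == null ? 0L : current) + column.getTotalSize());
    }
  }
  return bytesPerColumn;
}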
Example 5: mergeFooters
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
Example 6: checkBelongingToANewHDFSBlock
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
/**
 * @param rowGroupMetadata the row group to check
 * @return true if the midpoint of the row group is in a new HDFS block (this also moves the currentHDFSBlock
 *         pointer to the index of the block that contains the row group);
 *         false if the midpoint of the row group is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
  boolean isNewHdfsBlock = false;
  long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

  // if the midpoint is no longer in the current HDFS block, return true
  while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
    isNewHdfsBlock = true;
    currentMidPointHDFSBlockIndex++;
    if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
          + rowGroupMidPoint
          + ", the end of the hdfs block is "
          + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
  }

  while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
    currentStartHdfsBlockIndex++;
    if (currentStartHdfsBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
          + rowGroupMetadata.getStartingPos()
          + " but the end of hdfs blocks of file is "
          + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
  }
  return isNewHdfsBlock;
}
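To make the midpoint rule concrete, a small illustration with made-up numbers (a 128 MB HDFS block size is assumed):

// A row group starting at 100 MB with 60 MB of compressed data has its midpoint
// at 100 MB + 30 MB = 130 MB, so it is assigned to the HDFS block spanning
// [128 MB, 256 MB), even though the row group starts inside the first block.
long startingPos = 100L * 1024 * 1024;
long compressedSize = 60L * 1024 * 1024;
long midPoint = startingPos + compressedSize / 2; // 130 MB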
Example 7: generateSplits
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
/**
 * Groups together all the data blocks for the same HDFS block.
 *
 * @param rowGroupBlocks      data blocks (row groups)
 * @param hdfsBlocksArray     hdfs blocks
 * @param fileStatus          the containing file
 * @param requestedSchema     the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize        the mapred.min.split.size
 * @param maxSplitSize        the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException if hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
    List<BlockMetaData> rowGroupBlocks,
    BlockLocation[] hdfsBlocksArray,
    FileStatus fileStatus,
    String requestedSchema,
    Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {
  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  // generate splits from the row groups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
Example 8: visit
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
  FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();

  // check that the schema of the filter matches the schema of the file
  SchemaCompatibilityValidator.validate(filterPredicate, schema);

  List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();
  for (BlockMetaData block : blocks) {
    if (!StatisticsFilter.canDrop(filterPredicate, block.getColumns())) {
      filteredBlocks.add(block);
    }
  }
  return filteredBlocks;
}
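For context, a hedged sketch of how a caller typically reaches this visitor: build a FilterPredicate with FilterApi, wrap it with FilterCompat, and hand it to RowGroupFilter.filterRowGroups. The column name "ts" and the blocks/fileSchema variables are assumptions made for illustration.

import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.RowGroupFilter;
import parquet.filter2.predicate.FilterApi;
import parquet.filter2.predicate.FilterPredicate;

// Keep only the row groups whose column statistics show they may contain ts > 1000.
FilterPredicate predicate = FilterApi.gt(FilterApi.longColumn("ts"), 1000L);
FilterCompat.Filter filter = FilterCompat.get(predicate);
List<BlockMetaData> keptBlocks = RowGroupFilter.filterRowGroups(filter, blocks, fileSchema);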
Example 9: toParquetMetadata
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  int numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
Example 10: generateSplitByDeprecatedConstructor
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
private List<ParquetInputSplit> generateSplitByDeprecatedConstructor(long min, long max) throws
    IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  List<ClientSideMetadataSplitStrategy.SplitInfo> splitInfos = ClientSideMetadataSplitStrategy
      .generateSplitInfo(blocks, hdfsBlocks, min, max);

  for (ClientSideMetadataSplitStrategy.SplitInfo splitInfo : splitInfos) {
    BlockMetaData lastRowGroup = splitInfo.getRowGroups().get(splitInfo.getRowGroupCount() - 1);
    long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

    ParquetInputSplit split = new ParquetInputSplit(fileStatus.getPath(),
        splitInfo.hdfsBlock.getOffset(), end, splitInfo.hdfsBlock.getHosts(),
        splitInfo.rowGroups, schema.toString(), null, null, extramd);
    splits.add(split);
  }

  return splits;
}
Example 11: ParquetReader
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
public ParquetReader(MessageType fileSchema,
    Map<String, String> extraMetadata,
    MessageType requestedSchema,
    Path file,
    List<BlockMetaData> blocks,
    Configuration configuration)
    throws IOException
{
  this.fileSchema = fileSchema;
  this.extraMetadata = extraMetadata;
  this.requestedSchema = requestedSchema;
  this.file = file;
  this.blocks = blocks;
  this.configuration = configuration;
  this.fileReader = new ParquetFileReader(configuration, file, blocks, requestedSchema.getColumns());
  for (BlockMetaData block : blocks) {
    fileRowCount += block.getRowCount();
  }
}
Example 12: ParquetFileReader
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
public ParquetFileReader(
    Configuration configuration,
    Path file,
    List<BlockMetaData> blocks,
    List<ColumnDescriptor> columns)
    throws IOException
{
  this.file = file;
  this.inputStream = file.getFileSystem(configuration).open(file);
  this.blocks = blocks;
  if (!blocks.isEmpty()) {
    for (ColumnDescriptor columnDescriptor : columns) {
      for (ColumnChunkMetaData metadata : blocks.get(0).getColumns()) {
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
          columnMetadata.put(columnDescriptor, metadata);
        }
      }
    }
  }
  this.codecFactory = new ParquetCodecFactory(configuration);
}
Example 13: getBlocks
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
/**
 * Get the blocks (row groups) of the parquet files.
 * @return the list of row group (block) metadata
 */
public List<BlockMetaData> getBlocks ()
{
    if (this.blockMetaDataList == null || this.blockMetaDataList.size() == 0)
    {
        this.blockMetaDataList = new ArrayList<BlockMetaData>();
        for (ParquetFileMetadata meta : this.fileMetaDataList)
        {
            this.blockMetaDataList.addAll(meta.getBlocks());
        }
    }
    return this.blockMetaDataList;
}
Example 14: getTotalSize
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
/**
 * Get the total compressed size of the parquet files.
 * @return the total compressed size in bytes
 */
@Override
public long getTotalSize ()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks())
    {
        size += meta.getCompressedSize();
    }
    return size;
}
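A natural companion, sketched here as an assumption (it is not part of the original class), sums the uncompressed size instead, using BlockMetaData.getTotalByteSize() as also seen in Example 15:

/**
 * Get the total uncompressed size of the parquet files (assumed helper, for illustration only).
 * @return the total uncompressed size in bytes
 */
public long getTotalUncompressedSize ()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks())
    {
        size += meta.getTotalByteSize();
    }
    return size;
}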
Example 15: testGetStat
import parquet.hadoop.metadata.BlockMetaData; // the import this example depends on
@Test
public void testGetStat () throws IOException, MetadataException
{
    //ParquetMetadataStat stat = new ParquetMetadataStat("10.172.96.77", 9000, "/tmp/hive-root/hive_2015-02-04_10-57-36_131_1404874572956637570-1/_tmp.-ext-10002");
    ParquetMetadataStat stat = new ParquetMetadataStat("192.168.124.15", 9000, "/msra/parquet_test");
    double[] columnSize = stat.getAvgColumnChunkSize();
    List<String> names = stat.getFieldNames();
    int i = 0;
    double total = 0;
    for (double size : columnSize)
    {
        //System.out.println(names.get(i) + "\t" + size);
        total += size;
        i++;
    }
    System.out.println(total/1024/1024 + "\t" + stat.getRowGroupCount() + "\t" + stat.getFileCount());

    for (BlockMetaData bm : stat.getBlocks())
    {
        System.out.println(bm.getCompressedSize() + ", " + bm.getTotalByteSize() + ", " + bm.getRowCount());
    }

    List<ParquetFileMetadata> metaDatas = stat.getFileMetaData();
    System.out.println(metaDatas.get(0).getFileMetaData().getCreatedBy());

    Map<String, String> keyValueMetaData = metaDatas.get(0).getFileMetaData().getKeyValueMetaData();
    for (String key : keyValueMetaData.keySet())
    {
        System.out.println(key + "=" + keyValueMetaData.get(key));
    }

    for (int j = 0; j < names.size(); ++j)
    {
        System.out.println(names.get(j) + "\t" + columnSize[j] + "\n");
    }
}