This article collects typical usage examples of the Java method org.apache.parquet.hadoop.metadata.ParquetMetadata.getBlocks. If you are wondering what ParquetMetadata.getBlocks does, how to use it, or what calling it looks like in practice, the curated code examples below should help. You can also read more about the enclosing class, org.apache.parquet.hadoop.metadata.ParquetMetadata.
The following presents 15 code examples of ParquetMetadata.getBlocks, sorted by popularity by default.
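Before the examples, here is a minimal sketch of the pattern they all build on: read the file footer, then iterate over the row groups returned by getBlocks(). It is not taken from any of the projects below; the command-line path argument is an assumption, and it uses the same readFooter API the examples use (deprecated in newer Parquet releases in favor of opening a ParquetFileReader and reading its footer).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class GetBlocksSketch {
  public static void main(String[] args) throws Exception {
    // assumption: args[0] is the path to an existing Parquet file
    Path path = new Path(args[0]);
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), path, ParquetMetadataConverter.NO_FILTER);

    long totalRows = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      // each BlockMetaData describes one row group in the file
      totalRows += block.getRowCount();
      System.out.println("row group at offset " + block.getStartingPos()
          + " with " + block.getRowCount() + " rows");
    }
    System.out.println("total rows: " + totalRows);
  }
}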
Example 1: test
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example 2: getRowGroupNumbersFromFileSplit
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
Example 3: read
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public void read(String fileName) throws IOException
{
  Path path = new Path(fileName);
  Configuration conf = new Configuration();
  conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());

  ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path,
      metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore pageReadStore;
  PageReader pageReader;
  DataPage page;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    // read the first page of each column chunk in the row group
    for (ColumnDescriptor cd : metadata.getFileMetaData().getSchema().getColumns()) {
      pageReader = pageReadStore.getPageReader(cd);
      page = pageReader.readPage();
    }
  }
}
Example 4: getRowGroupNumbersFromFileSplit
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
Example 5: checkCompatibility
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static void checkCompatibility(ParquetMetadata metadata) {
  // make sure we can map Parquet blocks to Chunks
  for (BlockMetaData block : metadata.getBlocks()) {
    if (block.getRowCount() > Integer.MAX_VALUE) {
      IcedHashMapGeneric.IcedHashMapStringObject dbg = new IcedHashMapGeneric.IcedHashMapStringObject();
      dbg.put("startingPos", block.getStartingPos());
      dbg.put("rowCount", block.getRowCount());
      throw new H2OUnsupportedDataFileException("Unsupported Parquet file (technical limitation).",
          "Current implementation doesn't support Parquet files with blocks larger than " +
              Integer.MAX_VALUE + " rows.", dbg); // because we map each block to a single H2O Chunk
    }
  }

  // check that file doesn't have nested structures
  MessageType schema = metadata.getFileMetaData().getSchema();
  for (String[] path : schema.getPaths()) {
    if (path.length != 1) {
      throw new H2OUnsupportedDataFileException("Parquet files with nested structures are not supported.",
          "Detected a column with a nested structure " + Arrays.asList(path));
    }
  }
}
Example 6: add
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
Example 7: toParquetMetadata
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
Example 8: run
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path: {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = " %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
Example 9: getParquetFileMetadata
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private ParquetFileMetadata getParquetFileMetadata(FileStatus file) throws IOException {
  final ParquetMetadata metadata;
  metadata = SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);
  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }

  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));
      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);

      if (statsAvailable) {
        // Write stats only if minVal == maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }

  return new ParquetFileMetadata(file, file.getLen(), rowGroupMetadataList, columnTypeInfo);
}
Example 10: getReaders
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
public List<RecordReader> getReaders(final UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  long counter = 0;
  for (final BlockMetaData rowGroup : footer.getBlocks()) {
    counter += rowGroup.getRowCount();
  }
  final long rowCount = counter;
  final RecordReader reader = new AbstractRecordReader(unifiedReader.context, Collections.<SchemaPath>emptyList()) {
    private long remainingRowCount = rowCount;

    @Override
    public void setup(OutputMutator output) throws ExecutionSetupException {
    }

    @Override
    public int next() {
      if (numRowsPerBatch > remainingRowCount) {
        int toReturn = (int) remainingRowCount;
        remainingRowCount = 0;
        return toReturn;
      }
      remainingRowCount -= numRowsPerBatch;
      return (int) numRowsPerBatch;
    }

    @Override
    public void close() throws Exception {
    }
  };
  return Collections.singletonList(reader);
}
Example 11: ParquetRowReader
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
  this.filePath = filePath;

  ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();

  FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
  this.fileSchema = fileMetadata.getSchema();
  Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(keyValueMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());

  this.requestedSchema = readContext.getRequestedSchema();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);

  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);

  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.total = total;

  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}
Example 12: findFirstBlock
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static BlockMetaData findFirstBlock(ParquetMetadata metadata) {
  BlockMetaData firstBlockMeta = metadata.getBlocks().get(0);
  for (BlockMetaData meta : metadata.getBlocks()) {
    if (meta.getStartingPos() < firstBlockMeta.getStartingPos()) {
      firstBlockMeta = meta;
    }
  }
  return firstBlockMeta;
}
Example 13: showDetails
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
Example 14: testMergedMetadata
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void testMergedMetadata() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  ParquetMetadata combinedFooter = ParquetFileReader.readFooter(
      CONF, combinedFile, NO_FILTER);
  ParquetMetadata f1Footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  ParquetMetadata f2Footer = ParquetFileReader.readFooter(
      CONF, file2, NO_FILTER);

  LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>();
  expectedRowGroups.addAll(f1Footer.getBlocks());
  expectedRowGroups.addAll(f2Footer.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      expectedRowGroups.size(),
      combinedFooter.getBlocks().size());

  long nextStart = 4;
  for (BlockMetaData rowGroup : combinedFooter.getBlocks()) {
    BlockMetaData expected = expectedRowGroups.removeFirst();
    Assert.assertEquals("Row count should match",
        expected.getRowCount(), rowGroup.getRowCount());
    Assert.assertEquals("Compressed size should match",
        expected.getCompressedSize(), rowGroup.getCompressedSize());
    Assert.assertEquals("Total size should match",
        expected.getTotalByteSize(), rowGroup.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        nextStart, rowGroup.getStartingPos());
    assertColumnsEquivalent(expected.getColumns(), rowGroup.getColumns());
    nextStart = rowGroup.getStartingPos() + rowGroup.getTotalByteSize();
  }
}
Example 15: testAllowDroppingColumns
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build();
  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }
  Assert.assertEquals("All records should be present", 0, expected.size());
}