本文整理汇总了Java中org.apache.parquet.hadoop.metadata.ParquetMetadata类的典型用法代码示例。如果您正苦于以下问题:Java ParquetMetadata类的具体用法?Java ParquetMetadata怎么用?Java ParquetMetadata使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
ParquetMetadata类属于org.apache.parquet.hadoop.metadata包,在下文中一共展示了ParquetMetadata类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: test
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  // Read the footer first; it supplies the schema and row-group metadata.
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  // FIX: ParquetFileReader holds an open input stream; the original never
  // closed it, leaking a file handle per invocation. Close in finally.
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());
  try {
    PageStatsValidator validator = new PageStatsValidator();
    PageReadStore pageReadStore;
    // Validate the page statistics of every row group in the file.
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
      validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
    }
  } finally {
    reader.close();
  }
}
示例2: readDictionaries
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Loads the dictionary page of every binary column in the given parquet file.
 * The file must contain exactly one row group.
 *
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory used to obtain decompressors for the column codecs
 * @return pair of (dictionaries found for binary fields, binary fields that are not dictionary encoded)
 * @throws IOException if the footer cannot be read or the file has more than one row group
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  final int rowGroupCount = footer.getBlocks().size();
  if (rowGroupCount > 1) {
    throw new IOException(
        format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
            rowGroupCount, filePath));
  }
  final BlockMetaData rowGroup = footer.getBlocks().get(0);
  // Index the schema's column descriptors by path so chunk metadata can be
  // mapped back to its descriptor.
  final Map<ColumnPath, ColumnDescriptor> descriptorsByPath = Maps.newHashMap();
  for (ColumnDescriptor descriptor : footer.getFileMetaData().getSchema().getColumns()) {
    descriptorsByPath.put(ColumnPath.get(descriptor.getPath()), descriptor);
  }
  final Map<ColumnDescriptor, Dictionary> dictionaryByColumn = Maps.newHashMap();
  // Binary columns present in the file but not dictionary encoded.
  final Set<ColumnDescriptor> nonDictionaryColumns = Sets.newHashSet();
  try (final FSDataInputStream input = fs.open(filePath)) {
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
      if (!isBinaryType(chunk.getType())) {
        continue;
      }
      final ColumnDescriptor descriptor = descriptorsByPath.get(chunk.getPath());
      // A dictionary-encoded chunk stores its dictionary as the first page;
      // anything else means the column must be skipped.
      final PageHeaderWithOffset firstPage = chunk.getPageHeaders().get(0);
      if (PageType.DICTIONARY_PAGE == firstPage.getPageHeader().getType()) {
        dictionaryByColumn.put(descriptor,
            readDictionary(input, descriptor, firstPage, codecFactory.getDecompressor(chunk.getCodec())));
      } else {
        nonDictionaryColumns.add(descriptor);
      }
    }
  }
  return new ImmutablePair<>(dictionaryByColumn, nonDictionaryColumns);
}
示例3: DeprecatedParquetVectorizedReader
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Creates a reader for a single row group of a parquet file.
 * All parameters are stored as-is; no I/O happens here.
 */
public DeprecatedParquetVectorizedReader(
    OperatorContext operatorContext,
    String path,
    int rowGroupIndex,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus,
    boolean readInt96AsTimeStamp,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryColumns,
    GlobalDictionaries globalDictionaries) throws ExecutionSetupException {
  super(operatorContext, columns);
  // File location and access.
  this.hadoopPath = new Path(path);
  this.fileSystem = fs;
  this.codecFactory = codecFactory;
  this.footer = footer;
  this.rowGroupIndex = rowGroupIndex;
  // Read-behavior flags.
  this.dateCorruptionStatus = dateCorruptionStatus;
  this.readInt96AsTimeStamp = readInt96AsTimeStamp;
  // Normalize a null dictionary map to an empty one so later code never
  // has to null-check it.
  if (globalDictionaryColumns == null) {
    this.globalDictionaryColumns = Collections.<String, GlobalDictionaryFieldInfo>emptyMap();
  } else {
    this.globalDictionaryColumns = globalDictionaryColumns;
  }
  this.globalDictionaries = globalDictionaries;
  this.singleInputStream = null;
}
示例4: getReaders
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
@Override
public List<RecordReader> getReaders(UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  // Detect whether dates in this file were written by a writer with the
  // known date-corruption bug; the reader needs this to interpret values.
  final DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer,
      unifiedReader.columnsInGroupScan, unifiedReader.autoCorrectCorruptDates);
  // Direct codec factory with no decompression budget (0).
  final CodecFactory codecs = CodecFactory.createDirectCodecFactory(
      unifiedReader.fs.getConf(),
      new ParquetDirectByteBufferAllocator(unifiedReader.context.getAllocator()), 0);
  final DeprecatedParquetVectorizedReader reader = new DeprecatedParquetVectorizedReader(
      unifiedReader.context,
      unifiedReader.readEntry.getPath(),
      unifiedReader.readEntry.getRowGroupIndex(),
      unifiedReader.fs,
      codecs,
      footer,
      unifiedReader.realFields,
      containsCorruptDates,
      unifiedReader.readInt96AsTimeStamp,
      unifiedReader.globalDictionaryFieldInfoMap,
      unifiedReader.dictionaries);
  // A single reader, wrapped with a filter when the scan has conditions.
  final List<RecordReader> result = new ArrayList<>();
  result.add(unifiedReader.addFilterIfNecessary(reader));
  return result;
}
示例5: FileSplitParquetRecordReader
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Creates a record reader for one Hive file split backed by a parquet file.
 * Pure field initialization; nothing is opened here.
 */
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  this.oContext = oContext;
  this.readerFactory = readerFactory;
  // Projection and filter inputs.
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;
  // Split, footer and Hadoop job configuration for the underlying file.
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.jobConf = jobConf;
  // Execution flags.
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}
示例6: getRowGroupNumbersFromFileSplit
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split: a row group belongs to
 * the split when its first data page offset falls inside the split's byte range.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long start = split.getStart();
  final long end = start + split.getLength();
  final List<Integer> matching = Lists.newArrayList();
  for (int rowGroupIndex = 0; rowGroupIndex < blocks.size(); rowGroupIndex++) {
    // Offset of the row group's first data page (column 0) decides ownership.
    final long firstDataPage = blocks.get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= start && firstDataPage < end) {
      matching.add(rowGroupIndex);
    }
  }
  return matching;
}
示例7: read
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
// NOTE(review): a JUnit 4 @Test method must take no parameters — as written
// this will fail at runtime with "Method read should have no parameters".
// Signature kept unchanged for compatibility; confirm how this is invoked.
@Test
public void read(String fileName) throws IOException
{
  Path path = new Path(fileName);
  Configuration conf = new Configuration();
  conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
  ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
  // FIX: the reader was never closed, leaking the underlying stream.
  ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
  try {
    PageReadStore pageReadStore;
    PageReader pageReader;
    DataPage page;
    // For every row group, read the first page of each column.
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
      for (ColumnDescriptor cd : metadata.getFileMetaData().getSchema().getColumns()) {
        pageReader = pageReadStore.getPageReader(cd);
        page = pageReader.readPage();
      }
    }
  } finally {
    reader.close();
  }
}
示例8: ParquetFooterStatCollector
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Collects statistics for one row group of a parquet footer.
 *
 * Implicit columns and their values are passed in for two reasons:
 * 1. To distinguish implicit columns from genuinely non-existent columns.
 *    Implicit columns do not appear in parquet metadata; without this map a
 *    condition on an implicit column would be treated as a condition on a
 *    missing column and incorrectly yield canDrop = true.
 * 2. To wrap each implicit value in a Statistics object whose min and max are
 *    that value, which widens pruning opportunities. For example, with
 *    "regCol = 5 or dir0 = 1995" where regCol is not a partition column,
 *    normal partition pruning does nothing, but the implicit values may still
 *    let us drop some row groups.
 */
public ParquetFooterStatCollector(ParquetMetadata footer, int rowGroupIndex, Map<String, String> implicitColValues,
    boolean autoCorrectCorruptDates, OptionManager options) {
  this.footer = footer;
  this.rowGroupIndex = rowGroupIndex;
  this.implicitColValues = implicitColValues;
  this.autoCorrectCorruptDates = autoCorrectCorruptDates;
  this.options = options;
}
示例9: getScanBatch
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/** Builds a scan batch with one ParquetRecordReader per row group per input path. */
private RecordBatch getScanBatch() throws Exception {
  final List<RecordReader> readers = Lists.newArrayList();
  for (String path : inputPaths) {
    final ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), new Path(path));
    final int rowGroupCount = footer.getBlocks().size();
    for (int rowGroup = 0; rowGroup < rowGroupCount; rowGroup++) {
      // Fresh direct codec factory per reader, matching the original behavior.
      final CodecFactory codecs = CodecFactory.createDirectCodecFactory(fs.getConf(),
          new ParquetDirectByteBufferAllocator(opContext.getAllocator()), 0);
      readers.add(new ParquetRecordReader(fragContext,
          path,
          rowGroup,
          fs,
          codecs,
          footer,
          columnsToRead,
          ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_NO_CORRUPTION));
    }
  }
  return new ScanBatch(null, fragContext, readers);
}
示例10: testIntPredicateAgainstAllNullColWithEval
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
@Test
public void testIntPredicateAgainstAllNullColWithEval() throws Exception {
  // intAllNull.parquet has only one int column with all values being NULL.
  // column values statistics: num_nulls: 25, min/max is not defined — so
  // every comparison predicate should allow the row group to be dropped.
  final File file = dirTestWatcher.getRootDir()
      .toPath()
      .resolve(Paths.get("parquetFilterPush", "intTbl", "intAllNull.parquet"))
      .toFile();
  final ParquetMetadata footer = getParquetMetaData(file);
  final String[] predicates = {
      "intCol = 100",
      "intCol = 0",
      "intCol = -100",
      "intCol > 10",
      "intCol >= 10",
      "intCol < 10",
      "intCol <= 10",
  };
  for (final String predicate : predicates) {
    testParquetRowGroupFilterEval(footer, predicate, true);
  }
}
示例11: testDatePredicateAgainstDrillCTASHelper
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Evaluates a table of date predicates against the given footer; the second
 * element of each case is the expected canDrop result.
 */
private void testDatePredicateAgainstDrillCTASHelper(ParquetMetadata footer) throws Exception{
  final Object[][] cases = {
      {"o_orderdate = cast('1992-01-01' as date)", false},
      {"o_orderdate = cast('1991-12-31' as date)", true},
      {"o_orderdate >= cast('1991-12-31' as date)", false},
      {"o_orderdate >= cast('1992-01-03' as date)", false},
      {"o_orderdate >= cast('1992-01-04' as date)", true},
      {"o_orderdate > cast('1992-01-01' as date)", false},
      {"o_orderdate > cast('1992-01-03' as date)", true},
      {"o_orderdate <= cast('1992-01-01' as date)", false},
      {"o_orderdate <= cast('1991-12-31' as date)", true},
      {"o_orderdate < cast('1992-01-02' as date)", false},
      {"o_orderdate < cast('1992-01-01' as date)", true},
  };
  for (final Object[] c : cases) {
    testParquetRowGroupFilterEval(footer, (String) c[0], (Boolean) c[1]);
  }
}
示例12: getRowGroupNumbersFromFileSplit
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split: a row group is included
 * when its first data page offset lies within the split's byte range.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final long splitStart = split.getStart();
  final long splitEnd = splitStart + split.getLength();
  final List<Integer> result = Lists.newArrayList();
  int index = 0;
  for (final BlockMetaData rowGroup : footer.getBlocks()) {
    // Ownership is decided by the offset of the first data page of column 0.
    final long offset = rowGroup.getColumns().get(0).getFirstDataPageOffset();
    if (offset >= splitStart && offset < splitEnd) {
      result.add(index);
    }
    index++;
  }
  return result;
}
示例13: checkCompatibility
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
/**
 * Rejects parquet files this importer cannot represent: row groups with more
 * than Integer.MAX_VALUE rows (each block maps to one H2O Chunk) and schemas
 * containing nested columns.
 */
private static void checkCompatibility(ParquetMetadata metadata) {
  // make sure we can map Parquet blocks to Chunks
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    if (rowGroup.getRowCount() <= Integer.MAX_VALUE) {
      continue;
    }
    IcedHashMapGeneric.IcedHashMapStringObject debugInfo = new IcedHashMapGeneric.IcedHashMapStringObject();
    debugInfo.put("startingPos", rowGroup.getStartingPos());
    debugInfo.put("rowCount", rowGroup.getRowCount());
    // because we map each block to a single H2O Chunk
    throw new H2OUnsupportedDataFileException("Unsupported Parquet file (technical limitation).",
        "Current implementation doesn't support Parquet files with blocks larger than " +
        Integer.MAX_VALUE + " rows.", debugInfo);
  }
  // check that file doesn't have nested structures
  MessageType schema = metadata.getFileMetaData().getSchema();
  for (String[] columnPath : schema.getPaths()) {
    if (columnPath.length != 1) {
      throw new H2OUnsupportedDataFileException("Parquet files with nested structures are not supported.",
          "Detected a column with a nested structure " + Arrays.asList(columnPath));
    }
  }
}
示例14: readFirstRecords
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
// Reads up to `cnt` records from (at most) the first row group of the parquet
// data in `vec` and returns the preview writer that collected them.
private static ParquetPreviewParseWriter readFirstRecords(ParquetParseSetup initSetup, ByteVec vec, int cnt) {
// Footer bytes were captured earlier during setup; decode them here.
ParquetMetadata metadata = VecParquetReader.readFooter(initSetup.parquetMetadata);
List<BlockMetaData> blockMetaData;
if (metadata.getBlocks().isEmpty()) {
blockMetaData = Collections.<BlockMetaData>emptyList();
} else {
// Preview only needs one block; findFirstBlock selects it.
final BlockMetaData firstBlock = findFirstBlock(metadata);
blockMetaData = Collections.singletonList(firstBlock);
}
// Re-wrap the footer so the reader sees only the chosen block list.
ParquetMetadata startMetadata = new ParquetMetadata(metadata.getFileMetaData(), blockMetaData);
ParquetPreviewParseWriter ppWriter = new ParquetPreviewParseWriter(initSetup);
VecParquetReader reader = new VecParquetReader(vec, startMetadata, ppWriter, ppWriter._roughTypes);
try {
int recordCnt = 0;
Integer recordNum;
// Reads at least once (do-while); stops at EOF (null) or after cnt records.
// NOTE: ++recordCnt only runs when recordNum != null (short-circuit &&),
// so the counter tracks successfully read records only.
do {
recordNum = reader.read();
} while ((recordNum != null) && (++recordCnt < cnt));
return ppWriter;
} catch (IOException e) {
throw new RuntimeException("Failed to read the first few records", e);
}
}
示例15: execute
import org.apache.parquet.hadoop.metadata.ParquetMetadata; //导入依赖的package包/类
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  // First positional argument is the parquet file to dump.
  final String input = options.getArgs()[0];
  final Configuration conf = new Configuration();
  final Path inpath = new Path(input);
  final ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
  final MessageType schema = metaData.getFileMetaData().getSchema();
  // Each flag *suppresses* its section, hence the negations:
  // -m hides metadata, -d hides data, -n disables output cropping.
  final boolean showmd = !options.hasOption('m');
  final boolean showdt = !options.hasOption('d');
  final boolean cropoutput = !options.hasOption('n');
  // Optional -c restricts the dump to the named columns; null means all.
  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    showColumns = new HashSet<String>(Arrays.asList(options.getOptionValues('c')));
  }
  final PrettyPrintWriter out = prettyPrintWriter(cropoutput);
  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}