This article collects typical usage examples of the Java class org.apache.parquet.hadoop.ParquetFileReader. If you are wondering what ParquetFileReader does and how to use it, the curated examples below should help.
The ParquetFileReader class belongs to the org.apache.parquet.hadoop package. Fifteen code examples of the class are shown below, sorted by popularity by default.
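Before the individual examples, here is a minimal orientation sketch of the pattern most of them share: read the footer, open a ParquetFileReader over the row groups, and iterate until readNextRowGroup() returns null. The class name and command-line argument are illustrative only; the readFooter overload and the ParquetFileReader constructor are the same (older, since-deprecated) APIs used throughout the examples below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ParquetReadQuickStart {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // path to a Parquet file on HDFS or the local FS
        // Read only the footer: file-level metadata, schema, and row-group (block) metadata.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        System.out.println(footer.getFileMetaData().getSchema());
        // Open a reader over all row groups and count the rows they report.
        try (ParquetFileReader reader = new ParquetFileReader(conf,
                footer.getFileMetaData(), path, footer.getBlocks(),
                footer.getFileMetaData().getSchema().getColumns())) {
            PageReadStore rowGroup;
            long totalRows = 0;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                totalRows += rowGroup.getRowCount();
            }
            System.out.println("row count: " + totalRows);
        }
    }
}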
Example 1: test
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void test() throws IOException {
Configuration configuration = new Configuration();
ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
super.fsPath, ParquetMetadataConverter.NO_FILTER);
ParquetFileReader reader = new ParquetFileReader(configuration,
metadata.getFileMetaData(),
super.fsPath,
metadata.getBlocks(),
metadata.getFileMetaData().getSchema().getColumns());
PageStatsValidator validator = new PageStatsValidator();
PageReadStore pageReadStore;
while ((pageReadStore = reader.readNextRowGroup()) != null) {
validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
}
}
Example 2: getParquetSchema
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private String getParquetSchema(String source) throws IOException {
Formats.Format format;
try (SeekableInput in = openSeekable(source)) {
format = Formats.detectFormat((InputStream) in);
in.seek(0);
switch (format) {
case PARQUET:
return new ParquetFileReader(
getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
.getFileMetaData().getSchema().toString();
default:
throw new IllegalArgumentException(String.format(
"Could not get a Parquet schema for format %s: %s", format, source));
}
}
}
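The Configuration/Path constructor used here is deprecated in more recent parquet-hadoop releases. A hedged sketch of the InputFile-based equivalent, assuming a recent parquet-hadoop is on the classpath (the helper name is illustrative, not part of the original example):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

// Hypothetical variant of getParquetSchema built on the non-deprecated open(InputFile) factory.
private String getParquetSchemaViaInputFile(String source) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(source), new Configuration()))) {
        return reader.getFileMetaData().getSchema().toString();
    }
}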
Example 3: readDictionaries
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
/**
* Return the dictionaries for all dictionary-encoded binary columns in the given Parquet file,
* which must contain exactly one row group (a usage sketch follows this example).
* @param fs filesystem object.
* @param filePath parquet file to scan
* @param codecFactory codec factory used to decompress dictionary pages
* @return pair of the dictionaries found for binary fields and the set of binary fields that are not dictionary encoded.
* @throws IOException if the footer or a dictionary page cannot be read
*/
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
if (parquetMetadata.getBlocks().size() > 1) {
throw new IOException(
format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
parquetMetadata.getBlocks().size(), filePath));
}
final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
}
final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
try(final FSDataInputStream in = fs.open(filePath)) {
for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
if (isBinaryType(columnChunkMetaData.getType())) {
final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
// if first page is dictionary encoded then load dictionary, otherwise skip this column.
final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
} else {
columnsToSkip.add(column);
}
}
}
}
return new ImmutablePair<>(dictionaries, columnsToSkip);
}
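A hedged usage sketch for the helper above. Only the readDictionaries signature comes from the example; the wrapper name, the Pair and CodecFactory imports, and the idea of printing dictionary sizes are illustrative assumptions (Example 5 below shows one way such a CodecFactory is created in this code base).

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.tuple.Pair;    // assumed Pair implementation; the example returns an ImmutablePair
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.hadoop.CodecFactory; // assumed to be the parquet-hadoop CodecFactory

// Hypothetical wrapper around readDictionaries; not part of the original example.
public static void printDictionaryColumns(FileSystem fs, Path file, CodecFactory codecFactory) throws IOException {
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
        readDictionaries(fs, file, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry : result.getLeft().entrySet()) {
        // Dictionary ids run from 0 to getMaxId(), so the number of entries is getMaxId() + 1.
        System.out.println(entry.getKey() + ": " + (entry.getValue().getMaxId() + 1) + " dictionary entries");
    }
    for (ColumnDescriptor skipped : result.getRight()) {
        System.out.println(skipped + ": not dictionary encoded");
    }
}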
Example 4: read
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Test
public void read(String fileName) throws IOException
{
Path path = new Path(fileName);
Configuration conf = new Configuration();
conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
PageReadStore pageReadStore;
PageReader pageReader;
DataPage page;
while ((pageReadStore = reader.readNextRowGroup()) != null) {
for (ColumnDescriptor cd: metadata.getFileMetaData().getSchema().getColumns()) {
pageReader = pageReadStore.getPageReader(cd);
page = pageReader.readPage();
}
}
}
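Two caveats on Example 4: as written, the @Test method takes a parameter, which the default JUnit runner does not support, so treat it as a template rather than a runnable test; and the inner loop reads only the first page of each column chunk. To walk every data page, keep calling readPage() until it returns null, as in this self-contained sketch (class name and argument handling are illustrative, not taken from the example):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class PageWalker {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        List<ColumnDescriptor> columns = metadata.getFileMetaData().getSchema().getColumns();
        try (ParquetFileReader reader = new ParquetFileReader(conf,
                metadata.getFileMetaData(), path, metadata.getBlocks(), columns)) {
            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                for (ColumnDescriptor column : columns) {
                    PageReader pageReader = rowGroup.getPageReader(column);
                    DataPage page;
                    // readPage() returns null once the data pages of this column chunk are exhausted.
                    while ((page = pageReader.readPage()) != null) {
                        System.out.println(column + ": page with " + page.getValueCount() + " values");
                    }
                }
            }
        }
    }
}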
Example 5: getScanBatch
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private RecordBatch getScanBatch() throws Exception {
List<RecordReader> readers = Lists.newArrayList();
for (String path : inputPaths) {
ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), new Path(path));
for (int i = 0; i < footer.getBlocks().size(); i++) {
readers.add(new ParquetRecordReader(fragContext,
path,
i,
fs,
CodecFactory.createDirectCodecFactory(fs.getConf(),
new ParquetDirectByteBufferAllocator(opContext.getAllocator()), 0),
footer,
columnsToRead,
ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_NO_CORRUPTION));
}
}
RecordBatch scanBatch = new ScanBatch(null, fragContext, readers);
return scanBatch;
}
Example 6: initialize
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void initialize(FileMetaData parquetFileMetadata,
Path file, List<BlockMetaData> blocks, Configuration configuration)
throws IOException {
// initialize a ReadContext for this file
Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
configuration, toSetMultiMap(fileMetadata), fileSchema));
this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
this.requestedSchema = readContext.getRequestedSchema();
this.fileSchema = parquetFileMetadata.getSchema();
this.file = file;
this.columnCount = requestedSchema.getPaths().size();
this.recordConverter = readSupport.prepareForRead(
configuration, fileMetadata, fileSchema, readContext);
this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
List<ColumnDescriptor> columns = requestedSchema.getColumns();
reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 7: ParquetReader
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private ParquetReader(Configuration conf,
Path file,
ReadSupport<T> readSupport,
Filter filter) throws IOException {
this.readSupport = readSupport;
this.filter = checkNotNull(filter, "filter");
this.conf = conf;
FileSystem fs = file.getFileSystem(conf);
List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
this.footersIterator = footers.iterator();
for (Footer footer : footers) {
for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
totalRowCount += block.getRowCount();
}
}
}
Example 8: initialize
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void initialize(MessageType fileSchema,
FileMetaData parquetFileMetadata,
Path file, List<BlockMetaData> blocks, Configuration configuration)
throws IOException {
// initialize a ReadContext for this file
Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
configuration, toSetMultiMap(fileMetadata), fileSchema));
this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
this.requestedSchema = readContext.getRequestedSchema();
this.fileSchema = fileSchema;
this.file = file;
this.columnCount = requestedSchema.getPaths().size();
this.recordConverter = readSupport.prepareForRead(
configuration, fileMetadata, fileSchema, readContext);
this.strictTypeChecking = true;
List<ColumnDescriptor> columns = requestedSchema.getColumns();
reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
Log.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 9: execute
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
super.execute(options);
String[] args = options.getArgs();
String input = args[0];
Configuration conf = new Configuration();
Path inputPath = new Path(input);
FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);
PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
.withAutoColumn()
.withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
.withColumnPadding(1)
.build();
for(Footer f: footers) {
out.format("file: %s%n" , f.getFile());
MetadataUtils.showDetails(out, f.getParquetMetadata());
out.flushColumns();
}
}
Example 10: execute
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
super.execute(options);
String[] args = options.getArgs();
String input = args[0];
Configuration conf = new Configuration();
Path inpath = new Path(input);
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
MessageType schema = metaData.getFileMetaData().getSchema();
boolean showmd = !options.hasOption('m');
boolean showdt = !options.hasOption('d');
boolean cropoutput = !options.hasOption('n');
Set<String> showColumns = null;
if (options.hasOption('c')) {
String[] cols = options.getOptionValues('c');
showColumns = new HashSet<String>(Arrays.asList(cols));
}
PrettyPrintWriter out = prettyPrintWriter(cropoutput);
dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example 11: run
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
Preconditions.checkArgument(targets != null && targets.size() >= 1,
"A Parquet file is required.");
Preconditions.checkArgument(targets.size() == 1,
"Cannot process multiple Parquet files.");
String source = targets.get(0);
ParquetMetadata footer = ParquetFileReader.readFooter(
getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);
console.info("\nFile path: {}", source);
console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());
Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
if (kv != null && !kv.isEmpty()) {
console.info("Properties:");
String format = " %" + maxSize(kv.keySet()) + "s: %s";
for (Map.Entry<String, String> entry : kv.entrySet()) {
console.info(String.format(format, entry.getKey(), entry.getValue()));
}
} else {
console.info("Properties: (none)");
}
MessageType schema = footer.getFileMetaData().getSchema();
console.info("Schema:\n{}", schema);
List<BlockMetaData> rowGroups = footer.getBlocks();
for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
printRowGroup(console, index, rowGroups.get(index), schema);
}
console.info("");
return 0;
}
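The printRowGroup helper is not included in this excerpt. A hedged sketch of what such a helper could look like, using only standard footer accessors (BlockMetaData row count and byte size, per-column codec and sizes) and assuming the same slf4j-style console logger used above:

import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;

// Hypothetical stand-in for the printRowGroup helper referenced above.
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
    console.info("\nRow group {}: {} rows, {} total bytes", index, rowGroup.getRowCount(), rowGroup.getTotalByteSize());
    for (ColumnChunkMetaData column : rowGroup.getColumns()) {
        console.info("  {}: codec={}, compressed={} bytes, uncompressed={} bytes",
            column.getPath(), column.getCodec(), column.getTotalSize(), column.getTotalUncompressedSize());
    }
}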
Example 12: fromParquet
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
Path path = new Path(location);
FileSystem fs = path.getFileSystem(conf);
ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);
String schemaString = footer.getFileMetaData()
.getKeyValueMetaData().get("parquet.avro.schema");
if (schemaString == null) {
// try the older property
schemaString = footer.getFileMetaData()
.getKeyValueMetaData().get("avro.schema");
}
if (schemaString != null) {
return new Schema.Parser().parse(schemaString);
} else {
return new AvroSchemaConverter()
.convert(footer.getFileMetaData().getSchema());
}
}
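A hypothetical call site for the converter above (the imports and the file location are illustrative placeholders):

import java.net.URI;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;

// Hypothetical usage: recover the Avro schema embedded in (or converted from) a Parquet file.
Schema avroSchema = fromParquet(new Configuration(), URI.create("hdfs:///data/events.parquet"));
System.out.println(avroSchema.toString(true));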
Example 13: getBatchSchema
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public BatchSchema getBatchSchema(final FileSelection selection, final FileSystemWrapper fs) {
final SabotContext context = ((ParquetFormatPlugin)formatPlugin).getContext();
try (
BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
SampleMutator mutator = new SampleMutator(context)
){
final Optional<FileStatus> firstFileO = selection.getFirstFile();
if(!firstFileO.isPresent()) {
throw UserException.dataReadError().message("Unable to find any files for datasets.").build(logger);
}
final FileStatus firstFile = firstFileO.get();
final ParquetMetadata footer = ParquetFileReader.readFooter(fsPlugin.getFsConf(), firstFile, ParquetMetadataConverter.NO_FILTER);
final ParquetReaderUtility.DateCorruptionStatus dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS,
((ParquetFormatPlugin)formatPlugin).getConfig().autoCorrectCorruptDates);
final boolean readInt96AsTimeStamp = operatorContext.getOptions().getOption(PARQUET_READER_INT96_AS_TIMESTAMP).bool_val;
final ImplicitFilesystemColumnFinder finder = new ImplicitFilesystemColumnFinder(context.getOptionManager(), fs, GroupScan.ALL_COLUMNS);
try(RecordReader reader =
new AdditionalColumnsRecordReader(
new ParquetRowiseReader(operatorContext, footer, 0, firstFile.getPath().toString(), GroupScan.ALL_COLUMNS, fs, dateStatus, readInt96AsTimeStamp, true),
finder.getImplicitFieldsForSample(selection)
)) {
reader.setup(mutator);
mutator.allocate(100);
//TODO DX-3873: remove the next() call here. We need this for now since we don't populate inner list types until next.
reader.next();
mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
return mutator.getContainer().getSchema();
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
Example 14: runTestAndValidate
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile, boolean sort) throws Exception {
try {
deleteTableIfExists(outputFile);
test("use dfs_test");
// test("ALTER SESSION SET `planner.add_producer_consumer` = false");
String query = select(selection, inputTable, sort);
System.out.println(outputFile);
String create = "CREATE TABLE " + outputFile + " AS " + query;
String validateQuery = select(validationSelection, outputFile, sort);
test(create);
test(validateQuery); // TODO: remove
testBuilder()
.unOrdered()
.sqlQuery(validateQuery)
.sqlBaselineQuery(query)
.go();
Configuration hadoopConf = new Configuration();
Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
FileSystem fs = output.getFileSystem(hadoopConf);
for (FileStatus file : fs.listStatus(output)) {
ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
assertEquals(DremioVersionInfo.getVersion(), version);
PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
}
} finally {
deleteTableIfExists(outputFile);
}
}
Example 15: ParquetRowReader
import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
this.filePath = filePath;
ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
List<BlockMetaData> blocks = parquetMetadata.getBlocks();
FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
this.fileSchema = fileMetadata.getSchema();
Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
configuration, toSetMultiMap(keyValueMetadata), fileSchema));
this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());
this.requestedSchema = readContext.getRequestedSchema();
this.recordConverter = readSupport.prepareForRead(
configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);
List<ColumnDescriptor> columns = requestedSchema.getColumns();
reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);
long total = 0;
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
this.total = total;
this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}