

Java ParquetFileReader Class Code Examples

This article collects and summarizes typical usage examples of the Java class org.apache.parquet.hadoop.ParquetFileReader. If you are wondering what exactly the ParquetFileReader class does, how to use it, or where to find working examples, the curated class code examples below may help.


The ParquetFileReader class belongs to the org.apache.parquet.hadoop package. A total of 15 code examples of the class are shown below, sorted by popularity by default.
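
Before the project-specific examples, here is a minimal, hedged sketch of the pattern most of them share: read the file footer with ParquetFileReader.readFooter, then open a ParquetFileReader over the footer's row groups and iterate them with readNextRowGroup. The class name ParquetFooterSketch and the file-path argument are illustrative assumptions, not taken from any example below; the readFooter and constructor overloads used here mirror those in the examples, though newer parquet-mr releases mark some of them deprecated.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ParquetFooterSketch { // hypothetical class name, for illustration only

  public static void printRowGroups(String file) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(file); // assumption: the path is reachable through the default Hadoop FileSystem

    // Read only the footer (schema, row-group and column metadata), without filtering.
    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
    System.out.println(metadata.getFileMetaData().getSchema());

    // Open the file for the row groups listed in the footer and walk them one by one.
    ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path,
        metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
    try {
      PageReadStore rowGroup;
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        System.out.println("row group with " + rowGroup.getRowCount() + " rows");
      }
    } finally {
      reader.close();
    }
  }
}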

Example 1: test

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 19, Source file: TestStatistics.java

Example 2: getParquetSchema

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 18, Source file: SchemaCommand.java

Example 3: readDictionaries

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Developer ID: dremio, Project: dremio-oss, Lines of code: 40, Source file: LocalDictionariesReader.java

Example 4: read

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Test
public void read(String fileName) throws IOException
{
    Path path = new Path(fileName);
    Configuration conf = new Configuration();
    conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());

    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
    PageReadStore pageReadStore;
    PageReader pageReader;
    DataPage page;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor cd: metadata.getFileMetaData().getSchema().getColumns()) {
            pageReader = pageReadStore.getPageReader(cd);
            page = pageReader.readPage();
        }
    }
}
 
Developer ID: dbiir, Project: RealtimeAnalysis, Lines of code: 20, Source file: ParquetFileReaderTest.java

Example 5: getScanBatch

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private RecordBatch getScanBatch() throws Exception {
  List<RecordReader> readers = Lists.newArrayList();

  for (String path : inputPaths) {
    ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), new Path(path));

    for (int i = 0; i < footer.getBlocks().size(); i++) {
      readers.add(new ParquetRecordReader(fragContext,
          path,
          i,
          fs,
          CodecFactory.createDirectCodecFactory(fs.getConf(),
              new ParquetDirectByteBufferAllocator(opContext.getAllocator()), 0),
          footer,
          columnsToRead,
          ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_NO_CORRUPTION));
    }
  }

  RecordBatch scanBatch = new ScanBatch(null, fragContext, readers);
  return scanBatch;
}
 
Developer ID: axbaretto, Project: drill, Lines of code: 23, Source file: MiniPlanUnitTestBase.java

Example 6: initialize

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Developer ID: apache, Project: tajo, Lines of code: 24, Source file: InternalParquetRecordReader.java

Example 7: ParquetReader

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
 
Developer ID: apache, Project: tajo, Lines of code: 20, Source file: ParquetReader.java

Example 8: initialize

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void initialize(MessageType fileSchema,
                       FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
          configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
          configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = true;
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  Log.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Developer ID: h2oai, Project: h2o-3, Lines of code: 24, Source file: H2OInternalParquetReader.java

Example 9: execute

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  
  Configuration conf = new Configuration();
  Path inputPath = new Path(input);
  FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                           .withColumnPadding(1)
                                           .build();

  for(Footer f: footers) {
    out.format("file: %s%n" , f.getFile());
    MetadataUtils.showDetails(out, f.getParquetMetadata());
    out.flushColumns();
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 25, Source file: ShowMetaCommand.java

Example 10: execute

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    boolean cropoutput = !options.hasOption('n');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    PrettyPrintWriter out = prettyPrintWriter(cropoutput);
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 27, Source file: DumpCommand.java

Example 11: run

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path:  {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = "  %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 39, Source file: ParquetMetadataCommand.java

Example 12: fromParquet

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Developer ID: apache, Project: parquet-mr, Lines of code: 22, Source file: Schemas.java

Example 13: getBatchSchema

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
@Override
public BatchSchema getBatchSchema(final FileSelection selection, final FileSystemWrapper fs) {
  final SabotContext context = ((ParquetFormatPlugin)formatPlugin).getContext();
  try (
    BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
    OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
    SampleMutator mutator = new SampleMutator(context)
  ){
    final Optional<FileStatus> firstFileO = selection.getFirstFile();
    if(!firstFileO.isPresent()) {
      throw UserException.dataReadError().message("Unable to find any files for datasets.").build(logger);
    }
    final FileStatus firstFile = firstFileO.get();
    final ParquetMetadata footer = ParquetFileReader.readFooter(fsPlugin.getFsConf(), firstFile, ParquetMetadataConverter.NO_FILTER);
    final ParquetReaderUtility.DateCorruptionStatus dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS,
      ((ParquetFormatPlugin)formatPlugin).getConfig().autoCorrectCorruptDates);
    final boolean readInt96AsTimeStamp = operatorContext.getOptions().getOption(PARQUET_READER_INT96_AS_TIMESTAMP).bool_val;
    final ImplicitFilesystemColumnFinder finder = new  ImplicitFilesystemColumnFinder(context.getOptionManager(), fs, GroupScan.ALL_COLUMNS);

    try(RecordReader reader =
          new AdditionalColumnsRecordReader(
            new ParquetRowiseReader(operatorContext, footer, 0, firstFile.getPath().toString(), GroupScan.ALL_COLUMNS, fs, dateStatus, readInt96AsTimeStamp, true),
            finder.getImplicitFieldsForSample(selection)
          )) {

      reader.setup(mutator);

      mutator.allocate(100);
      //TODO DX-3873: remove the next() call here. We need this for now since we don't populate inner list types until next.
      reader.next();

      mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
      return mutator.getContainer().getSchema();
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines of code: 39, Source file: ParquetFormatDatasetAccessor.java

Example 14: runTestAndValidate

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
//    test("ALTER SESSION SET `planner.add_producer_consumer` = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
 
Developer ID: dremio, Project: dremio-oss, Lines of code: 31, Source file: TestParquetWriter.java

Example 15: ParquetRowReader

import org.apache.parquet.hadoop.ParquetFileReader; // import the required package/class
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
    this.filePath = filePath;

    ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();

    FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
    this.fileSchema = fileMetadata.getSchema();
    Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(keyValueMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());

    this.requestedSchema = readContext.getRequestedSchema();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);

    List<ColumnDescriptor> columns = requestedSchema.getColumns();

    reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);

    long total = 0;
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    this.total = total;

    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}
 
Developer ID: CyberAgent, Project: embulk-input-parquet_hadoop, Lines of code: 32, Source file: ParquetRowReader.java


Note: The org.apache.parquet.hadoop.ParquetFileReader class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors. Please consult the license of the corresponding project before using or redistributing the code, and do not republish this article without permission.