

Java ParquetMetadata.getFileMetaData Method Code Examples

This article collects typical usage examples of the Java method org.apache.parquet.hadoop.metadata.ParquetMetadata.getFileMetaData. If you are wondering what ParquetMetadata.getFileMetaData does or how to call it, the curated examples below should help. You can also browse further usage examples of the enclosing class, org.apache.parquet.hadoop.metadata.ParquetMetadata.


The following shows 10 code examples of ParquetMetadata.getFileMetaData, sorted by popularity.
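Before the examples, here is a minimal, self-contained sketch of the call pattern they all share: read a file footer, then pull the FileMetaData out of it. It assumes a parquet-mr version that still ships the deprecated ParquetFileReader.readFooter API used throughout this page; the file path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class GetFileMetaDataSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/path/to/file.parquet"); // placeholder path

    // read only the footer of the file, without row-group filtering
    ParquetMetadata footer = ParquetFileReader.readFooter(
        conf, path, ParquetMetadataConverter.NO_FILTER);

    // getFileMetaData() exposes the file-level metadata: the schema,
    // the key/value metadata map, and the writer version string
    FileMetaData fileMetaData = footer.getFileMetaData();
    MessageType schema = fileMetaData.getSchema();
    String createdBy = fileMetaData.getCreatedBy();
    System.out.println("schema: " + schema);
    System.out.println("created by: " + createdBy);
  }
}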

Example 1: test

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 19, Source: TestStatistics.java

Example 2: read

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void read() throws IOException
{
    // JUnit test methods take no parameters, so the original String
    // parameter is replaced with a hardcoded placeholder path here
    String fileName = "hdfs:///path/to/file.parquet"; // placeholder
    Path path = new Path(fileName);
    Configuration conf = new Configuration();
    conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());

    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
    PageReadStore pageReadStore;
    PageReader pageReader;
    DataPage page;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
        // read the first page of every column in this row group
        for (ColumnDescriptor cd : metadata.getFileMetaData().getSchema().getColumns()) {
            pageReader = pageReadStore.getPageReader(cd);
            page = pageReader.readPage();
        }
    }
}
 
Developer: dbiir, Project: RealtimeAnalysis, Lines: 20, Source: ParquetFileReaderTest.java

Example 3: readFirstRecords

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static ParquetPreviewParseWriter readFirstRecords(ParquetParseSetup initSetup, ByteVec vec, int cnt) {
  ParquetMetadata metadata = VecParquetReader.readFooter(initSetup.parquetMetadata);
  List<BlockMetaData> blockMetaData;
  if (metadata.getBlocks().isEmpty()) {
    blockMetaData = Collections.<BlockMetaData>emptyList();
  } else {
    final BlockMetaData firstBlock = findFirstBlock(metadata);
    blockMetaData = Collections.singletonList(firstBlock);
  }
  ParquetMetadata startMetadata = new ParquetMetadata(metadata.getFileMetaData(), blockMetaData);
  ParquetPreviewParseWriter ppWriter = new ParquetPreviewParseWriter(initSetup);
  VecParquetReader reader = new VecParquetReader(vec, startMetadata, ppWriter, ppWriter._roughTypes);
  try {
    int recordCnt = 0;
    Integer recordNum;
    do {
      recordNum = reader.read();
    } while ((recordNum != null) && (++recordCnt < cnt));
    return ppWriter;
  } catch (IOException e) {
    throw new RuntimeException("Failed to read the first few records", e);
  }
}
 
Developer: h2oai, Project: h2o-3, Lines: 24, Source: ParquetParser.java

Example 4: ParquetRowReader

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
    this.filePath = filePath;

    ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();

    FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
    this.fileSchema = fileMetadata.getSchema();
    Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(keyValueMetadata), fileSchema));
    this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());

    this.requestedSchema = readContext.getRequestedSchema();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);

    List<ColumnDescriptor> columns = requestedSchema.getColumns();

    reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);

    long total = 0;
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    this.total = total;

    this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
    logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}
 
Developer: CyberAgent, Project: embulk-input-parquet_hadoop, Lines: 32, Source: ParquetRowReader.java
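
A hypothetical invocation of the reader above. It assumes parquet-mr's bundled org.apache.parquet.hadoop.example.GroupReadSupport as the ReadSupport implementation, which materializes records as org.apache.parquet.example.data.Group objects; the path is a placeholder.

// hypothetical usage of the ParquetRowReader shown above
Configuration conf = new Configuration();
Path path = new Path("/path/to/file.parquet"); // placeholder
ParquetRowReader<Group> rowReader =
    new ParquetRowReader<>(conf, path, new GroupReadSupport());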

Example 5: ParquetFileReader

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file cannot be opened
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 20, Source: ParquetFileReader.java
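
Since this constructor is deprecated, newer parquet-mr versions favor ParquetFileReader.open(InputFile) instead. A minimal sketch of that path, assuming a recent parquet-mr release; HadoopInputFile comes from org.apache.parquet.hadoop.util, and the file path is a placeholder.

// open via the non-deprecated API; the reader exposes the footer directly
try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/path/to/file.parquet"), new Configuration()))) {
  FileMetaData meta = reader.getFooter().getFileMetaData();
}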

Example 6: mergeMetadataFiles

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible and that the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
 
Developer: apache, Project: parquet-mr, Lines: 23, Source: ParquetFileWriter.java
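
A hypothetical call to the method above, merging the footers of two metadata files; the paths are placeholders.

// hypothetical usage; the metadata file paths are placeholders
Configuration conf = new Configuration();
List<Path> metadataFiles = Arrays.asList(
    new Path("/data/part-0/_metadata"),   // placeholder
    new Path("/data/part-1/_metadata"));  // placeholder
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, conf);
MessageType mergedSchema = merged.getFileMetaData().getSchema();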

Example 7: readBlocksFromFile

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Developer: apache, Project: parquet-mr, Lines: 15, Source: FileEncodingsIT.java

Example 8: getSplit

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf
    ) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext =
        new DataWritableReadSupport()
          .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());

    schemaSize = MessageTypeParser.parseMessageType(
          readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
        ).getFieldCount();
    return new ParquetInputSplit(
        finalPath,
        splitStart,
        splitStart + splitLength,
        splitLength,
        fileSplit.getLocations(),
        null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 40, Source: ParquetRecordReaderWrapper.java

Example 9: check

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
 
Developer: apache, Project: parquet-mr, Lines: 41, Source: CheckParquet251Command.java

Example 10: test

import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = new BinaryStatistics();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;

  {
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2)pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 60, Source: TestColumnChunkPageWriteStore.java


Note: The org.apache.parquet.hadoop.metadata.ParquetMetadata.getFileMetaData examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors, and copyright remains with the original authors; consult each project's License before distributing or using the code. Do not reproduce this article without permission.