

Java ParquetMetadata Class Code Examples

This article collects typical usage examples of the Java class parquet.hadoop.metadata.ParquetMetadata. If you have been wondering what exactly the ParquetMetadata class does, how to use it, or where to find working examples, the curated code samples below should help.


The ParquetMetadata class belongs to the parquet.hadoop.metadata package. Fifteen code examples of the class are shown below, sorted by popularity by default.
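Before the individual examples, here is a minimal, self-contained sketch of how a ParquetMetadata instance is usually obtained and inspected. The file path is a placeholder and the class name FooterInspector is hypothetical; it assumes the pre-org.apache parquet-mr artifacts used by the examples below are on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class FooterInspector {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; point this at a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    // NO_FILTER reads the complete footer, including all row-group metadata.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
    System.out.println("schema: " + footer.getFileMetaData().getSchema());
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println("row group: " + block.getRowCount() + " rows, "
          + block.getColumns().size() + " column chunks");
    }
  }
}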

Example 1: getRowGroupNumbersFromFileSplit

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the
 * same as how Hive's Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();

  final long splitStart = split.getStart();
  final long splitLength = split.getLength();

  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
 
Author: skhalifa, Project: QDrill, Lines: 25, Source: HiveDrillNativeScanBatchCreator.java
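Note the assignment rule in the loop above: a row group belongs to the split whose byte range [splitStart, splitStart + splitLength) contains the first data page offset of the row group's first column chunk. Since that offset is unique per row group, each row group is claimed by exactly one split even when split boundaries fall in the middle of a row group.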

Example 2: getSchema

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public DatasetJsonRecord getSchema(Path path) throws IOException {
    DatasetJsonRecord record = null;
    if (!fs.exists(path)) {
        LOG.error("file path: {} does not exist in HDFS", path);
    } else {
        try {
            ParquetMetadata readFooter = ParquetFileReader.readFooter(fs.getConf(), path, ParquetMetadataConverter.NO_FILTER);
            Map<String, String> schema = readFooter.getFileMetaData().getKeyValueMetaData();
            String allFields = schema.get("org.apache.spark.sql.parquet.row.metadata");
            FileStatus status = fs.getFileStatus(path);
            String storage = STORAGE_TYPE;
            String abstractPath = path.toUri().getPath();
            String codec = "parquet.codec";
            record = new DatasetJsonRecord(allFields, abstractPath, status.getModificationTime(), status.getOwner(), status.getGroup(),
                    status.getPermission().toString(), codec, storage, "");
            LOG.info("parquetfileanalyzer parse path :{},schema is {}", path.toUri().getPath(), record.toCsvString());

        } catch (Exception e) {
            LOG.error("path : {} content " + " is not Parquet File format content  ", path.toUri().getPath());
            LOG.info(e.getStackTrace().toString());
        }
    }
    return record;

}
 
Author: thomas-young-2013, Project: wherehowsX, Lines: 27, Source: ParquetFileAnalyzer.java
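The key org.apache.spark.sql.parquet.row.metadata holds the JSON-serialized Spark SQL schema that Spark embeds in the footer's key-value metadata when it writes Parquet files; for files produced by other writers the lookup returns null, so this analyzer effectively assumes Spark-written input.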

Example 3: ParquetRecordReader

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public ParquetRecordReader(
    FragmentContext fragmentContext,
    long batchSize,
    String path,
    int rowGroupIndex,
    FileSystem fs,
    DirectCodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns) throws ExecutionSetupException {
  this.hadoopPath = new Path(path);
  this.fileSystem = fs;
  this.codecFactory = codecFactory;
  this.rowGroupIndex = rowGroupIndex;
  this.batchSize = batchSize;
  this.footer = footer;
  this.fragmentContext = fragmentContext;
  setColumns(columns);
}
 
Author: skhalifa, Project: QDrill, Lines: 19, Source: ParquetRecordReader.java

Example 4: add

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private static void add(ParquetMetadata footer) {
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
        ++blockCount;
        MessageType schema = footer.getFileMetaData().getSchema();
        recordCount += blockMetaData.getRowCount();
        List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
        for (ColumnChunkMetaData columnMetaData : columns) {
            ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
            add(
                    desc,
                    columnMetaData.getValueCount(),
                    columnMetaData.getTotalSize(),
                    columnMetaData.getTotalUncompressedSize(),
                    columnMetaData.getEncodings(),
                    columnMetaData.getStatistics());
        }
    }
}
 
Author: grokcoder, Project: pbase, Lines: 19, Source: PrintFooter.java
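This method relies on static accumulators (blockCount, recordCount) and an add(ColumnDescriptor, ...) overload defined elsewhere in PrintFooter: it walks every column chunk of every row group and feeds the per-column value counts, sizes, encodings, and statistics into those aggregates.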

Example 5: mergeFooters

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
    String rootPath = root.toUri().getPath();
    GlobalMetaData fileMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Footer footer : footers) {
        String footerPath = footer.getFile().toUri().getPath();
        if (!footerPath.startsWith(rootPath)) {
            throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
        }
        footerPath = footerPath.substring(rootPath.length());
        while (footerPath.startsWith("/")) {
            footerPath = footerPath.substring(1);
        }
        fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
        for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
            block.setPath(footerPath);
            blocks.add(block);
        }
    }
    return new ParquetMetadata(fileMetaData.merge(), blocks);
}
 
Author: grokcoder, Project: pbase, Lines: 22, Source: ParquetFileWriter.java
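Rewriting each block's path relative to the shared root is what makes the merged footer usable as a summary (e.g. a _metadata file) describing many Parquet files at once; mergeInto folds the per-file metadata into a single GlobalMetaData, which merge() then materializes as one FileMetaData for the combined footer.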

Example 6: toParquetMetadata

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
    List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long numRows = 0;
    for (BlockMetaData block : blocks) {
        numRows += block.getRowCount();
        addRowGroup(parquetMetadata, rowGroups, block);
    }
    FileMetaData fileMetaData = new FileMetaData(
            currentVersion,
            toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
            numRows,
            rowGroups);

    Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
    for (Entry<String, String> keyValue : keyValues) {
        addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
    }

    fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
    return fileMetaData;
}
 
Author: grokcoder, Project: pbase, Lines: 23, Source: ParquetMetadataConverter.java

Example 7: readSchema

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public SchemaDescription readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    FileSystem fs = FileSystem.get( new URI( file ), conf );
    FileStatus fileStatus = fs.getFileStatus( new Path( file ) );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new SchemaDescription();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.createSchemaDescription( schema );
    }
  } );
}
 
Author: pentaho, Project: pentaho-hadoop-shims, Lines: 17, Source: PentahoParquetInputFormat.java
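Passing true as the third argument of readFooters skips the row-group portion of each footer (the skipRowGroups flag in parquet-mr), which is enough here because only the schema from the file metadata is needed; this keeps schema discovery cheap on directories with many files.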

Example 8: convertParquetSchemaToKettleWithTwoValidRows

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Test
public void convertParquetSchemaToKettleWithTwoValidRows() throws Exception {

  int pentahoValueMetaTypeFirstRow = 2;
  boolean allowNullFirstRow = false;
  int pentahoValueMetaTypeSecondRow = 5;
  boolean allowNullSecondRow = false;

  String expectedKettleSchema = ParquetUtils
    .createSchema( pentahoValueMetaTypeFirstRow, allowNullFirstRow, pentahoValueMetaTypeSecondRow,
      allowNullSecondRow ).marshall();

  urlTestResources = Thread.currentThread().getContextClassLoader().getResource( PARQUET_FILE );

  ConfigurationProxy conf = new ConfigurationProxy();
  conf.set( "fs.defaultFS", "file:///" );
  ParquetMetadata meta = ParquetFileReader
    .readFooter( conf, new Path( Paths.get( urlTestResources.toURI() ).toString() ),
      ParquetMetadataConverter.NO_FILTER );
  MessageType schema = meta.getFileMetaData().getSchema();

  SchemaDescription kettleSchema = ParquetConverter.createSchemaDescription( schema );
  String marshallKettleSchema = kettleSchema.marshall();
  Assert.assertEquals( expectedKettleSchema, marshallKettleSchema );
}
 
Author: pentaho, Project: pentaho-hadoop-shims, Lines: 26, Source: ParquetConverterTest.java

Example 9: execute

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withAutoCrop()
                                           .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                           .withColumnPadding(1)
                                           .build();

  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
 
Author: wesleypeck, Project: parquet-tools, Lines: 21, Source: ShowMetaCommand.java
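This is the implementation behind the parquet-tools meta command: it reads the footer of the file named on the command line and pretty-prints the schema, row-group, and column-chunk details to stdout.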

Example 10: getMetadatas

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
    ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
    }
    return res;
}
 
Author: dbiir, Project: rainbow, Lines: 10, Source: LocalParquetEvaluator.java
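A hypothetical usage sketch for this helper (the directory path is a placeholder, and it assumes LocalParquetEvaluator from the example above is on the classpath):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.metadata.ParquetMetadata;

public static void printFooters() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Placeholder directory; one footer is read per file found there.
    FileStatus[] statuses = fs.listStatus(new Path("/data/parquet-table"));
    ParquetMetadata[] footers = LocalParquetEvaluator.getMetadatas(statuses, conf);
    for (ParquetMetadata footer : footers) {
        System.out.println(footer.getBlocks().size() + " row groups, schema: "
            + footer.getFileMetaData().getSchema());
    }
}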

Example 11: DrillParquetReader

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public DrillParquetReader(FragmentContext fragmentContext, ParquetMetadata footer, RowGroupReadEntry entry,
    List<SchemaPath> columns, DrillFileSystem fileSystem) {
  this.footer = footer;
  this.fileSystem = fileSystem;
  this.entry = entry;
  setColumns(columns);
  this.fragmentContext = fragmentContext;
  fillLevelCheckFrequency = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_CHECK_THRESHOLD).num_val.intValue();
  fillLevelCheckThreshold = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_THRESHOLD).num_val.intValue();
}
 
Author: skhalifa, Project: QDrill, Lines: 11, Source: DrillParquetReader.java

Example 12: readFooter

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing its length
 * up front. This should reduce the number of seek/read round trips in most workloads.
 * @param config the Hadoop configuration used to resolve the file system
 * @param status the file status of the Parquet file whose footer should be read
 * @return the footer of the given file
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Author: skhalifa, Project: QDrill, Lines: 42, Source: FooterGatherer.java
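The speculative initial read works because a Parquet file ends with the serialized footer followed by a 4-byte little-endian footer length and the 4-byte magic PAR1; FOOTER_METADATA_SIZE covers those trailing 8 bytes. Whenever the footer fits within DEFAULT_READ_SIZE, a single read therefore replaces the usual pattern of seeking to the tail for the length and then seeking again for the footer itself.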

Example 13: isComplex

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private static boolean isComplex(ParquetMetadata footer) {
  MessageType schema = footer.getFileMetaData().getSchema();

  for (Type type : schema.getFields()) {
    if (!type.isPrimitive()) {
      return true;
    }
  }
  for (ColumnDescriptor col : schema.getColumns()) {
    if (col.getMaxRepetitionLevel() > 0) {
      return true;
    }
  }
  return false;
}
 
Author: skhalifa, Project: QDrill, Lines: 16, Source: ParquetScanBatchCreator.java
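Drill uses this check to decide between its vectorized flat reader (example 3) and the more general DrillParquetReader (example 11): a schema counts as complex if any top-level field is a group type or any column is repeated, i.e. has a maximum repetition level above zero.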

Example 14: validateFooters

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());
  for (Footer footer : metadata) {
    final File file = new File(footer.getFile().toUri());
    assertTrue(file.getName(), file.getName().startsWith("part"));
    assertTrue(file.getPath(), file.exists());
    final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
    assertEquals(2, parquetMetadata.getBlocks().size());
    final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", keyValueMetaData.get("foo"));
    assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
  }
}
 
Author: skhalifa, Project: QDrill, Lines: 15, Source: ParquetRecordReaderTest.java

Example 15: getDatasetDescriptorFromParquetFile

import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
 
Author: aliyun, Project: aliyun-maxcompute-data-collectors, Lines: 32, Source: HdfsOdpsImportJob.java
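Only the first non-hidden file found under the input path is inspected; the code assumes all files in the directory share one schema, which is then converted to an Avro schema to build the dataset descriptor.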


Note: the parquet.hadoop.metadata.ParquetMetadata class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their original authors; copyright remains with those authors, and any use or redistribution should follow the corresponding project licenses. Do not republish without permission.