This article collects typical usage examples of the Java class parquet.hadoop.ParquetFileReader: what the class is for, how it is used, and what real-world code that calls it looks like.
ParquetFileReader belongs to the parquet.hadoop package. Twelve code examples are shown below, ordered by popularity.
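Before the project-specific examples, here is a minimal, self-contained sketch of the most common pattern on this page: reading a file's footer and printing its schema. It assumes the pre-Apache parquet-mr artifact (the parquet.hadoop package, roughly the 1.6.x line) and a hypothetical file path (/tmp/example.parquet); it is an illustrative sketch, not one of the collected examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class ReadFooterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.parquet"); // hypothetical path
    // NO_FILTER reads the complete footer, including row-group metadata
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}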
Example 1: getSchema
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public DatasetJsonRecord getSchema(Path path) throws IOException {
  DatasetJsonRecord record = null;
  if (!fs.exists(path))
    LOG.error("file path : {} not in hdfs", path);
  else {
    try {
      ParquetMetadata readFooter = ParquetFileReader.readFooter(fs.getConf(), path, ParquetMetadataConverter.NO_FILTER);
      Map<String, String> schema = readFooter.getFileMetaData().getKeyValueMetaData();
      String allFields = schema.get("org.apache.spark.sql.parquet.row.metadata");
      FileStatus status = fs.getFileStatus(path);
      String storage = STORAGE_TYPE;
      String abstractPath = path.toUri().getPath();
      String codec = "parquet.codec";
      record = new DatasetJsonRecord(allFields, abstractPath, status.getModificationTime(), status.getOwner(), status.getGroup(),
          status.getPermission().toString(), codec, storage, "");
      LOG.info("parquetfileanalyzer parse path :{},schema is {}", path.toUri().getPath(), record.toCsvString());
    } catch (Exception e) {
      LOG.error("path : {} content " + " is not Parquet File format content ", path.toUri().getPath());
      LOG.info(e.getStackTrace().toString());
    }
  }
  return record;
}
Example 2: readSchema
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public SchemaDescription readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    FileSystem fs = FileSystem.get( new URI( file ), conf );
    FileStatus fileStatus = fs.getFileStatus( new Path( file ) );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new SchemaDescription();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.createSchemaDescription( schema );
    }
  } );
}
Example 3: convertParquetSchemaToKettleWithTwoValidRows
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Test
public void convertParquetSchemaToKettleWithTwoValidRows() throws Exception {
  int pentahoValueMetaTypeFirstRow = 2;
  boolean allowNullFirstRow = false;
  int pentahoValueMetaTypeSecondRow = 5;
  boolean allowNullSecondRow = false;
  String expectedKettleSchema = ParquetUtils
      .createSchema( pentahoValueMetaTypeFirstRow, allowNullFirstRow, pentahoValueMetaTypeSecondRow,
          allowNullSecondRow ).marshall();
  urlTestResources = Thread.currentThread().getContextClassLoader().getResource( PARQUET_FILE );
  ConfigurationProxy conf = new ConfigurationProxy();
  conf.set( "fs.defaultFS", "file:///" );
  ParquetMetadata meta = ParquetFileReader
      .readFooter( conf, new Path( Paths.get( urlTestResources.toURI() ).toString() ),
          ParquetMetadataConverter.NO_FILTER );
  MessageType schema = meta.getFileMetaData().getSchema();
  SchemaDescription kettleSchema = ParquetConverter.createSchemaDescription( schema );
  String marshallKettleSchema = kettleSchema.marshall();
  Assert.assertEquals( marshallKettleSchema, expectedKettleSchema );
}
Example 4: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();
  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
Example 5: getMetadatas
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
  ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
  for (int i = 0; i < fileStatuses.length; ++i)
  {
    res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
  }
  return res;
}
Example 6: getFooters
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : fs.listStatus(status.getPath(), new DrillPathFilter())) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
Example 7: getDatasetDescriptorFromParquetFile
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least one.
    if (files.size() > 0) {
      break;
    }
  }
  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
Example 8: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  Path inpath = new Path(input);
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();
  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
      .withColumnPadding(1)
      .withMaxBufferedLines(1000000)
      .withFlushOnTab()
      .build();
  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');
  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }
  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example 9: ParquetFileMetadata
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException
{
  // NO_FILTER is presumably statically imported from ParquetMetadataConverter in the original source
  this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
Example 10: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
  boolean printColumns = true;
  List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
  List<Column> columns = new ArrayList<Column>();
  for (int i = 0; i < fileStatuses.length; ++i)
  {
    FileStatus status = fileStatuses[i];
    ParquetMetadata metadata = metadatas[i];
    MessageType schema = metadata.getFileMetaData().getSchema();
    List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();
    for (String columnName : columnNames)
    {
      int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
      ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);
      columnDescriptors.add(descriptor);
      if (printColumns)
      {
        Column column = new Column();
        column.setIndex(fieldIndex);
        column.setName(schema.getFieldName(column.getIndex()));
        column.setDescriptor(descriptor);
        columns.add(column);
      }
    }
    printColumns = false;
    readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
  }
  long time = System.currentTimeMillis();
  long rowCount = 0;
  long rowGroupCount = 0;
  long readerCount = readers.size();
  for (ParquetFileReader reader : readers)
  {
    PageReadStore pageReadStore;
    while ((pageReadStore = reader.readNextRowGroup()) != null)
    {
      rowGroupCount++;
      rowCount += pageReadStore.getRowCount();
    }
    reader.close();
  }
  LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis() - time);
  return metrics;
}
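Example 10 mixes the ParquetFileReader row-group loop with project-specific bookkeeping (Column, LocalMetrics). For reference, a stripped-down sketch of just the counting loop, under the same assumptions as the sketch at the top of this article (pre-Apache parquet-mr; the file path comes from the command line and is not taken from the original project):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupCountSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path(args[0]); // Parquet file to inspect
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    // Request every column; a real caller would usually narrow this list, as Example 10 does.
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    ParquetFileReader reader = new ParquetFileReader(conf, file, footer.getBlocks(), columns);
    long rows = 0;
    long rowGroups = 0;
    PageReadStore pages;
    while ((pages = reader.readNextRowGroup()) != null) {
      rowGroups++;
      rows += pages.getRowCount();
    }
    reader.close();
    System.out.println(rowGroups + " row groups, " + rows + " rows");
  }
}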
Example 11: testPerformance
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Test
@Ignore
public void testPerformance(@Injectable final DrillbitContext bitContext,
                            @Injectable UserServer.UserClientConnection connection) throws Exception {
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContext context = new FragmentContext(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  // new NonStrictExpectations() {
  //   {
  //     context.getAllocator(); result = BufferAllocator.getAllocator(DrillConfig.create());
  //   }
  // };
  final String fileName = "/tmp/parquet_test_performance.parquet";
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  //generateParquetFile(fileName, props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, new Path(fileName));
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    final ParquetRecordReader rr = new ParquetRecordReader(context, 256000, fileName, 0, fs,
        new DirectCodecFactory(dfsConfig, allocator), f.getParquetMetadata(), columns);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = new Stopwatch();
    watch.start();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    System.out.println(String.format("Time completed: %s. ", watch.elapsed(TimeUnit.MILLISECONDS)));
    rr.close();
  }
  allocator.close();
  System.out.println(String.format("Total row count %s", totalRowCount));
}
Example 12: run
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
  }
  String inputFile = args[0];
  String outputFile = args[1];
  String compression = (args.length > 2) ? args[2] : "none";
  Path parquetFilePath = null;
  // Find a file in case a directory was passed
  RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
  while (it.hasNext()) {
    FileStatus fs = it.next();
    if (fs.isFile()) {
      parquetFilePath = fs.getPath();
      break;
    }
  }
  if (parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
  }
  LOG.info("Getting schema from " + parquetFilePath);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  LOG.info(schema);
  GroupWriteSupport.setSchema(schema, getConf());
  Job job = new Job(getConf());
  job.setJarByClass(getClass());
  job.setJobName(getClass().getName());
  job.setMapperClass(ReadRequestMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(ExampleInputFormat.class);
  job.setOutputFormatClass(ExampleOutputFormat.class);
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  if (compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
  } else if (compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
  }
  LOG.info("Output compression: " + codec);
  ExampleOutputFormat.setCompression(job, codec);
  FileInputFormat.setInputPaths(job, new Path(inputFile));
  FileOutputFormat.setOutputPath(job, new Path(outputFile));
  job.waitForCompletion(true);
  return 0;
}