

Java ParquetFileWriter Class Code Examples

This article collects and summarizes typical usage examples of the Java class org.apache.parquet.hadoop.ParquetFileWriter. If you have been wondering what the ParquetFileWriter class is for, how to use it, or where to find working examples of it, the hand-picked code samples below should help.


The ParquetFileWriter class belongs to the org.apache.parquet.hadoop package. A total of 14 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
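Before working through the examples, here is a minimal sketch of the most common way ParquetFileWriter appears in application code: its nested Mode enum (CREATE fails if the target file already exists, OVERWRITE replaces it) is passed to a ParquetWriter builder. The output path, schema, and class name below are hypothetical, chosen only for illustration:

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetFileWriterModeSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical schema and output path, for illustration only.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 id; required binary name (UTF8); }");
    Path out = new Path("/tmp/example.parquet");

    // ParquetFileWriter.Mode controls what happens when the target file already exists:
    // CREATE fails, OVERWRITE replaces the existing file.
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(out)
        .withType(schema)
        .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
        .build()) {
      SimpleGroupFactory factory = new SimpleGroupFactory(schema);
      writer.write(factory.newGroup().append("id", 1L).append("name", "alice"));
    }
  }
}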

Example 1: isDirReadable

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
  Path p = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
  try {
    if (fs.exists(p)) {
      return true;
    } else {

      if (metaDataFileExists(fs, dir)) {
        return true;
      }
      List<FileStatus> statuses = DrillFileSystemUtil.listFiles(fs, dir.getPath(), false);
      return !statuses.isEmpty() && super.isFileReadable(fs, statuses.get(0));
    }
  } catch (IOException e) {
    logger.info("Failure while attempting to check for Parquet metadata file.", e);
    return false;
  }
}
 
Developer: axbaretto, Project: drill, Lines: 19, Source: ParquetFormatPlugin.java

Example 2: mergeOutput

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    try {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        List<Path> sourceFiles = new ArrayList<>();
        for (FileStatus sourceStatus : sourceStatuses) {
            sourceFiles.add(sourceStatus.getPath());
        }
        FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
        ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(), new Path(targetFile),
                ParquetFileWriter.Mode.CREATE);
        writer.start();
        for (Path input : sourceFiles) {
            writer.appendFile(fs.getConf(), input);
        }
        writer.end(mergedMeta.getKeyValueMetaData());
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
    return true;
}
 
Developer: Talend, Project: components, Lines: 23, Source: ParquetHdfsFileSink.java

Example 3: execute

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
          mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input: inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n",
        input,
        input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
      "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
      "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
 
Developer: apache, Project: parquet-mr, Lines: 35, Source: MergeCommand.java

Example 4: createDataFile

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private static Path createDataFile() throws IOException {
    File parquetFile = File.createTempFile("test-", "." + FILE_EXTENSION);
    readerSchema = new Schema.Parser().parse(
            ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc"));
    projectionSchema = new Schema.Parser().parse(
            ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc"));

    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
            .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {

        IntStream.range(0, NUM_RECORDS).forEach(index -> {
            GenericRecord datum = new GenericData.Record(readerSchema);
            datum.put(FIELD_INDEX, index);
            datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID()));
            datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID()));
            try {
                OFFSETS_BY_INDEX.put(index, Long.valueOf(index));
                writer.write(datum);
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        });
    }
    Path path = new Path(new Path(fsUri), parquetFile.getName());
    fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
    return path;
}
 
Developer: mmolimar, Project: kafka-connect-fs, Lines: 28, Source: ParquetFileReaderTest.java

Example 5: getColNameToSchemaElementMapping

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
 
Developer: dremio, Project: dremio-oss, Lines: 9, Source: ParquetReaderUtility.java

Example 6: checkMagicBytes

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private static void checkMagicBytes(FileStatus status, byte[] data, int offset) throws IOException {
  for(int i =0, v = offset; i < MAGIC_LENGTH; i++, v++){
    if(ParquetFileWriter.MAGIC[i] != data[v]){
      byte[] magic = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
      throw new IOException(status.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(magic));
    }
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 9, Source: SingletonParquetFooterCache.java

Example 7: readFooter

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 * @param fs the file system holding the Parquet file
 * @param status the file status of the Parquet file to read
 * @param filter metadata filter controlling how much of the footer is materialized
 * @return the parsed Parquet footer metadata
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static ParquetMetadata readFooter(
  final FileSystem fs,
  final FileStatus status,
  ParquetMetadataConverter.MetadataFilter filter) throws IOException {
  try(FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    return ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes), filter);
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 41, Source: SingletonParquetFooterCache.java
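For context on what Example 7 (and the very similar Example 9 below) relies on: a Parquet file ends with the serialized footer, followed by a 4-byte little-endian footer length and the 4-byte magic "PAR1", so the FOOTER_METADATA_SIZE constant used above corresponds to the length field plus the magic. A minimal, self-contained sketch (a hypothetical helper operating on a local file, independent of the example's classes) that locates the footer using only that tail layout:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

// Hypothetical standalone helper: finds where the footer starts in a local Parquet file
// using the tail layout [footer bytes][4-byte little-endian length]["PAR1"].
public class FooterOffsetSketch {
  public static long footerStart(String localPath) throws IOException {
    try (RandomAccessFile file = new RandomAccessFile(localPath, "r")) {
      long fileLength = file.length();
      byte[] tail = new byte[8]; // 4-byte footer length + 4-byte magic
      file.seek(fileLength - tail.length);
      file.readFully(tail);

      String magic = new String(tail, 4, 4, StandardCharsets.US_ASCII);
      if (!"PAR1".equals(magic)) {
        throw new IOException(localPath + " does not end with the Parquet magic bytes");
      }
      // Decode the little-endian footer length.
      int footerLength = (tail[0] & 0xff)
          | (tail[1] & 0xff) << 8
          | (tail[2] & 0xff) << 16
          | (tail[3] & 0xff) << 24;
      return fileLength - tail.length - footerLength; // byte offset where the footer starts
    }
  }
}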

Example 8: getFooters

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {


    if (status.isDirectory()){
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);

      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)){
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }

      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)){
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }

  }
  if(!readers.isEmpty()){
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }

  return foundFooters;
}
 
Developer: axbaretto, Project: drill, Lines: 33, Source: FooterGatherer.java

Example 9: readFooter

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 * @param config the Hadoop configuration used to resolve the file system
 * @param status the file status of the Parquet file to read
 * @return the footer (path plus parsed metadata) of the file
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Developer: axbaretto, Project: drill, Lines: 42, Source: FooterGatherer.java

Example 10: initWriter

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static ParquetWriter<Group> initWriter(MessageType schema, String fileName) throws IOException {

    GroupWriteSupport.setSchema(schema, conf);

    ParquetWriter<Group> writer =
        new ParquetWriter<Group>(initFile(fileName),
            ParquetFileWriter.Mode.OVERWRITE,
            new GroupWriteSupport(),
            CompressionCodecName.SNAPPY,
            1024, // row group (block) size in bytes
            1024, // page size in bytes
            512,  // dictionary page size in bytes
            true, // enable dictionary encoding
            false, // disable schema validation
            ParquetProperties.WriterVersion.PARQUET_1_0, conf
        );

    return writer;
  }
 
Developer: axbaretto, Project: drill, Lines: 20, Source: ParquetSimpleTestFileGenerator.java

Example 11: createWriter

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
{
    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());

    final Path path = new Path(buildPath(task, processorIndex));
    final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
    final int blockSize = task.getBlockSize();
    final int pageSize = task.getPageSize();
    final Configuration conf = createConfiguration(task.getExtraConfigurations());
    final boolean overwrite = task.getOverwrite();

    ParquetWriter<PageReader> writer = null;
    try {
        EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters)
                .withCompressionCodec(codec)
                .withRowGroupSize(blockSize)
                .withPageSize(pageSize)
                .withDictionaryPageSize(pageSize)
                .withConf(conf);

        if (overwrite) {
            builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
        }

        writer = builder.build();
    }
    catch (IOException e) {
        throw Throwables.propagate(e); // rethrow as an unchecked exception
    }
    return writer;
}
 
Developer: choplin, Project: embulk-output-parquet, Lines: 32, Source: ParquetOutputPlugin.java

Example 12: createDataFile

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build();

  try {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  } finally {
    writer.close();
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 29, Source: TestFiltersWithMissingColumns.java

Example 13: mergedMetadata

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
}
 
Developer: apache, Project: parquet-mr, Lines: 4, Source: MergeCommand.java

Example 14: run

import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 1,
      "CSV path is required.");

  if (header != null) {
    // if a header is given on the command line, don't assume one is in the file
    noHeader = true;
  }

  CSVProperties props = new CSVProperties.Builder()
      .delimiter(delimiter)
      .escape(escape)
      .quote(quote)
      .header(header)
      .hasHeader(!noHeader)
      .linesToSkip(linesToSkip)
      .charset(charsetName)
      .build();

  String source = targets.get(0);

  Schema csvSchema;
  if (avroSchemaFile != null) {
    csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
  } else {
    Set<String> required = ImmutableSet.of();
    if (requiredFields != null) {
      required = ImmutableSet.copyOf(requiredFields);
    }

    String filename = new File(source).getName();
    String recordName;
    if (filename.contains(".")) {
      recordName = filename.substring(0, filename.indexOf("."));
    } else {
      recordName = filename;
    }

    csvSchema = AvroCSV.inferNullableSchema(
        recordName, open(source), props, required);
  }

  long count = 0;
  try (AvroCSVReader<Record> reader = new AvroCSVReader<>(
      open(source), props, csvSchema, Record.class, true)) {
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    try (ParquetWriter<Record> writer = AvroParquetWriter
        .<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ?
            ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(codec)
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
      for (Record record : reader) {
        writer.write(record);
        count += 1; // track the record index so the failure message below is meaningful
      }
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    }
  }

  return 0;
}
 
Developer: apache, Project: parquet-mr, Lines: 73, Source: ConvertCSVCommand.java


Note: The org.apache.parquet.hadoop.ParquetFileWriter class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by their respective authors, and the copyright of the source code remains with the original authors. For redistribution and use, please refer to the license of the corresponding project; do not reproduce without permission.