当前位置: 首页>>代码示例>>Java>>正文


Java ParquetReader类代码示例

本文整理汇总了Java中org.apache.parquet.hadoop.ParquetReader的典型用法代码示例。如果您正苦于以下问题:Java ParquetReader类的具体用法?Java ParquetReader怎么用?Java ParquetReader使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


ParquetReader类属于org.apache.parquet.hadoop包,在下文中一共展示了ParquetReader类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: read

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public List<Value> read() throws IOException
{
    spark.conf().set(SQLConf$.MODULE$.PARQUET_WRITE_LEGACY_FORMAT().key(), isLegacyFormat);

    Dataset<Row> dataFrame = spark.createDataFrame(data, schema).repartition(1);
    File file = new File(SparkTestBase.this.tempFolder.getRoot(), name);
    dataFrame.write().options(options).parquet(file.getPath());

    ArrayList<Value> results = new ArrayList<>();
    try (ParquetReader<Value> reader = ParquetReader
            .builder(new MessagePackReadSupport(), new Path(file.getPath()))
            .build()) {
        Value v;
        while ((v = reader.read()) != null) {
            results.add(v);
        }
    }
    return results;
}
 
开发者ID:CyberAgent,项目名称:embulk-input-parquet_hadoop,代码行数:20,代码来源:SparkTestBase.java

示例2: openParquetReader

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
private ParquetReader<Element> openParquetReader() throws IOException {
    final boolean isEntity = schemaUtils.getEntityGroups().contains(group);
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    LOGGER.debug("Opening a new Parquet reader for file: {}", filePath);
    if (null != filter) {
        return new ParquetElementReader.Builder<Element>(filePath)
                .isEntity(isEntity)
                .usingConverter(converter)
                .withFilter(FilterCompat.get(filter))
                .build();
    } else {
        return new ParquetElementReader.Builder<Element>(filePath)
                .isEntity(isEntity)
                .usingConverter(converter)
                .build();
    }
}
 
开发者ID:gchq,项目名称:Gaffer,代码行数:18,代码来源:RetrieveElementsFromFile.java

示例3: getSchema

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Override
public DatasetJsonRecord getSchema(Path targetFilePath)
  throws IOException {
  System.out.println("parquet file path : " + targetFilePath.toUri().getPath());

  SeekableInput sin = new FsInput(targetFilePath, fs.getConf());
  ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(targetFilePath).build();

  String schemaString = reader.read().getSchema().toString();
  String storage = STORAGE_TYPE;
  String abstractPath = targetFilePath.toUri().getPath();

  FileStatus fstat = fs.getFileStatus(targetFilePath);
  // TODO set codec
  DatasetJsonRecord datasetJsonRecord =
    new DatasetJsonRecord(schemaString, abstractPath, fstat.getModificationTime(), fstat.getOwner(), fstat.getGroup(),
      fstat.getPermission().toString(), null, storage, "");
  reader.close();
  sin.close();
  return datasetJsonRecord;
}
 
开发者ID:linkedin,项目名称:WhereHows,代码行数:22,代码来源:ParquetFileAnalyzer.java

示例4: getSampleData

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Override
public SampleDataRecord getSampleData(Path targetFilePath)
  throws IOException {
  ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(targetFilePath).build();

  Iterator<GenericRecord> iter = Collections.singletonList(reader.read()).iterator();
  int count = 0;
  List<Object> list = new ArrayList<Object>();
  //JSONArray list = new JSONArray();
  while (iter.hasNext() && count < 10) {
    // TODO handle out of memory error
    list.add(iter.next().toString().replaceAll("[\\n\\r\\p{C}]", "").replaceAll("\"", "\\\""));
    count++;
  }
  SampleDataRecord sampleDataRecord = new SampleDataRecord(targetFilePath.toUri().getPath(), list);

  return sampleDataRecord;
}
 
开发者ID:linkedin,项目名称:WhereHows,代码行数:19,代码来源:ParquetFileAnalyzer.java

示例5: validateParquetFile

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public void validateParquetFile(Path parquetFile, long recourdCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  for(long i = 0; i < recourdCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i%100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more then expected rows", reader.read());
}
 
开发者ID:streamsets,项目名称:datacollector,代码行数:18,代码来源:LargeInputFileIT.java

示例6: validateParquetFile

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
    .build();

  int position = 0;
  for(Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for(Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }
  }

  Assert.assertNull("Parquet file contains more then expected rows", reader.read());
}
 
开发者ID:streamsets,项目名称:datacollector,代码行数:18,代码来源:BaseAvroParquetConvertIT.java

示例7: read

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:ReadBenchmarks.java

示例8: countFilteredRecords

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException{
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:TestFiltersWithMissingColumns.java

示例9: readFile

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public static List<Group> readFile(File f, Filter filter) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(f.getAbsolutePath()))
                   .withConf(conf)
                   .withFilter(filter)
                   .build();

  Group current;
  List<Group> users = new ArrayList<Group>();

  current = reader.read();
  while (current != null) {
    users.add(current);
    current = reader.read();
  }

  return users;
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:22,代码来源:PhoneBookWriter.java

示例10: testWriteFile

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  int i = 0;
  while((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);

}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:26,代码来源:TestThriftToParquetFileWriter.java

示例11: testWriteFileListOfMap

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String,String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String,String>();
  map2.put("key21", "value21");
  final TestMapInList listMap = new TestMapInList("listmap",
      Arrays.asList(map1, map2));

  final Path fileToCreate = createFile(listMap);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals(listMap.names.size(),
        g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    assertEquals(listMap.names.get(0).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:25,代码来源:TestThriftToParquetFileWriter.java

示例12: testWriteFileMapOfList

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Test
public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
  Map<String, List<String>> map = new HashMap<String,List<String>>();
  map.put("key", Arrays.asList("val1","val2"));
  final TestListInMap mapList = new TestListInMap("maplist", map);
  final Path fileToCreate = createFile(mapList);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals("key",
        g.getGroup("names", 0).getGroup("map",0).getBinary("key", 0).toStringUsingUTF8());
    assertEquals(map.get("key").size(),
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getFieldRepetitionCount(0));
  }
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:18,代码来源:TestThriftToParquetFileWriter.java

示例13: testWriteFileMapOfLists

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>,List<String>>();
  map.put(Arrays.asList("key1","key2"), Arrays.asList("val1","val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  while((g = reader.read()) != null) {
    assertEquals("key1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    assertEquals("val1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:22,代码来源:TestThriftToParquetFileWriter.java

示例14: read

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
public static <D> List<D> read(GenericData model, Schema schema, File file) throws IOException {
  List<D> data = new ArrayList<D>();
  Configuration conf = new Configuration(false);
  AvroReadSupport.setRequestedProjection(conf, schema);
  AvroReadSupport.setAvroReadSchema(conf, schema);
  ParquetReader<D> fileReader = AvroParquetReader
      .<D>builder(new Path(file.toString()))
      .withDataModel(model) // reflect disables compatibility
      .withConf(conf)
      .build();

  try {
    D datum;
    while ((datum = fileReader.read()) != null) {
      data.add(datum);
    }
  } finally {
    fileReader.close();
  }

  return data;
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:23,代码来源:AvroTestUtil.java

示例15: testFilterOnSubAttribute

import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Test
public void testFilterOnSubAttribute() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  
  ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.type", equalTo(EngineType.DIESEL)));
  assertEquals(reader.read().toString(), getVwPassat().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.capacity", equalTo(1.4f)));
  assertEquals(getVwPolo().toString(), reader.read().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.hasTurboCharger", equalTo(true)));
  assertEquals(getBmwMini().toString(), reader.read().toString());
  assertNull(reader.read());
}
 
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:TestSpecificReadWrite.java


注:本文中的org.apache.parquet.hadoop.ParquetReader类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。