本文整理汇总了Java中org.apache.parquet.hadoop.ParquetReader类的典型用法代码示例。如果您正苦于以下问题:Java ParquetReader类的具体用法?Java ParquetReader怎么用?Java ParquetReader使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
ParquetReader类属于org.apache.parquet.hadoop包,在下文中一共展示了ParquetReader类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes {@code data} as a single-partition Parquet file via Spark, then reads the file
 * back with a {@link MessagePackReadSupport}-backed {@link ParquetReader}.
 *
 * @return every value the reader produced, in the order it was read
 * @throws IOException if the Parquet file cannot be read
 */
public List<Value> read() throws IOException
{
    // The legacy-format flag is set on the session before the write below.
    spark.conf().set(SQLConf$.MODULE$.PARQUET_WRITE_LEGACY_FORMAT().key(), isLegacyFormat);
    Dataset<Row> frame = spark.createDataFrame(data, schema).repartition(1);
    File target = new File(SparkTestBase.this.tempFolder.getRoot(), name);
    frame.write().options(options).parquet(target.getPath());

    ArrayList<Value> values = new ArrayList<>();
    try (ParquetReader<Value> parquetReader = ParquetReader
            .builder(new MessagePackReadSupport(), new Path(target.getPath()))
            .build()) {
        // read() returning null marks the end of the file.
        for (Value next = parquetReader.read(); next != null; next = parquetReader.read()) {
            values.add(next);
        }
    }
    return values;
}
示例2: openParquetReader
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Builds a {@link ParquetReader} over {@code filePath} for the current {@code group}.
 *
 * <p>The reader is configured with the group's converter and with whether the group is
 * one of the schema's entity groups. When {@code filter} is non-null it is additionally
 * applied through {@link FilterCompat#get}.
 *
 * @return a newly built reader; the caller is responsible for closing it
 * @throws IOException if the reader cannot be built
 */
private ParquetReader<Element> openParquetReader() throws IOException {
    final boolean entityGroup = schemaUtils.getEntityGroups().contains(group);
    final GafferGroupObjectConverter groupConverter = schemaUtils.getConverter(group);
    LOGGER.debug("Opening a new Parquet reader for file: {}", filePath);

    // Unfiltered case first: nothing beyond the entity flag and converter is needed.
    if (null == filter) {
        return new ParquetElementReader.Builder<Element>(filePath)
                .isEntity(entityGroup)
                .usingConverter(groupConverter)
                .build();
    }

    return new ParquetElementReader.Builder<Element>(filePath)
            .isEntity(entityGroup)
            .usingConverter(groupConverter)
            .withFilter(FilterCompat.get(filter))
            .build();
}
示例3: getSchema
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Derives a {@link DatasetJsonRecord} for a Parquet file from the Avro schema of its first
 * record plus the file's status metadata (modification time, owner, group, permission).
 *
 * @param targetFilePath path of the Parquet file to inspect
 * @return the dataset record describing the file
 * @throws IOException if the file cannot be opened or read, or if it contains no records
 *     (the schema is taken from the first record, so an empty file has nothing to offer)
 */
@Override
public DatasetJsonRecord getSchema(Path targetFilePath)
    throws IOException {
    System.out.println("parquet file path : " + targetFilePath.toUri().getPath());

    final String schemaString;
    // try-with-resources closes both handles even when reading fails; resources are closed
    // in reverse declaration order, i.e. reader first and then sin, as before.
    // sin is opened exactly as in the original code although nothing reads from it here.
    try (SeekableInput sin = new FsInput(targetFilePath, fs.getConf());
         ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(targetFilePath).build()) {
        GenericRecord firstRecord = reader.read();
        if (firstRecord == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new IOException(
                "Parquet file contains no records, cannot derive schema: " + targetFilePath);
        }
        schemaString = firstRecord.getSchema().toString();
    }

    String storage = STORAGE_TYPE;
    String abstractPath = targetFilePath.toUri().getPath();
    FileStatus fstat = fs.getFileStatus(targetFilePath);
    // TODO set codec
    return new DatasetJsonRecord(schemaString, abstractPath, fstat.getModificationTime(),
        fstat.getOwner(), fstat.getGroup(), fstat.getPermission().toString(), null, storage, "");
}
示例4: getSampleData
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Collects up to ten sample records from a Parquet file, each rendered with
 * {@code toString()} and stripped of newline, carriage-return and {@code \p{C}} characters.
 *
 * <p>An empty file yields a {@link SampleDataRecord} with an empty sample list.
 *
 * @param targetFilePath path of the Parquet file to sample
 * @return the sample record holding the file path and the collected samples
 * @throws IOException if the file cannot be opened or read
 */
@Override
public SampleDataRecord getSampleData(Path targetFilePath)
    throws IOException {
    final int maxSamples = 10;
    List<Object> list = new ArrayList<Object>();

    // The reader was previously never closed, and only a single record was ever wrapped in
    // an iterator, so the ten-record bound could not take effect.
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(targetFilePath).build()) {
        GenericRecord record;
        // TODO handle out of memory error
        while (list.size() < maxSamples && (record = reader.read()) != null) {
            // NOTE(review): in a regex replacement string a backslash escapes the next
            // character, so the second replaceAll appears to leave quotes unchanged;
            // confirm whether backslash-escaping the quotes was intended.
            list.add(record.toString().replaceAll("[\\n\\r\\p{C}]", "").replaceAll("\"", "\\\""));
        }
    }

    return new SampleDataRecord(targetFilePath.toUri().getPath(), list);
}
示例5: validateParquetFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Asserts that a Parquet file holds exactly {@code recourdCount} rows whose fields follow
 * the expected pattern for row index {@code i}: {@code b = (i % 2 == 0)},
 * {@code s = String.valueOf(i)}, {@code l = i}, {@code l100 = i % 100} and
 * {@code s100 = String.valueOf(i % 100)}.
 *
 * @param parquetFile file to validate
 * @param recourdCount number of rows the file is expected to contain
 * @throws IOException if the file cannot be opened or read
 */
public void validateParquetFile(Path parquetFile, long recourdCount) throws IOException {
    // Typed reader removes the raw type and the cast; try-with-resources guarantees the
    // reader is closed even when an assertion fails.
    try (ParquetReader<GenericData.Record> reader =
             AvroParquetReader.<GenericData.Record>builder(parquetFile).build()) {
        for (long i = 0; i < recourdCount; i++) {
            GenericData.Record actualRow = reader.read();
            Assert.assertNotNull("Can't read row " + i, actualRow);

            // Expected value comes first so failure messages label the two sides correctly.
            Assert.assertEquals("Value different in row " + i + " for key b", i % 2 == 0, actualRow.get("b"));
            Assert.assertEquals("Value different in row " + i + " for key s", new Utf8(String.valueOf(i)), actualRow.get("s"));
            Assert.assertEquals("Value different in row " + i + " for key l", i, actualRow.get("l"));
            Assert.assertEquals("Value different in row " + i + " for key l100", i % 100, actualRow.get("l100"));
            Assert.assertEquals("Value different in row " + i + " for key s100", new Utf8(String.valueOf(i % 100)), actualRow.get("s100"));
        }

        // A further successful read means the file has surplus rows.
        Assert.assertNull("Parquet file contains more then expected rows", reader.read());
    }
}
示例6: validateParquetFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Asserts that a Parquet file contains exactly the rows in {@code data}, in order.
 *
 * <p>For each expected row, every key/value pair of the map is compared with the same-named
 * field of the row read from the file. Fields that are absent from the expected map are
 * not checked.
 *
 * @param parquetFile file to validate
 * @param data expected rows, one map of field name to value per row
 * @throws IOException if the file cannot be opened or read
 */
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
    // Typed reader removes the raw type and the cast; try-with-resources guarantees the
    // reader is closed even when an assertion fails.
    try (ParquetReader<GenericData.Record> reader =
             AvroParquetReader.<GenericData.Record>builder(parquetFile).build()) {
        int position = 0;
        for (Map<String, Object> expectedRow : data) {
            GenericData.Record actualRow = reader.read();
            Assert.assertNotNull("Can't read row " + position, actualRow);

            for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
                Object value = actualRow.get(entry.getKey());
                Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
            }

            // Advance the row index; it was previously never incremented, so every failure
            // message reported row 0.
            position++;
        }

        // A further successful read means the file has surplus rows.
        Assert.assertNull("Parquet file contains more then expected rows", reader.read());
    }
}
示例7: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Reads {@code nRows} records from a Parquet file and passes the first value of each of
 * the eight named fields of every record to {@code blackhole}.
 *
 * @param parquetFile file to read
 * @param nRows number of records to read; the file is expected to hold at least this many
 * @param blackhole sink that consumes each field value
 * @throws IOException if the file cannot be opened or read
 */
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
    // try-with-resources closes the reader even if a read or field access throws; the
    // trailing close() call used before was skipped in that case.
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build()) {
        for (int i = 0; i < nRows; i++) {
            Group group = reader.read();
            blackhole.consume(group.getBinary("binary_field", 0));
            blackhole.consume(group.getInteger("int32_field", 0));
            blackhole.consume(group.getLong("int64_field", 0));
            blackhole.consume(group.getBoolean("boolean_field", 0));
            blackhole.consume(group.getFloat("float_field", 0));
            blackhole.consume(group.getDouble("double_field", 0));
            blackhole.consume(group.getBinary("flba_field", 0));
            blackhole.consume(group.getInt96("int96_field", 0));
        }
    }
}
示例8: countFilteredRecords
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Counts the records of a Parquet file that the reader returns when {@code pred} is
 * applied as a filter.
 *
 * @param path file to read
 * @param pred predicate handed to the reader through {@link FilterCompat#get}
 * @return number of non-null records read before the reader reported end of file
 * @throws IOException if the file cannot be opened or read
 */
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
    ParquetReader<Group> filteredReader = ParquetReader
            .builder(new GroupReadSupport(), path)
            .withFilter(FilterCompat.get(pred))
            .build();

    long matched = 0;
    try {
        // Only the number of records matters, so each record is dropped once counted.
        for (Group record = filteredReader.read(); record != null; record = filteredReader.read()) {
            matched++;
        }
    } finally {
        filteredReader.close();
    }
    return matched;
}
示例9: readFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Reads every record of a Parquet file that passes {@code filter}.
 *
 * <p>A fresh {@link Configuration} is created and {@code schema} is set on it through
 * {@link GroupWriteSupport#setSchema} before the reader is built.
 *
 * @param f file to read
 * @param filter filter handed to the reader builder
 * @return the records returned by the reader, in read order; empty if none were returned
 * @throws IOException if the file cannot be opened or read
 */
public static List<Group> readFile(File f, Filter filter) throws IOException {
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);

    List<Group> users = new ArrayList<Group>();
    // The reader was previously never closed; try-with-resources releases it on every path.
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), new Path(f.getAbsolutePath()))
                 .withConf(conf)
                 .withFilter(filter)
                 .build()) {
        Group current;
        // read() returning null marks the end of the file.
        while ((current = reader.read()) != null) {
            users.add(current);
        }
    }
    return users;
}
示例10: testWriteFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes an address book holding one person to a file and checks that exactly one record
 * is read back, with the expected repetition count for {@code persons} and the expected
 * email for the first person.
 */
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
    final AddressBook a = new AddressBook(
        Arrays.asList(
            new Person(
                new Name("Bob", "Roberts"),
                0,
                "[email protected]",
                Arrays.asList(new PhoneNumber("1234567890")))));
    final Path fileToCreate = createFile(a);

    int i = 0;
    // The reader was previously never closed; try-with-resources releases it even when an
    // assertion fails.
    try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
        Group g;
        while ((g = reader.read()) != null) {
            assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
            assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
            // just some sanity check, we're testing the various layers somewhere else
            ++i;
        }
    }
    assertEquals("read 1 record", 1, i);
}
示例11: testWriteFileListOfMap
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes a struct holding a list of two maps and checks, for each record read back, that
 * the list has the expected number of {@code names_tuple} entries and that each entry has
 * the expected number of {@code map} repetitions.
 */
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
    Map<String, String> map1 = new HashMap<String,String>();
    map1.put("key11", "value11");
    map1.put("key12", "value12");
    Map<String, String> map2 = new HashMap<String,String>();
    map2.put("key21", "value21");
    final TestMapInList listMap = new TestMapInList("listmap",
        Arrays.asList(map1, map2));

    final Path fileToCreate = createFile(listMap);

    // The reader was previously never closed; try-with-resources releases it even when an
    // assertion fails.
    try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
        Group g;
        while ((g = reader.read()) != null) {
            assertEquals(listMap.names.size(),
                g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
            assertEquals(listMap.names.get(0).size(),
                g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
            assertEquals(listMap.names.get(1).size(),
                g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
        }
    }
}
示例12: testWriteFileMapOfList
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes a struct holding a map from a string key to a two-element list and checks, for
 * each record read back, the key text and the repetition count of the value's first field.
 */
@Test
public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
    Map<String, List<String>> map = new HashMap<String,List<String>>();
    map.put("key", Arrays.asList("val1","val2"));
    final TestListInMap mapList = new TestListInMap("maplist", map);
    final Path fileToCreate = createFile(mapList);

    // The reader was previously never closed; try-with-resources releases it even when an
    // assertion fails.
    try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
        Group g;
        while ((g = reader.read()) != null) {
            assertEquals("key",
                g.getGroup("names", 0).getGroup("map",0).getBinary("key", 0).toStringUsingUTF8());
            assertEquals(map.get("key").size(),
                g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getFieldRepetitionCount(0));
        }
    }
}
示例13: testWriteFileMapOfLists
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes a struct holding a map whose key and value are both two-element lists and checks,
 * for each record read back, the two {@code key_tuple} and two {@code value_tuple} strings.
 */
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
    Map<List<String>, List<String>> map = new HashMap<List<String>,List<String>>();
    map.put(Arrays.asList("key1","key2"), Arrays.asList("val1","val2"));
    final TestListsInMap mapList = new TestListsInMap("maplists", map);
    final Path fileToCreate = createFile(mapList);

    // The reader was previously never closed; try-with-resources releases it even when an
    // assertion fails.
    try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
        Group g;
        while ((g = reader.read()) != null) {
            assertEquals("key1",
                g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
            assertEquals("key2",
                g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
            assertEquals("val1",
                g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
            assertEquals("val2",
                g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
        }
    }
}
示例14: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Reads all records of a Parquet file through {@link AvroParquetReader} using the given
 * data model.
 *
 * <p>{@code schema} is set on a fresh, defaults-free {@link Configuration} both as the
 * requested projection and as the Avro read schema.
 *
 * @param model Avro data model passed to the reader builder
 * @param schema schema used for both projection and reading
 * @param file Parquet file to read
 * @param <D> record type produced by the reader
 * @return the records in read order; empty if the reader returned none
 * @throws IOException if the file cannot be opened or read
 */
public static <D> List<D> read(GenericData model, Schema schema, File file) throws IOException {
    List<D> records = new ArrayList<D>();

    Configuration readConf = new Configuration(false);
    AvroReadSupport.setRequestedProjection(readConf, schema);
    AvroReadSupport.setAvroReadSchema(readConf, schema);

    Path parquetPath = new Path(file.toString());
    ParquetReader<D> reader = AvroParquetReader
        .<D>builder(parquetPath)
        .withDataModel(model) // reflect disables compatibility
        .withConf(readConf)
        .build();

    try {
        // read() returning null marks the end of the file.
        for (D next = reader.read(); next != null; next = reader.read()) {
            records.add(next);
        }
    } finally {
        reader.close();
    }
    return records;
}
示例15: testFilterOnSubAttribute
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Checks that filtering on nested {@code engine.*} columns returns exactly one matching
 * car for each of three filters: engine type, engine capacity and turbo-charger flag.
 */
@Test
public void testFilterOnSubAttribute() throws IOException {
    Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);

    // Each reader gets its own try-with-resources block; previously the variable was
    // reassigned and none of the three readers was ever closed.
    try (ParquetReader<Car> reader =
             new AvroParquetReader<Car>(testConf, path, column("engine.type", equalTo(EngineType.DIESEL)))) {
        // Expected value first, matching the other assertions in this test.
        assertEquals(getVwPassat().toString(), reader.read().toString());
        assertNull(reader.read());
    }

    try (ParquetReader<Car> reader =
             new AvroParquetReader<Car>(testConf, path, column("engine.capacity", equalTo(1.4f)))) {
        assertEquals(getVwPolo().toString(), reader.read().toString());
        assertNull(reader.read());
    }

    try (ParquetReader<Car> reader =
             new AvroParquetReader<Car>(testConf, path, column("engine.hasTurboCharger", equalTo(true)))) {
        assertEquals(getBmwMini().toString(), reader.read().toString());
        assertNull(reader.read());
    }
}