This article collects typical usage examples of the Java method parquet.hadoop.metadata.ParquetMetadata.getBlocks. If you are looking for concrete examples of how to call ParquetMetadata.getBlocks in Java, the curated code samples below should help. You can also explore further usage examples of the enclosing class, parquet.hadoop.metadata.ParquetMetadata.
The following shows 10 code examples of the ParquetMetadata.getBlocks method, sorted by popularity by default.
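Before the project examples, here is a minimal self-contained sketch of the basic pattern: read a file's footer and iterate over its row groups via ParquetMetadata.getBlocks. It assumes the classic pre-org.apache parquet-mr API, where parquet.hadoop.ParquetFileReader exposes a static readFooter(Configuration, Path) overload; the class name and the command-line path are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class FooterBlocksSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]); // hypothetical path to an existing Parquet file
        // Read only the footer; it contains one BlockMetaData entry per row group.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("rows=" + block.getRowCount()
                + " totalByteSize=" + block.getTotalByteSize()
                + " columns=" + block.getColumns().size());
        }
    }
}

The project examples below follow the same pattern, differing mainly in how they select or summarize the returned row groups.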
Example 1: getRowGroupNumbersFromFileSplit
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
/**
* Get the list of row group numbers for the given file input split. The logic used here is the same as the logic Hive's
* Parquet input format uses to find the row group numbers for an input split.
*/
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
final ParquetMetadata footer) throws IOException {
final List<BlockMetaData> blocks = footer.getBlocks();
final long splitStart = split.getStart();
final long splitLength = split.getLength();
final List<Integer> rowGroupNums = Lists.newArrayList();
int i = 0;
for (final BlockMetaData block : blocks) {
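// A row group is assigned to this split if the first data page of its first column chunk starts inside the split's byte range.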
final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
rowGroupNums.add(i);
}
i++;
}
return rowGroupNums;
}
Example 2: add
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private static void add(ParquetMetadata footer) {
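// blockCount, recordCount and the add(ColumnDescriptor, ...) overload used below are static accumulators defined elsewhere in the enclosing class.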
for (BlockMetaData blockMetaData : footer.getBlocks()) {
++blockCount;
MessageType schema = footer.getFileMetaData().getSchema();
recordCount += blockMetaData.getRowCount();
List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
for (ColumnChunkMetaData columnMetaData : columns) {
ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
add(
desc,
columnMetaData.getValueCount(),
columnMetaData.getTotalSize(),
columnMetaData.getTotalUncompressedSize(),
columnMetaData.getEncodings(),
columnMetaData.getStatistics());
}
}
}
Example 3: toParquetMetadata
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
List<BlockMetaData> blocks = parquetMetadata.getBlocks();
List<RowGroup> rowGroups = new ArrayList<RowGroup>();
long numRows = 0; // use long: getRowCount() returns a long and the sum across row groups can exceed Integer.MAX_VALUE
for (BlockMetaData block : blocks) {
numRows += block.getRowCount();
addRowGroup(parquetMetadata, rowGroups, block);
}
FileMetaData fileMetaData = new FileMetaData(
currentVersion,
toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
numRows,
rowGroups);
Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
for (Entry<String, String> keyValue : keyValues) {
addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
}
fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
return fileMetaData;
}
Example 4: showDetails
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
showDetails(out, meta.getFileMetaData());
long i = 1;
for (BlockMetaData bmeta : meta.getBlocks()) {
out.println();
showDetails(out, bmeta, i++);
}
}
Example 5: createHDFSPageSource
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
private Optional<ConnectorPageSource> createHDFSPageSource(
Path path,
long start,
long length,
List<HDFSColumnHandle> columns)
{
Optional<FileSystem> fileSystemOptional = fsFactory.getFS(path);
FileSystem fileSystem;
ParquetDataSource dataSource;
if (fileSystemOptional.isPresent()) {
fileSystem = fileSystemOptional.get();
}
else {
throw new RuntimeException("Could not find filesystem for path " + path);
}
try {
dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
// the default length is the file size, which means the whole file is a single split
length = dataSource.getSize();
ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
List<Type> fields = columns.stream()
.filter(column -> column.getColType() != HDFSColumnHandle.ColumnType.NOTVALID)
.map(column -> getParquetType(column, fileSchema))
.filter(Objects::nonNull)
.collect(Collectors.toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
List<BlockMetaData> blocks = new ArrayList<>();
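// Keep only the row groups whose first data page offset falls inside this split's byte range.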
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
blocks.add(block);
}
}
ParquetReader parquetReader = new ParquetReader(
fileSchema,
requestedSchema,
blocks,
dataSource,
typeManager);
return Optional.of(new HDFSPageSource(
parquetReader,
dataSource,
fileSchema,
requestedSchema,
length,
columns,
typeManager));
}
catch (IOException e) {
log.error(e);
return Optional.empty();
}
}
Example 6: getSplits
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
long maxSplitSize, long minSplitSize, ReadContext readContext)
throws IOException {
List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
Filter filter = ParquetInputFormat.getFilter(configuration);
long rowGroupsDropped = 0;
long totalRowGroups = 0;
for (Footer footer : footers) {
final Path file = footer.getFile();
LOG.debug(file);
FileSystem fs = file.getFileSystem(configuration);
FileStatus fileStatus = fs.getFileStatus(file);
ParquetMetadata parquetMetaData = footer.getParquetMetadata();
List<BlockMetaData> blocks = parquetMetaData.getBlocks();
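// Count all row groups, then drop the ones that cannot match the filter predicate before generating splits.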
List<BlockMetaData> filteredBlocks;
totalRowGroups += blocks.size();
filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
rowGroupsDropped += blocks.size() - filteredBlocks.size();
if (filteredBlocks.isEmpty()) {
continue;
}
BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
splits.addAll(
generateSplits(
filteredBlocks,
fileBlockLocations,
fileStatus,
readContext.getRequestedSchema().toString(),
readContext.getReadSupportMetadata(),
minSplitSize,
maxSplitSize)
);
}
if (rowGroupsDropped > 0 && totalRowGroups > 0) {
int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
LOG.info("Dropping " + rowGroupsDropped + " row groups that do not pass filter predicate! (" + percentDropped + "%)");
} else {
LOG.info("There were no row groups that could be dropped due to filter predicates");
}
return splits;
}
Example 7: test
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void test() throws Exception {
Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
Path root = file.getParent();
FileSystem fs = file.getFileSystem(conf);
if (fs.exists(root)) {
fs.delete(root, true);
}
fs.mkdirs(root);
MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
ColumnDescriptor col = schema.getColumns().get(0);
Encoding dataEncoding = PLAIN;
int valueCount = 10;
int d = 1;
int r = 2;
int v = 3;
BytesInput definitionLevels = BytesInput.fromInt(d);
BytesInput repetitionLevels = BytesInput.fromInt(r);
Statistics<?> statistics = new BinaryStatistics();
BytesInput data = BytesInput.fromInt(v);
int rowCount = 5;
int nullCount = 1;
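// Write phase: create the file with a single row group containing one v2 data page.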
{
ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
writer.start();
writer.startBlock(rowCount);
{
ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, initialSize);
PageWriter pageWriter = store.getPageWriter(col);
pageWriter.writePageV2(
rowCount, nullCount, valueCount,
repetitionLevels, definitionLevels,
dataEncoding, data,
statistics);
store.flushToFileWriter(writer);
}
writer.endBlock();
writer.end(new HashMap<String, String>());
}
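// Read phase: reopen the file via its footer and check that the page contents round-trip intact.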
{
ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
ParquetFileReader reader = new ParquetFileReader(conf, file, footer.getBlocks(), schema.getColumns());
PageReadStore rowGroup = reader.readNextRowGroup();
PageReader pageReader = rowGroup.getPageReader(col);
DataPageV2 page = (DataPageV2) pageReader.readPage();
assertEquals(rowCount, page.getRowCount());
assertEquals(nullCount, page.getNullCount());
assertEquals(valueCount, page.getValueCount());
assertEquals(d, intValue(page.getDefinitionLevels()));
assertEquals(r, intValue(page.getRepetitionLevels()));
assertEquals(dataEncoding, page.getDataEncoding());
assertEquals(v, intValue(page.getData()));
assertEquals(statistics.toString(), page.getStatistics().toString());
reader.close();
}
}
Example 8: test
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void test() throws Exception {
Configuration conf = new Configuration();
Path root = new Path("target/tests/TestParquetWriter/");
FileSystem fs = root.getFileSystem(conf);
if (fs.exists(root)) {
fs.delete(root, true);
}
fs.mkdirs(root);
MessageType schema = parseMessageType(
"message test { "
+ "required binary binary_field; "
+ "required int32 int32_field; "
+ "required int64 int64_field; "
+ "required boolean boolean_field; "
+ "required float float_field; "
+ "required double double_field; "
+ "required fixed_len_byte_array(3) flba_field; "
+ "required int96 int96_field; "
+ "optional binary null_field; "
+ "} ");
GroupWriteSupport.setSchema(schema, conf);
SimpleGroupFactory f = new SimpleGroupFactory(schema);
Map<String, Encoding> expected = new HashMap<String, Encoding>();
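// Expected encoding of the binary column, keyed by "<modulo>-<writer version>": low cardinality should dictionary-encode, high cardinality should not.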
expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
expected.put("1000-" + PARQUET_1_0, PLAIN);
expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
for (int modulo : asList(10, 1000)) {
for (WriterVersion version : WriterVersion.values()) {
Path file = new Path(root, version.name() + "_" + modulo);
ParquetWriter<Group> writer = new ParquetWriter<Group>(
file,
new GroupWriteSupport(null),
UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
for (int i = 0; i < 1000; i++) {
writer.write(
f.newGroup()
.append("binary_field", "test" + (i % modulo))
.append("int32_field", 32)
.append("int64_field", 64l)
.append("boolean_field", true)
.append("float_field", 1.0f)
.append("double_field", 2.0d)
.append("flba_field", "foo")
.append("int96_field", Binary.fromByteArray(new byte[12])));
}
writer.close();
ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
for (int i = 0; i < 1000; i++) {
Group group = reader.read();
assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
assertEquals(32, group.getInteger("int32_field", 0));
assertEquals(64l, group.getLong("int64_field", 0));
assertEquals(true, group.getBoolean("boolean_field", 0));
assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
assertEquals(Binary.fromByteArray(new byte[12]), group.getInt96("int96_field", 0));
assertEquals(0, group.getFieldRepetitionCount("null_field"));
}
reader.close();
ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
for (BlockMetaData blockMetaData : footer.getBlocks()) {
for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
if (column.getPath().toDotString().equals("binary_field")) {
String key = modulo + "-" + version;
Encoding expectedEncoding = expected.get(key);
assertTrue(
key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
column.getEncodings().contains(expectedEncoding));
}
}
}
}
}
}
Example 9: test
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
@Test
public void test() throws Exception {
Configuration conf = new Configuration();
Path root = new Path("target/tests/TestParquetWriter/");
enforceEmptyDir(conf, root);
MessageType schema = parseMessageType(
"message test { "
+ "required binary binary_field; "
+ "required int32 int32_field; "
+ "required int64 int64_field; "
+ "required boolean boolean_field; "
+ "required float float_field; "
+ "required double double_field; "
+ "required fixed_len_byte_array(3) flba_field; "
+ "required int96 int96_field; "
+ "} ");
GroupWriteSupport.setSchema(schema, conf);
SimpleGroupFactory f = new SimpleGroupFactory(schema);
Map<String, Encoding> expected = new HashMap<String, Encoding>();
expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
expected.put("1000-" + PARQUET_1_0, PLAIN);
expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
for (int modulo : asList(10, 1000)) {
for (WriterVersion version : WriterVersion.values()) {
Path file = new Path(root, version.name() + "_" + modulo);
ParquetWriter<Group> writer = new ParquetWriter<Group>(
file,
new GroupWriteSupport(null),
UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
for (int i = 0; i < 1000; i++) {
writer.write(
f.newGroup()
.append("binary_field", "test" + (i % modulo))
.append("int32_field", 32)
.append("int64_field", 64l)
.append("boolean_field", true)
.append("float_field", 1.0f)
.append("double_field", 2.0d)
.append("flba_field", "foo")
.append("int96_field", Binary.fromByteArray(new byte[12])));
}
writer.close();
ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
for (int i = 0; i < 1000; i++) {
Group group = reader.read();
assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
assertEquals(32, group.getInteger("int32_field", 0));
assertEquals(64l, group.getLong("int64_field", 0));
assertEquals(true, group.getBoolean("boolean_field", 0));
assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
assertEquals(Binary.fromByteArray(new byte[12]), group.getInt96("int96_field", 0));
}
reader.close();
ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
for (BlockMetaData blockMetaData : footer.getBlocks()) {
for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
if (column.getPath().toDotString().equals("binary_field")) {
String key = modulo + "-" + version;
Encoding expectedEncoding = expected.get(key);
assertTrue(
key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
column.getEncodings().contains(expectedEncoding));
}
}
}
}
}
}
Example 10: createParquetPageSource
import parquet.hadoop.metadata.ParquetMetadata; // import the package/class the method depends on
public static ParquetPageSource createParquetPageSource(
Configuration configuration,
Path path,
long start,
long length,
Properties schema,
List<HiveColumnHandle> columns,
List<HivePartitionKey> partitionKeys,
boolean useParquetColumnNames,
TypeManager typeManager,
boolean predicatePushdownEnabled,
TupleDomain<HiveColumnHandle> effectivePredicate)
{
try {
ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(configuration, path);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
List<parquet.schema.Type> fields = columns.stream()
.filter(column -> !column.isPartitionKey())
.map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
.filter(Objects::nonNull)
.collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
List<BlockMetaData> blocks = new ArrayList<>();
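// Select the row groups that overlap this split; they may be pruned further by the predicate pushdown below.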
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
blocks.add(block);
}
}
if (predicatePushdownEnabled) {
ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
blocks = blocks.stream()
.filter(block -> predicateMatches(parquetPredicate, block, configuration, path, requestedSchema, effectivePredicate))
.collect(toList());
}
ParquetReader parquetReader = new ParquetReader(fileMetaData.getSchema(),
fileMetaData.getKeyValueMetaData(),
requestedSchema,
path,
blocks,
configuration);
return new ParquetPageSource(parquetReader,
requestedSchema,
path,
length,
schema,
columns,
partitionKeys,
effectivePredicate,
typeManager);
}
catch (IOException e) {
throw Throwables.propagate(e);
}
}