This article collects typical usage examples of the Java method org.apache.parquet.hadoop.metadata.ParquetMetadata.getFileMetaData. If you have been wondering what ParquetMetadata.getFileMetaData does, how to call it, or what real code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples of the class this method belongs to, org.apache.parquet.hadoop.metadata.ParquetMetadata.
The sections below present 10 code examples of ParquetMetadata.getFileMetaData, listed roughly in order of popularity.
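Before the individual examples, here is a minimal, self-contained sketch of the most common pattern: open a Parquet file, read its footer, and inspect the FileMetaData it exposes. This sketch is not taken from any of the examples below; it assumes a reasonably recent parquet-mr release (where ParquetFileReader.open and getFooter are available) and uses a placeholder command-line file path.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterInspector {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // placeholder: path to some Parquet file
    // try-with-resources closes the underlying input stream when done
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      ParquetMetadata footer = reader.getFooter();
      FileMetaData fileMetaData = footer.getFileMetaData();
      System.out.println("schema:     " + fileMetaData.getSchema());
      System.out.println("created by: " + fileMetaData.getCreatedBy());
      System.out.println("key/value:  " + fileMetaData.getKeyValueMetaData());
      System.out.println("row groups: " + footer.getBlocks().size());
    }
  }
}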
Example 1: test
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());
  PageStatsValidator validator = new PageStatsValidator();
  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
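One general note on these snippets: none of them close the ParquetFileReader they create. ParquetFileReader is Closeable, so production code would normally wrap the read loop in try-with-resources. A minimal sketch of Example 1's loop written that way (configuration, metadata, fsPath and validator are assumed to be in scope, as in the example):

try (ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(), fsPath, metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns())) {
  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    // validate the page statistics of every row group, as in Example 1
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}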
Example 2: read
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
@Test
public void read(String fileName) throws IOException
{
  Path path = new Path(fileName);
  Configuration conf = new Configuration();
  conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
  ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path,
      metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
  PageReadStore pageReadStore;
  PageReader pageReader;
  DataPage page;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    for (ColumnDescriptor cd : metadata.getFileMetaData().getSchema().getColumns()) {
      pageReader = pageReadStore.getPageReader(cd);
      page = pageReader.readPage();
    }
  }
}
Example 3: readFirstRecords
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
private static ParquetPreviewParseWriter readFirstRecords(ParquetParseSetup initSetup, ByteVec vec, int cnt) {
  ParquetMetadata metadata = VecParquetReader.readFooter(initSetup.parquetMetadata);
  List<BlockMetaData> blockMetaData;
  if (metadata.getBlocks().isEmpty()) {
    blockMetaData = Collections.<BlockMetaData>emptyList();
  } else {
    final BlockMetaData firstBlock = findFirstBlock(metadata);
    blockMetaData = Collections.singletonList(firstBlock);
  }
  ParquetMetadata startMetadata = new ParquetMetadata(metadata.getFileMetaData(), blockMetaData);
  ParquetPreviewParseWriter ppWriter = new ParquetPreviewParseWriter(initSetup);
  VecParquetReader reader = new VecParquetReader(vec, startMetadata, ppWriter, ppWriter._roughTypes);
  try {
    int recordCnt = 0;
    Integer recordNum;
    do {
      recordNum = reader.read();
    } while ((recordNum != null) && (++recordCnt < cnt));
    return ppWriter;
  } catch (IOException e) {
    throw new RuntimeException("Failed to read the first few records", e);
  }
}
Example 4: ParquetRowReader
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException
{
  this.filePath = filePath;
  ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
  this.fileSchema = fileMetadata.getSchema();
  Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(keyValueMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);
  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.total = total;
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  logger.info("ParquetRowReader initialized will read a total of " + total + " records.");
}
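For context, a reader like the one above is normally constructed with a concrete ReadSupport implementation. A hypothetical usage sketch, assuming the surrounding ParquetRowReader<T> class compiles as excerpted and using parquet-mr's bundled GroupReadSupport; the iteration API of ParquetRowReader is not shown in the excerpt, so only construction is illustrated here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetRowReaderUsage {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path input = new Path(args[0]); // placeholder path to a Parquet file
    // GroupReadSupport materializes each record as a Group
    ParquetRowReader<Group> rows = new ParquetRowReader<>(conf, input, new GroupReadSupport());
    // ... read records using whatever API ParquetRowReader exposes (not shown above)
  }
}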
Example 5: ParquetFileReader
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
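Because this constructor is deprecated, newer parquet-mr code typically opens the file through an InputFile plus ParquetReadOptions and lets the reader parse the footer itself, then pulls the FileMetaData from that footer. A minimal sketch of that replacement pattern, assuming a recent parquet-mr version; the path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class OpenWithOptions {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // placeholder Parquet file path
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, conf),
        HadoopReadOptions.builder(conf).build())) {
      // the reader parses the footer itself; no separate readFooter call is needed
      FileMetaData fileMetaData = reader.getFooter().getFileMetaData();
      System.out.println(fileMetaData.getSchema());
    }
  }
}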
Example 6: mergeMetadataFiles
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }
  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
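If this is the helper from parquet-mr's ParquetFileWriter (where a deprecated static mergeMetadataFiles lives), a call site would look roughly like the sketch below. The _metadata paths are placeholders, and the call throws if the schemas or extra key/value metadata of the inputs are incompatible.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class MergeFooters {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // placeholder _metadata summary files written by older Parquet jobs
    List<Path> metadataFiles = Arrays.asList(
        new Path("hdfs:///warehouse/t1/_metadata"),
        new Path("hdfs:///warehouse/t2/_metadata"));
    // merges the footers; fails if schemas or extra metadata are not compatible
    ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, conf);
    System.out.println(merged.getFileMetaData().getSchema());
  }
}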
Example 7: readBlocksFromFile
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());
  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }
  return rowGroups;
}
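As a quick follow-up, each PageReadStore returned by readBlocksFromFile corresponds to one row group, and its getRowCount() reports how many rows that row group holds. A small sketch that sums them, assuming the helper above and a file variable are in scope:

List<PageReadStore> rowGroups = readBlocksFromFile(file);
long totalRows = 0;
for (PageReadStore rowGroup : rowGroups) {
  // PageReadStore.getRowCount() returns the number of rows in that row group
  totalRows += rowGroup.getRowCount();
}
System.out.println(rowGroups.size() + " row groups, " + totalRows + " rows");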
Example 8: getSplit
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf
    ) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext =
        new DataWritableReadSupport()
            .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
        ).getFieldCount();
    return new ParquetInputSplit(
        finalPath,
        splitStart,
        splitStart + splitLength,
        splitLength,
        fileSplit.getLocations(),
        null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
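Note the SKIP_ROW_GROUPS filter used when reading the footer here: unlike the NO_FILTER variant in the earlier examples, it decodes only the file-level metadata (schema, key/value metadata, createdBy) and skips the per-row-group block metadata, which is all this split-construction path needs. A short illustrative sketch, assuming conf and path are in scope:

// Footer read with SKIP_ROW_GROUPS: FileMetaData is fully available,
// but the per-row-group metadata is not decoded.
ParquetMetadata footer =
    ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.SKIP_ROW_GROUPS);
MessageType schema = footer.getFileMetaData().getSchema(); // schema is available
// footer.getBlocks() is not populated when row groups are skipped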
Example 9: check
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);
  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));
    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);
    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
          pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }
  return null;
}
Example 10: test
import org.apache.parquet.hadoop.metadata.ParquetMetadata; // import the package/class this method depends on
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = new BinaryStatistics();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}