This article collects typical usage examples of the Java class org.apache.parquet.format.converter.ParquetMetadataConverter. If you are wondering what ParquetMetadataConverter does or how to use it, the curated examples below should help.
The ParquetMetadataConverter class belongs to the org.apache.parquet.format.converter package. Fifteen code examples of the class are shown below, ordered by popularity.
Example 1: ParquetReadOptions
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.properties = Collections.unmodifiableMap(properties);
}
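Callers rarely invoke this package-private constructor directly. A minimal sketch of how these options are usually assembled, assuming the ParquetReadOptions.builder() API available in parquet-hadoop 1.10+:

// A minimal sketch, assuming the builder API from parquet-hadoop 1.10+.
ParquetReadOptions options = ParquetReadOptions.builder()
    .withMetadataFilter(ParquetMetadataConverter.NO_FILTER) // deserialize the complete footer
    .useStatsFilter(true)       // allow row groups to be pruned via column statistics
    .useDictionaryFilter(true)  // allow pruning via dictionary pages
    .build();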
Example 2: test
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());
  PageStatsValidator validator = new PageStatsValidator();
  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example 3: getParquetSchema
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);
    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
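Note that the ParquetFileReader above is never closed. When only the schema is needed and the file is already known to be Parquet, reading just the footer avoids holding the file open. A minimal sketch using the deprecated but widely used static readFooter (the path is hypothetical):

// Minimal sketch: read only the schema of a known Parquet file.
Configuration conf = new Configuration();
ParquetMetadata footer = ParquetFileReader.readFooter(
    conf, new Path("/tmp/example.parquet"), ParquetMetadataConverter.NO_FILTER); // hypothetical path
MessageType schema = footer.getFileMetaData().getSchema();
System.out.println(schema);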
Example 4: readDictionaries
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
/**
 * Return the dictionary per row group for all binary columns in the given Parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and the set of binary fields which are not dictionary encoded.
 * @throws IOException if reading the footer or a dictionary page fails
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
        format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
            parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }
  // columns which are found in the parquet file but are not dictionary encoded
  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet();
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if the first page is dictionary encoded then load the dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
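A hedged usage sketch of the method above. Pair is org.apache.commons.lang3.tuple.Pair, and getMaxId() is part of org.apache.parquet.column.Dictionary; fs, filePath, and codecFactory are assumed to already exist:

// Sketch: consume the dictionaries returned above.
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
    readDictionaries(fs, filePath, codecFactory);
for (Map.Entry<ColumnDescriptor, Dictionary> e : result.getLeft().entrySet()) {
  Dictionary dict = e.getValue();
  System.out.println(e.getKey() + " has " + (dict.getMaxId() + 1) + " dictionary entries");
}
for (ColumnDescriptor skipped : result.getRight()) {
  System.out.println(skipped + " is not dictionary encoded");
}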
Example 5: run
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);
  console.info("\nFile path: {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());
  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = " %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }
  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);
  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }
  console.info("");
  return 0;
}
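The maxSize helper is not shown above; a plausible implementation (hypothetical, for illustration) just measures the longest key so the printed properties line up:

// Hypothetical helper: width of the longest key, used to right-align property names.
private static int maxSize(Iterable<String> strings) {
  int size = 0;
  for (String s : strings) {
    size = Math.max(size, s.length());
  }
  return size;
}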
Example 6: getParquetFileMetadata
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
private ParquetFileMetadata getParquetFileMetadata(FileStatus file) throws IOException {
  final ParquetMetadata metadata =
      SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);
  MessageType schema = metadata.getFileMetaData().getSchema();
  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }
  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();
  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));
      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal == maxVal; in that case we store only maxVal.
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }
    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }
  return new ParquetFileMetadata(file, file.getLen(), rowGroupMetadataList, columnTypeInfo);
}
Example 7: getColNameToSchemaElementMapping
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
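A usage sketch of the mapping. SchemaElement here is the Thrift-generated org.apache.parquet.format.SchemaElement, so the getters follow the Thrift convention; `footer` is assumed to exist, and "id" is a hypothetical column name:

// Sketch: look up the Thrift SchemaElement for one column.
Map<String, SchemaElement> elements = getColNameToSchemaElementMapping(footer);
SchemaElement idElement = elements.get("id"); // hypothetical column name
if (idElement != null) {
  System.out.println(idElement.getName() + " -> " + idElement.getType());
}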
Example 8: getFooter
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
public ParquetMetadata getFooter(FileSystemWrapper fs, Path file) {
  if (footer == null || !lastFile.equals(file)) {
    try {
      footer = readFooter(fs, file, ParquetMetadataConverter.NO_FILTER);
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to read parquet footer for file " + file, ioe);
    }
    lastFile = file;
  }
  return footer;
}
Example 9: readFooter
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
public static ParquetMetadata readFooter(final FileSystem fs, final Path file, ParquetMetadataConverter.MetadataFilter filter) throws IOException {
  return readFooter(fs, fs.getFileStatus(file), filter);
}
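The MetadataFilter parameter controls how much of the footer is deserialized. A sketch of the filters ParquetMetadataConverter provides (NO_FILTER and SKIP_ROW_GROUPS are static fields; range is a static factory method):

// Read everything: all row-group metadata is deserialized.
ParquetMetadataConverter.MetadataFilter all = ParquetMetadataConverter.NO_FILTER;
// Skip row groups: only file-level metadata (schema, key/value pairs) is read.
ParquetMetadataConverter.MetadataFilter schemaOnly = ParquetMetadataConverter.SKIP_ROW_GROUPS;
// Keep only row groups whose midpoints fall within [startOffset, endOffset).
ParquetMetadataConverter.MetadataFilter slice = ParquetMetadataConverter.range(0L, 128L * 1024 * 1024);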
Example 10: getBatchSchema
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
@Override
public BatchSchema getBatchSchema(final FileSelection selection, final FileSystemWrapper fs) {
  final SabotContext context = ((ParquetFormatPlugin) formatPlugin).getContext();
  try (
      BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
      OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
      SampleMutator mutator = new SampleMutator(context)
  ) {
    final Optional<FileStatus> firstFileO = selection.getFirstFile();
    if (!firstFileO.isPresent()) {
      throw UserException.dataReadError().message("Unable to find any files for datasets.").build(logger);
    }
    final FileStatus firstFile = firstFileO.get();
    final ParquetMetadata footer = ParquetFileReader.readFooter(fsPlugin.getFsConf(), firstFile, ParquetMetadataConverter.NO_FILTER);
    final ParquetReaderUtility.DateCorruptionStatus dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS,
        ((ParquetFormatPlugin) formatPlugin).getConfig().autoCorrectCorruptDates);
    final boolean readInt96AsTimeStamp = operatorContext.getOptions().getOption(PARQUET_READER_INT96_AS_TIMESTAMP).bool_val;
    final ImplicitFilesystemColumnFinder finder = new ImplicitFilesystemColumnFinder(context.getOptionManager(), fs, GroupScan.ALL_COLUMNS);
    try (RecordReader reader =
        new AdditionalColumnsRecordReader(
            new ParquetRowiseReader(operatorContext, footer, 0, firstFile.getPath().toString(), GroupScan.ALL_COLUMNS, fs, dateStatus, readInt96AsTimeStamp, true),
            finder.getImplicitFieldsForSample(selection))) {
      reader.setup(mutator);
      mutator.allocate(100);
      // TODO DX-3873: remove the next() call here. We need this for now since we don't populate inner list types until next.
      reader.next();
      mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
      return mutator.getContainer().getSchema();
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
Example 11: ParquetRowReader
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
public ParquetRowReader(Configuration configuration, Path filePath, ReadSupport<T> readSupport) throws IOException {
  this.filePath = filePath;
  ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, filePath, ParquetMetadataConverter.NO_FILTER);
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
  this.fileSchema = fileMetadata.getSchema();
  Map<String, String> keyValueMetadata = fileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(keyValueMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(fileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata.getKeyValueMetaData(), fileSchema, readContext);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, fileMetadata, filePath, blocks, columns);
  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.total = total;
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  logger.info("ParquetRowReader initialized; will read a total of " + total + " records.");
}
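The constructor only prepares state; a read loop to go with it might look like the following sketch, modeled on parquet-mr's InternalParquetRecordReader. The method name readAll and the handle callback are hypothetical:

// Hypothetical read loop: materialize records row group by row group.
private void readAll() throws IOException {
  PageReadStore pages;
  while ((pages = reader.readNextRowGroup()) != null) {
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema);
    RecordReader<T> recordReader = columnIO.getRecordReader(pages, recordConverter);
    for (long row = 0, rows = pages.getRowCount(); row < rows; row++) {
      handle(recordReader.read()); // `handle` stands in for whatever consumes each record
    }
  }
}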
Example 12: load
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
public ITable load() {
  try {
    Configuration conf = new Configuration();
    System.setProperty("hadoop.home.dir", "/");
    conf.set("hadoop.security.authentication", "simple");
    conf.set("hadoop.security.authorization", "false");
    Path path = new Path(this.filename);
    ParquetMetadata md = ParquetFileReader.readFooter(conf, path,
        ParquetMetadataConverter.NO_FILTER);
    MessageType schema = md.getFileMetaData().getSchema();
    ParquetFileReader r = new ParquetFileReader(conf, path, md);
    IAppendableColumn[] cols = this.createColumns(md);
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    PageReadStore pages;
    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      RecordReader<Group> recordReader = columnIO.getRecordReader(
          pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        Group g = recordReader.read();
        appendGroup(cols, g, md.getFileMetaData().getSchema().getColumns());
      }
    }
    for (IAppendableColumn c : cols) {
      c.seal();
    }
    return new Table(cols);
  } catch (IOException ex) {
    throw new RuntimeException(ex);
  }
}
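appendGroup is not shown above; a hypothetical version illustrating how Group values are usually pulled out (getFieldRepetitionCount and getValueToString are part of org.apache.parquet.example.data.Group, while appendMissing and append are assumed methods on the project's own IAppendableColumn, and the sketch assumes cols aligns index-for-index with the schema fields):

// Hypothetical appendGroup: copy one record's values into the appendable columns.
private static void appendGroup(IAppendableColumn[] cols, Group g, List<ColumnDescriptor> columns) {
  for (int field = 0; field < cols.length; field++) {
    if (g.getFieldRepetitionCount(field) == 0) {
      cols[field].appendMissing();                        // assumed null-handling method
    } else {
      cols[field].append(g.getValueToString(field, 0));   // assumed append(String) method
    }
  }
}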
Example 13: parseChunk
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
@Override
protected final ParseWriter parseChunk(int cidx, ParseReader din, ParseWriter dout) {
  if (!(din instanceof FVecParseReader)) {
    // TODO: Should we modify the interface to expose the underlying chunk for non-streaming parsers?
    throw new IllegalStateException("We only accept parser readers backed by a Vec (no streaming support!).");
  }
  Chunk chunk = ((FVecParseReader) din).getChunk();
  Vec vec = chunk.vec();
  // extract metadata; we want to read only the row groups that have centers in this chunk
  ParquetMetadataConverter.MetadataFilter chunkFilter = ParquetMetadataConverter.range(
      chunk.start(), chunk.start() + chunk.len());
  ParquetMetadata metadata = VecParquetReader.readFooter(_metadata, chunkFilter);
  if (metadata.getBlocks().isEmpty()) {
    Log.trace("Chunk #", cidx, " doesn't contain any Parquet block center.");
    return dout;
  }
  Log.info("Processing ", metadata.getBlocks().size(), " blocks of chunk #", cidx);
  VecParquetReader reader = new VecParquetReader(vec, metadata, dout, _setup.getColumnTypes());
  try {
    Integer recordNumber;
    do {
      recordNumber = reader.read();
    } while (recordNumber != null);
  } catch (IOException e) {
    throw new RuntimeException("Failed to parse records", e);
  }
  long skipped = reader.getInvalidRecordCount();
  if (skipped > 0) {
    dout.addError(new ParseWriter.ParseErr("Some records were corrupt (skipped " + skipped + ").", cidx, -1, chunk.start()));
  }
  return dout;
}
Example 14: correctTypeConversions
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
/**
 * Overrides unsupported type conversions/mappings specified by the user.
 * @param vec byte vec holding binary parquet data
 * @param requestedTypes user-specified target types
 * @return corrected types
 */
public static byte[] correctTypeConversions(ByteVec vec, byte[] requestedTypes) {
  byte[] metadataBytes = VecParquetReader.readFooterAsBytes(vec);
  ParquetMetadata metadata = VecParquetReader.readFooter(metadataBytes, ParquetMetadataConverter.NO_FILTER);
  byte[] roughTypes = roughGuessTypes(metadata.getFileMetaData().getSchema());
  return correctTypeConversions(roughTypes, requestedTypes);
}
Example 15: readFooter
import org.apache.parquet.format.converter.ParquetMetadataConverter; // import the required package/class
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f) throws IOException {
  ParquetMetadataConverter converter = new ParquetMetadataConverter(options);
  return readFooter(file, options, f, converter);
}
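This private helper is usually reached through the public InputFile-based API rather than called directly. A minimal sketch, assuming HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile) and ParquetFileReader.open from parquet-hadoop 1.10+; the path is hypothetical:

// Minimal sketch using the newer InputFile-based API.
InputFile inputFile = HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration()); // hypothetical path
ParquetReadOptions options = ParquetReadOptions.builder()
    .withMetadataFilter(ParquetMetadataConverter.SKIP_ROW_GROUPS) // footer only, no row-group metadata
    .build();
try (ParquetFileReader reader = ParquetFileReader.open(inputFile, options)) {
  System.out.println(reader.getFileMetaData().getSchema());
}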