This article collects typical usage examples of the Java class parquet.hadoop.metadata.ParquetMetadata. If you are wondering what exactly the ParquetMetadata class does, how to use it, or where to find usage examples, the curated code below may help.
The ParquetMetadata class belongs to the parquet.hadoop.metadata package. Fifteen code examples of the class are shown below, ordered by popularity.
Example 1: getRowGroupNumbersFromFileSplit
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long splitStart = split.getStart();
  final long splitLength = split.getLength();
  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }
  return rowGroupNums;
}
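For context, the self-contained sketch below (class name and command-line argument are placeholders, not part of the original code) reads a footer the same way and prints each row group's first data page offset and row count, which are exactly the values the helper above compares against the split boundaries.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupOffsetsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // NO_FILTER keeps the metadata of every row group in the footer.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, new Path(args[0]), ParquetMetadataConverter.NO_FILTER);
    int i = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
      System.out.println("row group " + i++ + ": firstDataPageOffset=" + firstDataPage
          + ", rows=" + block.getRowCount());
    }
  }
}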
Example 2: getSchema
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
@Override
public DatasetJsonRecord getSchema(Path path) throws IOException {
  DatasetJsonRecord record = null;
  if (!fs.exists(path))
    LOG.error("file path : {} not in hdfs", path);
  else {
    try {
      ParquetMetadata readFooter = ParquetFileReader.readFooter(fs.getConf(), path, ParquetMetadataConverter.NO_FILTER);
      Map<String, String> schema = readFooter.getFileMetaData().getKeyValueMetaData();
      String allFields = schema.get("org.apache.spark.sql.parquet.row.metadata");
      FileStatus status = fs.getFileStatus(path);
      String storage = STORAGE_TYPE;
      String abstractPath = path.toUri().getPath();
      String codec = "parquet.codec";
      record = new DatasetJsonRecord(allFields, abstractPath, status.getModificationTime(), status.getOwner(), status.getGroup(),
          status.getPermission().toString(), codec, storage, "");
      LOG.info("parquetfileanalyzer parse path :{}, schema is {}", path.toUri().getPath(), record.toCsvString());
    } catch (Exception e) {
      LOG.error("path : {} content is not Parquet File format content", path.toUri().getPath());
      // log the exception itself; getStackTrace().toString() only prints the array reference
      LOG.info("parse exception", e);
    }
  }
  return record;
}
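The core of this example is the footer's key/value metadata. A pared-down, self-contained sketch of just that lookup (class name and file path are placeholders; the method returns null when the key is absent):

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class SparkSchemaSketch {
  public static String readSparkSchema(String file) throws Exception {
    Configuration conf = new Configuration();
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, new Path(file), ParquetMetadataConverter.NO_FILTER);
    Map<String, String> keyValue = footer.getFileMetaData().getKeyValueMetaData();
    // Spark stores its schema JSON under this key when it wrote the file
    return keyValue.get("org.apache.spark.sql.parquet.row.metadata");
  }
}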
Example 3: ParquetRecordReader
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
public ParquetRecordReader(
    FragmentContext fragmentContext,
    long batchSize,
    String path,
    int rowGroupIndex,
    FileSystem fs,
    DirectCodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns) throws ExecutionSetupException {
  this.hadoopPath = new Path(path);
  this.fileSystem = fs;
  this.codecFactory = codecFactory;
  this.rowGroupIndex = rowGroupIndex;
  this.batchSize = batchSize;
  this.footer = footer;
  this.fragmentContext = fragmentContext;
  setColumns(columns);
}
Example 4: add
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
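Since add(...) relies on counters and an overload defined elsewhere in its class, the self-contained sketch below (class name is hypothetical) walks the same per-column chunk metadata and simply prints it:

import parquet.column.ColumnDescriptor;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class ColumnStatsSketch {
  public static void print(ParquetMetadata footer) {
    MessageType schema = footer.getFileMetaData().getSchema();
    for (BlockMetaData block : footer.getBlocks()) {
      for (ColumnChunkMetaData column : block.getColumns()) {
        // Resolve the column descriptor from its dotted path, as the helper above does.
        ColumnDescriptor desc = schema.getColumnDescription(column.getPath().toArray());
        System.out.println(desc + ": values=" + column.getValueCount()
            + ", compressed=" + column.getTotalSize()
            + ", uncompressed=" + column.getTotalUncompressedSize()
            + ", encodings=" + column.getEncodings());
      }
    }
  }
}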
Example 5: mergeFooters
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
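mergeFooters(...) consumes the Footer list that ParquetFileReader can produce for a directory. A self-contained sketch of gathering that input (class name and path argument are placeholders; the skipRowGroups=false argument is my reading of the readFooters overload used in Example 7):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.Footer;
import parquet.hadoop.ParquetFileReader;

public class FooterListingSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path(args[0]);
    FileSystem fs = root.getFileSystem(conf);
    FileStatus status = fs.getFileStatus(root);
    // keep row-group (block) metadata so it stays available for merging
    List<Footer> footers = ParquetFileReader.readFooters(conf, status, false);
    for (Footer footer : footers) {
      System.out.println(footer.getFile() + ": "
          + footer.getParquetMetadata().getBlocks().size() + " row group(s)");
    }
  }
}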
Example 6: toParquetMetadata
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  int numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);
  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  return fileMetaData;
}
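Separately from the Thrift conversion above, ParquetMetadata also ships static toJSON/toPrettyJSON helpers for dumping a footer as JSON. Assuming those helpers are present in this parquet-mr version (class name and path argument below are placeholders), a quick inspection sketch looks like:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class FooterJsonSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(args[0]));
    // toPrettyJSON is a static helper on ParquetMetadata (assumed available here)
    System.out.println(ParquetMetadata.toPrettyJSON(footer));
  }
}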
Example 7: readSchema
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
@Override
public SchemaDescription readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    FileSystem fs = FileSystem.get( new URI( file ), conf );
    FileStatus fileStatus = fs.getFileStatus( new Path( file ) );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new SchemaDescription();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.createSchemaDescription( schema );
    }
  } );
}
Example 8: convertParquetSchemaToKettleWithTwoValidRows
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
@Test
public void convertParquetSchemaToKettleWithTwoValidRows() throws Exception {
  int pentahoValueMetaTypeFirstRow = 2;
  boolean allowNullFirstRow = false;
  int pentahoValueMetaTypeSecondRow = 5;
  boolean allowNullSecondRow = false;
  String expectedKettleSchema = ParquetUtils
      .createSchema( pentahoValueMetaTypeFirstRow, allowNullFirstRow, pentahoValueMetaTypeSecondRow,
          allowNullSecondRow ).marshall();
  urlTestResources = Thread.currentThread().getContextClassLoader().getResource( PARQUET_FILE );
  ConfigurationProxy conf = new ConfigurationProxy();
  conf.set( "fs.defaultFS", "file:///" );
  ParquetMetadata meta = ParquetFileReader
      .readFooter( conf, new Path( Paths.get( urlTestResources.toURI() ).toString() ),
          ParquetMetadataConverter.NO_FILTER );
  MessageType schema = meta.getFileMetaData().getSchema();
  SchemaDescription kettleSchema = ParquetConverter.createSchemaDescription( schema );
  String marshallKettleSchema = kettleSchema.marshall();
  Assert.assertEquals( marshallKettleSchema, expectedKettleSchema );
}
Example 9: execute
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();
  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
Example 10: getMetadatas
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
  ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
  for (int i = 0; i < fileStatuses.length; ++i)
  {
    res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
  }
  return res;
}
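A typical driver for such an array aggregates statistics across every file in a directory. The self-contained sketch below (class name and the total-row-count report are illustrative) performs the same per-file footer reads inline and sums the row counts:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class DirectoryRowCountSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path(args[0]);
    FileSystem fs = dir.getFileSystem(conf);
    // one footer per data file; in real use you would filter out hidden/_SUCCESS files
    FileStatus[] statuses = fs.listStatus(dir);
    long totalRows = 0;
    for (FileStatus status : statuses) {
      ParquetMetadata footer =
          ParquetFileReader.readFooter(conf, status.getPath(), ParquetMetadataConverter.NO_FILTER);
      for (BlockMetaData block : footer.getBlocks()) {
        totalRows += block.getRowCount();
      }
    }
    System.out.println("total rows: " + totalRows);
  }
}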
Example 11: DrillParquetReader
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
public DrillParquetReader(FragmentContext fragmentContext, ParquetMetadata footer, RowGroupReadEntry entry,
    List<SchemaPath> columns, DrillFileSystem fileSystem) {
  this.footer = footer;
  this.fileSystem = fileSystem;
  this.entry = entry;
  setColumns(columns);
  this.fragmentContext = fragmentContext;
  fillLevelCheckFrequency = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_CHECK_THRESHOLD).num_val.intValue();
  fillLevelCheckThreshold = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_THRESHOLD).num_val.intValue();
}
Example 12: readFooter
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing its length.
 * This should reduce the number of seek/read round trips in most workloads.
 * @param config the Hadoop configuration used to resolve the file system
 * @param status the status of the Parquet file to read
 * @return the parsed footer
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try (FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
      footerBytes = new byte[size];
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
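For comparison, the stock reader first reads the small footer-length tail and then seeks again to read the footer itself, which is the extra round trip the single-read variant above tries to avoid. A sketch of the standard call it replaces (class and method names are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.Footer;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class StockFooterReadSketch {
  public static Footer readFooterTheStandardWay(Configuration conf, Path path) throws Exception {
    // Library-managed footer read: length lookup plus a second read of the footer bytes.
    ParquetMetadata metadata =
        ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
    return new Footer(path, metadata);
  }
}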
Example 13: isComplex
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
private static boolean isComplex(ParquetMetadata footer) {
  MessageType schema = footer.getFileMetaData().getSchema();
  for (Type type : schema.getFields()) {
    if (!type.isPrimitive()) {
      return true;
    }
  }
  for (ColumnDescriptor col : schema.getColumns()) {
    if (col.getMaxRepetitionLevel() > 0) {
      return true;
    }
  }
  return false;
}
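A quick way to see what isComplex(...) inspects is to print the footer's schema; nested groups and repeated fields in that printout are exactly what the two loops above detect. A self-contained sketch (class name and path argument are placeholders):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class SchemaPrintSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(args[0]));
    MessageType schema = footer.getFileMetaData().getSchema();
    // MessageType prints as a readable "message { ... }" schema definition.
    System.out.println(schema);
  }
}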
Example 14: validateFooters
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());
  for (Footer footer : metadata) {
    final File file = new File(footer.getFile().toUri());
    assertTrue(file.getName(), file.getName().startsWith("part"));
    assertTrue(file.getPath(), file.exists());
    final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
    assertEquals(2, parquetMetadata.getBlocks().size());
    final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", keyValueMetaData.get("foo"));
    assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
  }
}
Example 15: getDatasetDescriptorFromParquetFile
import parquet.hadoop.metadata.ParquetMetadata; // import the dependent package/class
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least one.
    if (files.size() > 0) {
      break;
    }
  }
  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
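Kite DatasetDescriptor plumbing aside, the core of this last example is turning the footer's Parquet schema into an Avro schema. A self-contained sketch of just that step (class name and file path are placeholders):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.avro.AvroSchemaConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class ParquetToAvroSchemaSketch {
  public static Schema readAvroSchema(String file) throws Exception {
    Configuration conf = new Configuration();
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(file));
    MessageType parquetSchema = footer.getFileMetaData().getSchema();
    // AvroSchemaConverter maps the Parquet MessageType onto an equivalent Avro record schema.
    return new AvroSchemaConverter().convert(parquetSchema);
  }
}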