This article collects typical usage examples of the Java class parquet.hadoop.ParquetFileReader: what the class is for, how it is used, and what real-world code that calls it looks like.
ParquetFileReader belongs to the parquet.hadoop package. Twelve code examples are shown below, ordered by popularity.
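Before the project-specific examples, here is a minimal, self-contained sketch of the most common pattern on this page: reading a file's footer and printing its schema. It assumes the pre-Apache parquet-mr artifact (the parquet.hadoop package, roughly the 1.6.x line) and a hypothetical file path (/tmp/example.parquet); it is an illustrative sketch, not one of the collected examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class ReadFooterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.parquet"); // hypothetical path
    // NO_FILTER reads the complete footer, including row-group metadata
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}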
Example 1: getSchema
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public DatasetJsonRecord getSchema(Path path) throws IOException {
  DatasetJsonRecord record = null;
  if (!fs.exists(path))
    LOG.error("file path : {} not in hdfs", path);
  else {
    try {
      ParquetMetadata readFooter = ParquetFileReader.readFooter(fs.getConf(), path, ParquetMetadataConverter.NO_FILTER);
      Map<String, String> schema = readFooter.getFileMetaData().getKeyValueMetaData();
      String allFields = schema.get("org.apache.spark.sql.parquet.row.metadata");
      FileStatus status = fs.getFileStatus(path);
      String storage = STORAGE_TYPE;
      String abstractPath = path.toUri().getPath();
      String codec = "parquet.codec";
      record = new DatasetJsonRecord(allFields, abstractPath, status.getModificationTime(), status.getOwner(), status.getGroup(),
          status.getPermission().toString(), codec, storage, "");
      LOG.info("parquetfileanalyzer parse path :{},schema is {}", path.toUri().getPath(), record.toCsvString());
    } catch (Exception e) {
      LOG.error("path : {} content " + " is not Parquet File format content ", path.toUri().getPath());
      LOG.info(e.getStackTrace().toString());
    }
  }
  return record;
}
Example 2: readSchema
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public SchemaDescription readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    FileSystem fs = FileSystem.get( new URI( file ), conf );
    FileStatus fileStatus = fs.getFileStatus( new Path( file ) );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new SchemaDescription();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.createSchemaDescription( schema );
    }
  } );
}
Example 3: convertParquetSchemaToKettleWithTwoValidRows
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Test
public void convertParquetSchemaToKettleWithTwoValidRows() throws Exception {
  int pentahoValueMetaTypeFirstRow = 2;
  boolean allowNullFirstRow = false;
  int pentahoValueMetaTypeSecondRow = 5;
  boolean allowNullSecondRow = false;
  String expectedKettleSchema = ParquetUtils
      .createSchema( pentahoValueMetaTypeFirstRow, allowNullFirstRow, pentahoValueMetaTypeSecondRow,
          allowNullSecondRow ).marshall();
  urlTestResources = Thread.currentThread().getContextClassLoader().getResource( PARQUET_FILE );
  ConfigurationProxy conf = new ConfigurationProxy();
  conf.set( "fs.defaultFS", "file:///" );
  ParquetMetadata meta = ParquetFileReader
      .readFooter( conf, new Path( Paths.get( urlTestResources.toURI() ).toString() ),
          ParquetMetadataConverter.NO_FILTER );
  MessageType schema = meta.getFileMetaData().getSchema();
  SchemaDescription kettleSchema = ParquetConverter.createSchemaDescription( schema );
  String marshallKettleSchema = kettleSchema.marshall();
  Assert.assertEquals( marshallKettleSchema, expectedKettleSchema );
}
Example 4: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();
  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
Example 5: getMetadatas
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
  ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
  for (int i = 0; i < fileStatuses.length; ++i)
  {
    res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
  }
  return res;
}
Example 6: getFooters
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : fs.listStatus(status.getPath(), new DrillPathFilter())) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
Example 7: getDatasetDescriptorFromParquetFile
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least one.
    if (files.size() > 0) {
      break;
    }
  }
  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
Example 8: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  String[] args = options.getArgs();
  String input = args[0];
  Configuration conf = new Configuration();
  Path inpath = new Path(input);
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();
  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withAutoCrop()
      .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
      .withColumnPadding(1)
      .withMaxBufferedLines(1000000)
      .withFlushOnTab()
      .build();
  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');
  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }
  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example 9: ParquetFileMetadata
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException
{
  // NO_FILTER is presumably statically imported from ParquetMetadataConverter in the original source
  this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
Example 10: execute
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
  boolean printColumns = true;
  List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
  List<Column> columns = new ArrayList<Column>();
  for (int i = 0; i < fileStatuses.length; ++i)
  {
    FileStatus status = fileStatuses[i];
    ParquetMetadata metadata = metadatas[i];
    MessageType schema = metadata.getFileMetaData().getSchema();
    List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();
    for (String columnName : columnNames)
    {
      int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
      ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);
      columnDescriptors.add(descriptor);
      if (printColumns)
      {
        Column column = new Column();
        column.setIndex(fieldIndex);
        column.setName(schema.getFieldName(column.getIndex()));
        column.setDescriptor(descriptor);
        columns.add(column);
      }
    }
    printColumns = false;
    readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
  }
  long time = System.currentTimeMillis();
  long rowCount = 0;
  long rowGroupCount = 0;
  long readerCount = readers.size();
  for (ParquetFileReader reader : readers)
  {
    PageReadStore pageReadStore;
    while ((pageReadStore = reader.readNextRowGroup()) != null)
    {
      rowGroupCount++;
      rowCount += pageReadStore.getRowCount();
    }
    reader.close();
  }
  LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis() - time);
  return metrics;
}
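Example 10 mixes the ParquetFileReader row-group loop with project-specific bookkeeping (Column, LocalMetrics). For reference, a stripped-down sketch of just the counting loop, under the same assumptions as the sketch at the top of this article (pre-Apache parquet-mr; the file path comes from the command line and is not taken from the original project):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupCountSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path(args[0]); // Parquet file to inspect
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    // Request every column; a real caller would usually narrow this list, as Example 10 does.
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    ParquetFileReader reader = new ParquetFileReader(conf, file, footer.getBlocks(), columns);
    long rows = 0;
    long rowGroups = 0;
    PageReadStore pages;
    while ((pages = reader.readNextRowGroup()) != null) {
      rowGroups++;
      rows += pages.getRowCount();
    }
    reader.close();
    System.out.println(rowGroups + " row groups, " + rows + " rows");
  }
}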
Example 11: testPerformance
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
@Test
@Ignore
public void testPerformance(@Injectable final DrillbitContext bitContext,
                            @Injectable UserServer.UserClientConnection connection) throws Exception {
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContext context = new FragmentContext(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  // new NonStrictExpectations() {
  //   {
  //     context.getAllocator(); result = BufferAllocator.getAllocator(DrillConfig.create());
  //   }
  // };
  final String fileName = "/tmp/parquet_test_performance.parquet";
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  //generateParquetFile(fileName, props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, new Path(fileName));
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    final ParquetRecordReader rr = new ParquetRecordReader(context, 256000, fileName, 0, fs,
        new DirectCodecFactory(dfsConfig, allocator), f.getParquetMetadata(), columns);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = new Stopwatch();
    watch.start();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    System.out.println(String.format("Time completed: %s. ", watch.elapsed(TimeUnit.MILLISECONDS)));
    rr.close();
  }
  allocator.close();
  System.out.println(String.format("Total row count %s", totalRowCount));
}
Example 12: run
import parquet.hadoop.ParquetFileReader; // import the package/class this example depends on
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
  }
  String inputFile = args[0];
  String outputFile = args[1];
  String compression = (args.length > 2) ? args[2] : "none";
  Path parquetFilePath = null;
  // Find a file in case a directory was passed
  RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
  while (it.hasNext()) {
    FileStatus fs = it.next();
    if (fs.isFile()) {
      parquetFilePath = fs.getPath();
      break;
    }
  }
  if (parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
  }
  LOG.info("Getting schema from " + parquetFilePath);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  LOG.info(schema);
  GroupWriteSupport.setSchema(schema, getConf());
  Job job = new Job(getConf());
  job.setJarByClass(getClass());
  job.setJobName(getClass().getName());
  job.setMapperClass(ReadRequestMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(ExampleInputFormat.class);
  job.setOutputFormatClass(ExampleOutputFormat.class);
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  if (compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
  } else if (compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
  }
  LOG.info("Output compression: " + codec);
  ExampleOutputFormat.setCompression(job, codec);
  FileInputFormat.setInputPaths(job, new Path(inputFile));
  FileOutputFormat.setOutputPath(job, new Path(outputFile));
  job.waitForCompletion(true);
  return 0;
}