This article collects typical usage examples of the Java class org.apache.parquet.io.MessageColumnIO. If you have been wondering what MessageColumnIO is for, how to use it, or where to find examples of it, the curated class code examples below should help.
The MessageColumnIO class belongs to the org.apache.parquet.io package. 15 code examples of the class are shown below, sorted by popularity by default.
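Before the individual examples, here is a minimal sketch of the class's two entry points: getRecordWriter(...) turns a ColumnWriteStore into a RecordConsumer for writing records, and getRecordReader(...) assembles records back from a PageReadStore. The sketch is illustrative only: it reuses MemPageStore from parquet-column's test artifact (as Examples 11 and 13 below do), the demo schema and field names are made up, and the ColumnWriteStoreV1 constructor follows the form used in Example 11 (signatures vary across parquet-mr versions).

import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.page.mem.MemPageStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.GroupWriter;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MessageColumnIODemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message demo { required int32 id; required binary name (UTF8); }");

    // MemPageStore acts as both the page write store and the page read store
    // for this in-memory round trip.
    MemPageStore pageStore = new MemPageStore(1);
    ColumnWriteStoreV1 writeStore = new ColumnWriteStoreV1(pageStore,
        ParquetProperties.builder().withPageSize(1024).withDictionaryEncoding(false).build());

    // Write side: MessageColumnIO turns the column write store into a RecordConsumer.
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    RecordConsumer recordConsumer = columnIO.getRecordWriter(writeStore);
    Group record = new SimpleGroupFactory(schema).newGroup()
        .append("id", 1)
        .append("name", "parquet");
    new GroupWriter(recordConsumer, schema).write(record);
    recordConsumer.flush();
    writeStore.flush();

    // Read side: the same MessageColumnIO assembles records back from the pages.
    RecordReader<Group> reader =
        columnIO.getRecordReader(pageStore, new GroupRecordConverter(schema));
    System.out.println(reader.read());
  }
}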
Example 1: validateSameTupleAsEB
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
/**
 * Steps:
 * <ul>
 *   <li>Writes using the thrift mapping</li>
 *   <li>Reads using the pig mapping</li>
 *   <li>Uses Elephant Bird to convert from thrift to pig</li>
 *   <li>Checks that both transformations give the same result</li>
 * </ul>
 * @param o the object to convert
 * @throws TException if writing the Thrift object fails
 */
public static <T extends TBase<?,?>> void validateSameTupleAsEB(T o) throws TException {
final ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter();
@SuppressWarnings("unchecked")
final Class<T> class1 = (Class<T>) o.getClass();
final MessageType schema = thriftSchemaConverter.convert(class1);
final StructType structType = ThriftSchemaConverter.toStructType(class1);
final ThriftToPig<T> thriftToPig = new ThriftToPig<T>(class1);
final Schema pigSchema = thriftToPig.toSchema();
final TupleRecordMaterializer tupleRecordConverter = new TupleRecordMaterializer(schema, pigSchema, true);
RecordConsumer recordConsumer = new ConverterConsumer(tupleRecordConverter.getRootConverter(), schema);
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
ParquetWriteProtocol p = new ParquetWriteProtocol(new RecordConsumerLoggingWrapper(recordConsumer), columnIO, structType);
o.write(p);
final Tuple t = tupleRecordConverter.getCurrentRecord();
final Tuple expected = thriftToPig.getPigTuple(o);
assertEquals(expected.toString(), t.toString());
final MessageType filtered = new PigSchemaConverter().filter(schema, pigSchema);
assertEquals(schema.toString(), filtered.toString());
}
Example 2: read
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
System.out.println(message);
MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
TupleReadSupport tupleReadSupport = new TupleReadSupport();
Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
// TODO: put this back
// if (DEBUG) {
// recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
// }
read(recordReader, 10000, pigSchemaString);
read(recordReader, 10000, pigSchemaString);
read(recordReader, 10000, pigSchemaString);
read(recordReader, 10000, pigSchemaString);
read(recordReader, 10000, pigSchemaString);
read(recordReader, 100000, pigSchemaString);
read(recordReader, 1000000, pigSchemaString);
System.out.println();
}
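The helper newColumnFactory(pigSchemaString) used above (and again in Example 13) is not shown on this page. A plausible reconstruction, assuming it simply converts the Pig schema string to a Parquet MessageType and builds the column IO from it, could look like this:

import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;

// Hypothetical reconstruction of newColumnFactory(...): convert the Pig schema string
// to a Parquet MessageType, then build the MessageColumnIO from it.
private static MessageColumnIO newColumnFactory(String pigSchemaString) throws ParserException {
  MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
  return new ColumnIOFactory().getColumnIO(schema);
}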
Example 3: newSchema
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void newSchema() throws IOException {
// Reset it to half of current number and bound it within the limits
recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
String json = new Schema(batchSchema).toJson();
extraMetaData.put(DREMIO_ARROW_SCHEMA, json);
List<Type> types = Lists.newArrayList();
for (Field field : batchSchema) {
if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
continue;
}
Type childType = getType(field);
if (childType != null) {
types.add(childType);
}
}
Preconditions.checkState(types.size() > 0, "No types for parquet schema");
schema = new MessageType("root", types);
int dictionarySize = (int)context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR);
final ParquetProperties parquetProperties = new ParquetProperties(dictionarySize, writerVersion, enableDictionary,
new ParquetDirectByteBufferAllocator(columnEncoderAllocator), pageSize, true, enableDictionaryForBinary);
pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, parquetProperties);
store = new ColumnWriteStoreV1(pageStore, pageSize, parquetProperties);
MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
consumer = columnIO.getRecordWriter(store);
setUp(schema, consumer);
}
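The RecordConsumer obtained from columnIO.getRecordWriter(store) is then driven through Parquet's event-style API. The following sketch is not Dremio's writer code; it only illustrates, for an assumed schema of the form "message root { required int32 id; optional binary name (UTF8); }", what those events look like for a single record:

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;

// Illustrative only; the field names and indexes match the assumed sketch schema above.
void writeOneRecord(RecordConsumer consumer, int id, String name) {
  consumer.startMessage();

  consumer.startField("id", 0);
  consumer.addInteger(id);
  consumer.endField("id", 0);

  if (name != null) {                          // optional field: emit nothing when absent
    consumer.startField("name", 1);
    consumer.addBinary(Binary.fromString(name));
    consumer.endField("name", 1);
  }

  consumer.endMessage();
}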
Example 4: checkRead
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void checkRead() throws IOException
{
if (current == totalCountLoadedSoFar) {
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
}
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
totalCountLoadedSoFar += pages.getRowCount();
++ currentBlock;
}
}
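checkRead() only swaps in the next row group; the records themselves are pulled from recordReader by the surrounding read loop. A hedged sketch of that loop, modelled on parquet-mr's InternalParquetRecordReader (T is the materialized record type, and the fields current, total and recordReader are taken from the snippet above; the real class also handles records dropped by the filter):

// Sketch only: simplified from the real reader.
public T nextRecord() throws IOException {
  if (current >= total) {
    return null;                       // no rows left in the file
  }
  checkRead();                         // load the next row group if the current one is exhausted
  T record = recordReader.read();      // assemble one record from the column pages
  current++;
  return record;
}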
Example 5: load
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
public ITable load() {
try {
Configuration conf = new Configuration();
System.setProperty("hadoop.home.dir", "/");
conf.set("hadoop.security.authentication", "simple");
conf.set("hadoop.security.authorization", "false");
Path path = new Path(this.filename);
ParquetMetadata md = ParquetFileReader.readFooter(conf, path,
ParquetMetadataConverter.NO_FILTER);
MessageType schema = md.getFileMetaData().getSchema();
ParquetFileReader r = new ParquetFileReader(conf, path, md);
IAppendableColumn[] cols = this.createColumns(md);
MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
PageReadStore pages;
while (null != (pages = r.readNextRowGroup())) {
final long rows = pages.getRowCount();
RecordReader<Group> recordReader = columnIO.getRecordReader(
pages, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
Group g = recordReader.read();
appendGroup(cols, g, md.getFileMetaData().getSchema().getColumns());
}
}
for (IAppendableColumn c: cols)
c.seal();
return new Table(cols);
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
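The appendGroup(...) helper used in the loop above is not shown. The sketch below covers only the Parquet side of what such a helper has to do: check, per column, whether the row has a value in the Group and pull it out with the type-specific getter. Feeding the result into hillview's IAppendableColumn objects is omitted, and only a few primitive types are handled.

import java.util.List;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.example.data.Group;

// Sketch only: extracts one row's values from a Group, column by column.
private static Object[] groupToRow(Group g, List<ColumnDescriptor> columns) {
  Object[] row = new Object[columns.size()];
  for (int field = 0; field < columns.size(); field++) {
    if (g.getFieldRepetitionCount(field) == 0) {
      row[field] = null;                             // value missing for this row
      continue;
    }
    switch (columns.get(field).getType()) {          // primitive type of the column
      case INT32:  row[field] = g.getInteger(field, 0); break;
      case INT64:  row[field] = g.getLong(field, 0);    break;
      case DOUBLE: row[field] = g.getDouble(field, 0);  break;
      case BINARY: row[field] = g.getString(field, 0);  break;
      default: throw new IllegalArgumentException("type not handled in this sketch");
    }
  }
  return row;
}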
Example 6: newSchema
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void newSchema() throws IOException {
List<Type> types = Lists.newArrayList();
for (MaterializedField field : batchSchema) {
if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
continue;
}
types.add(getType(field));
}
schema = new MessageType("root", types);
// We don't want this number to be too small; ideally the block is divided equally across the columns,
// although it is unlikely that all columns end up the same size.
// The row group size (blockSize) is a long, but its value is likely below Integer.MAX_VALUE (2GB),
// so the result is cast to int: the byte-array allocation in the underlying layer
// limits array sizes to the int range.
int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5);
// We don't want this number to be too small either. Ideally, slightly bigger than the page size,
// but not bigger than the block buffer
int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
// TODO: Use initialSlabSize from ParquetProperties once drill will be updated to the latest version of Parquet library
int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
// TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
// once PARQUET-1006 will be resolved
pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
pageSize, new ParquetDirectByteBufferAllocator(oContext));
store = new ColumnWriteStoreV1(pageStore, pageSize, initialPageBufferSize, enableDictionary,
writerVersion, new ParquetDirectByteBufferAllocator(oContext));
MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
consumer = columnIO.getRecordWriter(store);
setUp(schema, consumer);
}
Example 7: checkRead
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void checkRead() throws IOException {
if (current == totalCountLoadedSoFar) {
if (current != 0) {
totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
if (Log.DEBUG) {
LOG.debug("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
if (totalTime != 0) {
final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
LOG.debug("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
}
}
}
if (Log.DEBUG) LOG.debug("at row " + current + ". reading next block");
long t0 = System.currentTimeMillis();
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
}
long timeSpentReading = System.currentTimeMillis() - t0;
totalTimeSpentReadingBytes += timeSpentReading;
BenchmarkCounter.incrementTime(timeSpentReading);
if (Log.INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
startedAssemblingCurrentBlockAt = System.currentTimeMillis();
totalCountLoadedSoFar += pages.getRowCount();
++ currentBlock;
}
}
Example 8: checkRead
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void checkRead() throws IOException {
if (current == totalCountLoadedSoFar) {
if (current != 0) {
totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
if (Log.isLoggingFor("info")) {
Log.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
if (totalTime != 0) {
final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
Log.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
}
}
}
Log.info("at row " + current + ". reading next block");
long t0 = System.currentTimeMillis();
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
}
long timeSpentReading = System.currentTimeMillis() - t0;
totalTimeSpentReadingBytes += timeSpentReading;
if (Log.isLoggingFor("info")) Log.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
if (Log.isLoggingFor("debug")) Log.debug("initializing Record assembly with requested schema " + requestedSchema);
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
startedAssemblingCurrentBlockAt = System.currentTimeMillis();
totalCountLoadedSoFar += pages.getRowCount();
++ currentBlock;
}
}
Example 9: checkRead
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void checkRead() throws IOException {
if (current == totalCountLoadedSoFar) {
if (current != 0) {
totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
if (LOG.isInfoEnabled()) {
LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
if (totalTime != 0) {
final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
LOG.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
}
}
}
LOG.info("at row " + current + ". reading next block");
long t0 = System.currentTimeMillis();
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
}
long timeSpentReading = System.currentTimeMillis() - t0;
totalTimeSpentReadingBytes += timeSpentReading;
BenchmarkCounter.incrementTime(timeSpentReading);
if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
recordReader = columnIO.getRecordReader(pages, recordConverter,
filterRecords ? filter : FilterCompat.NOOP);
startedAssemblingCurrentBlockAt = System.currentTimeMillis();
totalCountLoadedSoFar += pages.getRowCount();
++ currentBlock;
}
}
Example 10: initStore
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void initStore() {
pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator());
columnStore = props.newColumnWriteStore(schema, pageStore);
MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
this.recordConsumer = columnIO.getRecordWriter(columnStore);
writeSupport.prepareForWrite(recordConsumer);
}
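Once initStore() has wired the page store, column store and RecordConsumer together, each incoming record flows through the WriteSupport. A hedged sketch of that per-record path, modelled on parquet-mr's InternalParquetRecordWriter (checkBlockSizeReached() stands for the row-group size check that eventually flushes and calls initStore() again):

// Sketch only: the WriteSupport translates the record into RecordConsumer events.
public void write(T value) throws IOException {
  writeSupport.write(value);     // emits startMessage()/addXxx()/endMessage() on recordConsumer
  recordCount++;
  checkBlockSizeReached();       // may flush the current row group and re-run initStore()
}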
Example 11: validate
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private <T extends TBase<?,?>> void validate(T expected) throws TException {
@SuppressWarnings("unchecked")
final Class<T> thriftClass = (Class<T>)expected.getClass();
final MemPageStore memPageStore = new MemPageStore(1);
final ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
final MessageType schema = schemaConverter.convert(thriftClass);
LOG.info("{}", schema);
final MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
final ColumnWriteStoreV1 columns = new ColumnWriteStoreV1(memPageStore,
ParquetProperties.builder()
.withPageSize(10000)
.withDictionaryEncoding(false)
.build());
final RecordConsumer recordWriter = columnIO.getRecordWriter(columns);
final StructType thriftType = schemaConverter.toStructType(thriftClass);
ParquetWriteProtocol parquetWriteProtocol = new ParquetWriteProtocol(recordWriter, columnIO, thriftType);
expected.write(parquetWriteProtocol);
recordWriter.flush();
columns.flush();
ThriftRecordConverter<T> converter = new TBaseRecordConverter<T>(thriftClass, schema, thriftType);
final RecordReader<T> recordReader = columnIO.getRecordReader(memPageStore, converter);
final T result = recordReader.read();
assertEquals(expected, result);
}
Example 12: validateThrift
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private void validateThrift(String[] expectations, TBase<?, ?> a)
throws TException {
final ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter();
// System.out.println(a);
final Class<TBase<?,?>> class1 = (Class<TBase<?,?>>)a.getClass();
final MessageType schema = thriftSchemaConverter.convert(class1);
LOG.info("{}", schema);
final StructType structType = thriftSchemaConverter.toStructType(class1);
ExpectationValidatingRecordConsumer recordConsumer = new ExpectationValidatingRecordConsumer(new ArrayDeque<String>(Arrays.asList(expectations)));
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
ParquetWriteProtocol p = new ParquetWriteProtocol(new RecordConsumerLoggingWrapper(recordConsumer), columnIO, structType);
a.write(p);
}
Example 13: write
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
private static void write(MemPageStore memPageStore, ColumnWriteStoreV1 columns, MessageType schema, String pigSchemaString) throws ExecException, ParserException {
MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
TupleWriteSupport tupleWriter = TupleWriteSupport.fromPigSchema(pigSchemaString);
tupleWriter.init(null);
tupleWriter.prepareForWrite(columnIO.getRecordWriter(columns));
write(memPageStore, tupleWriter, 10000);
write(memPageStore, tupleWriter, 10000);
write(memPageStore, tupleWriter, 10000);
write(memPageStore, tupleWriter, 10000);
write(memPageStore, tupleWriter, 10000);
write(memPageStore, tupleWriter, 100000);
write(memPageStore, tupleWriter, 1000000);
System.out.println();
}
Example 14: prepareForWrite
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
}
Example 15: prepareForWrite
import org.apache.parquet.io.MessageColumnIO; // import the required package/class
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
thriftWriteSupport.prepareForWrite(recordConsumer);
}
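In both variants, the ParquetWriteProtocol prepared here is what the later write(...) call drives: the Thrift object serializes itself through the protocol, which maps the Thrift write calls onto the RecordConsumer via the MessageColumnIO structure. A typical companion method, sketched after parquet-thrift's write support (the exact exception wrapping is an assumption of this sketch):

// Sketch only: T is a TBase<?,?> Thrift type.
@Override
public void write(T record) {
  try {
    record.write(parquetWriteProtocol);              // the Thrift object drives the protocol
  } catch (TException e) {
    throw new ParquetEncodingException("could not write Thrift record", e);
  }
}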