This article collects typical usage examples of the Java class org.apache.parquet.hadoop.CodecFactory. If you are wondering what CodecFactory is for and how to use it, the selected code samples below may help.
CodecFactory belongs to the org.apache.parquet.hadoop package. Fifteen code examples of the class are shown below, ordered by popularity.
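Before looking at the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical CodecFactory life cycle: create a factory, obtain a compressor and a decompressor for a codec, and release the pooled resources when finished. It assumes a parquet-hadoop version (roughly 1.8+) in which createDirectCodecFactory and the BytesCompressor/BytesDecompressor inner classes are available, and that the GZIP codec is usable in your environment.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.DirectByteBufferAllocator;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecFactoryRoundTrip {
  public static void main(String[] args) throws IOException {
    // The direct factory requires a direct allocator; 64 KB is an arbitrary page size hint
    // (several examples below simply pass 0).
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(
        new Configuration(), new DirectByteBufferAllocator(), 64 * 1024);
    try {
      final byte[] raw = "hello parquet codec factory".getBytes("UTF-8");

      // Compress a small payload with GZIP ...
      final CodecFactory.BytesCompressor compressor =
          codecFactory.getCompressor(CompressionCodecName.GZIP);
      final byte[] compressed = compressor.compress(BytesInput.from(raw)).toByteArray();

      // ... and decompress it again; the caller must supply the uncompressed size.
      final CodecFactory.BytesDecompressor decompressor =
          codecFactory.getDecompressor(CompressionCodecName.GZIP);
      final byte[] restored =
          decompressor.decompress(BytesInput.from(compressed), raw.length).toByteArray();

      System.out.println(new String(restored, "UTF-8"));
    } finally {
      codecFactory.release(); // return pooled compressors/decompressors
    }
  }
}

Most of the examples below pass a page-size hint of 0 and an engine-specific allocator such as ParquetDirectByteBufferAllocator; the sketch uses parquet's own DirectByteBufferAllocator instead so that it compiles on its own.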
Example 1: readDictionaries
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }
  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
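For illustration, a hypothetical caller of the helper above might look like the sketch below; the fs, allocator, and file path variables are made up for this example (Example 7 shows the same pattern inside a main method), and releasing the factory afterwards is left to the caller.

// Hypothetical usage; fs, allocator and the path are illustrative only.
final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(
    fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
try {
  final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
      readDictionaries(fs, new Path("/data/example.parquet"), codecFactory);
  for (Map.Entry<ColumnDescriptor, Dictionary> entry : result.getLeft().entrySet()) {
    // Dictionary ids run from 0 to getMaxId(), so the entry count is getMaxId() + 1.
    System.out.println(entry.getKey() + " -> " + (entry.getValue().getMaxId() + 1) + " dictionary entries");
  }
  System.out.println("Not dictionary encoded: " + result.getRight());
} finally {
  codecFactory.release(); // return pooled compressors/decompressors
}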
Example 2: DeprecatedParquetVectorizedReader
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public DeprecatedParquetVectorizedReader(
    OperatorContext operatorContext,
    String path,
    int rowGroupIndex,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus,
    boolean readInt96AsTimeStamp,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryColumns,
    GlobalDictionaries globalDictionaries) throws ExecutionSetupException {
  super(operatorContext, columns);
  this.hadoopPath = new Path(path);
  this.fileSystem = fs;
  this.codecFactory = codecFactory;
  this.rowGroupIndex = rowGroupIndex;
  this.footer = footer;
  this.dateCorruptionStatus = dateCorruptionStatus;
  this.readInt96AsTimeStamp = readInt96AsTimeStamp;
  this.globalDictionaryColumns = globalDictionaryColumns == null ? Collections.<String, GlobalDictionaryFieldInfo>emptyMap() : globalDictionaryColumns;
  this.globalDictionaries = globalDictionaries;
  this.singleInputStream = null;
}
Example 3: getReaders
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
@Override
public List<RecordReader> getReaders(UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  final DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer,
    unifiedReader.columnsInGroupScan, unifiedReader.autoCorrectCorruptDates);
  List<RecordReader> returnList = new ArrayList<>();
  returnList.add(unifiedReader.addFilterIfNecessary(
    new DeprecatedParquetVectorizedReader(
      unifiedReader.context,
      unifiedReader.readEntry.getPath(), unifiedReader.readEntry.getRowGroupIndex(), unifiedReader.fs,
      CodecFactory.createDirectCodecFactory(
        unifiedReader.fs.getConf(),
        new ParquetDirectByteBufferAllocator(unifiedReader.context.getAllocator()), 0),
      footer,
      unifiedReader.realFields,
      containsCorruptDates,
      unifiedReader.readInt96AsTimeStamp,
      unifiedReader.globalDictionaryFieldInfoMap,
      unifiedReader.dictionaries
    )
  ));
  return returnList;
}
Example 4: testLocalDictionaries
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries1 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook1.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries2 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook2.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries3 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook3.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries4 =
      LocalDictionariesReader.readDictionaries(fs, new Path(partitionDirPath, "phonebook4.parquet"), codecFactory);
    assertEquals(2, dictionaries1.getKey().size()); // name and kind have dictionaries
    assertEquals(1, dictionaries2.getKey().size());
    assertEquals(1, dictionaries3.getKey().size());
    assertEquals(1, dictionaries4.getKey().size());
    assertEquals(0, dictionaries1.getValue().size());
    assertEquals(1, dictionaries2.getValue().size()); // skip name
    assertEquals(1, dictionaries3.getValue().size()); // skip name
    assertEquals(1, dictionaries4.getValue().size()); // skip name
  }
}
Example 5: getScanBatch
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
private RecordBatch getScanBatch() throws Exception {
  List<RecordReader> readers = Lists.newArrayList();
  for (String path : inputPaths) {
    ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), new Path(path));
    for (int i = 0; i < footer.getBlocks().size(); i++) {
      readers.add(new ParquetRecordReader(fragContext,
        path,
        i,
        fs,
        CodecFactory.createDirectCodecFactory(fs.getConf(),
          new ParquetDirectByteBufferAllocator(opContext.getAllocator()), 0),
        footer,
        columnsToRead,
        ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_NO_CORRUPTION));
    }
  }
  RecordBatch scanBatch = new ScanBatch(null, fragContext, readers);
  return scanBatch;
}
Example 6: readLocalDictionaries
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
private static Map<ColumnDescriptor, List<Dictionary>> readLocalDictionaries(FileSystem fs, FileStatus[] statuses, BufferAllocator allocator) throws IOException {
  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // These columns are not dictionary encoded in at least one file.
  final Map<ColumnDescriptor, List<Dictionary>> allDictionaries = Maps.newHashMap();
  final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
  for (FileStatus status : statuses) {
    logger.debug("Scanning file {}", status.getPath());
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> localDictionaries = LocalDictionariesReader.readDictionaries(
      fs, status.getPath(), codecFactory);
    // Skip columns which are not dictionary encoded
    for (ColumnDescriptor skippedColumn : localDictionaries.getRight()) {
      columnsToSkip.add(skippedColumn);
      allDictionaries.remove(skippedColumn);
    }
    for (final Map.Entry<ColumnDescriptor, Dictionary> entry : localDictionaries.getLeft().entrySet()) {
      if (!columnsToSkip.contains(entry.getKey())) {
        if (allDictionaries.containsKey(entry.getKey())) {
          allDictionaries.get(entry.getKey()).add(entry.getValue());
        } else {
          allDictionaries.put(entry.getKey(), Lists.newArrayList(entry.getValue()));
        }
      }
    }
  }
  logger.debug("Skipping columns {}", columnsToSkip);
  return allDictionaries;
}
Example 7: main
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public static void main(String[] args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final FileSystem fs = FileSystem.getLocal(new Configuration());
    final Path filePath = new Path(args[0]);
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries = readDictionaries(fs, filePath, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry : dictionaries.getLeft().entrySet()) {
      printDictionary(entry.getKey(), entry.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + dictionaries.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
Example 8: newReader
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
RecordReader newReader(OperatorContext context,
                       List<SchemaPath> columns,
                       FileSystem fs,
                       String path,
                       CodecFactory codecFactory,
                       List<FilterCondition> conditions,
                       DateCorruptionStatus corruptionStatus,
                       boolean readInt96AsTimeStamp,
                       boolean enableDetailedTracing,
                       ParquetMetadata footer,
                       int rowGroupIndex,
                       SimpleIntVector deltas,
                       boolean useSingleStream);
Example 9: UnifiedParquetReader
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public UnifiedParquetReader(
    OperatorContext context,
    ParquetReaderFactory readerFactory,
    List<SchemaPath> realFields,
    List<SchemaPath> columnsInGroupScan,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryFieldInfoMap,
    List<FilterCondition> filterConditions,
    ParquetDatasetSplitXAttr readEntry,
    FileSystem fs,
    ParquetMetadata footer,
    GlobalDictionaries dictionaries,
    CodecFactory codecFactory,
    boolean autoCorrectCorruptDates,
    boolean readInt96AsTimeStamp,
    boolean vectorize,
    boolean enableDetailedTracing) {
  super();
  this.context = context;
  this.readerFactory = readerFactory;
  this.columnsInGroupScan = columnsInGroupScan;
  this.globalDictionaryFieldInfoMap = globalDictionaryFieldInfoMap;
  this.filterConditions = filterConditions;
  this.fs = fs;
  this.footer = footer;
  this.readEntry = readEntry;
  this.autoCorrectCorruptDates = autoCorrectCorruptDates;
  this.readInt96AsTimeStamp = readInt96AsTimeStamp;
  this.vectorize = vectorize;
  this.realFields = realFields;
  this.dictionaries = dictionaries;
  this.codecFactory = codecFactory;
  this.enableDetailedTracing = enableDetailedTracing;
  this.useSingleStream = context.getOptions().getOption(ExecConstants.PARQUET_SINGLE_STREAM);
}
Example 10: ParquetRecordReader
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public ParquetRecordReader(FragmentContext fragmentContext,
    String path,
    int rowGroupIndex,
    long numRecordsToRead,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) throws ExecutionSetupException {
  this(fragmentContext, DEFAULT_BATCH_LENGTH_IN_BITS, numRecordsToRead,
    path, rowGroupIndex, fs, codecFactory, footer, columns, dateCorruptionStatus);
}
Example 11: ParquetRecordWriter
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public ParquetRecordWriter(FragmentContext context, ParquetWriter writer) throws OutOfMemoryException {
  super();
  this.oContext = context.newOperatorContext(writer);
  this.codecFactory = CodecFactory.createDirectCodecFactory(writer.getFormatPlugin().getFsConf(),
    new ParquetDirectByteBufferAllocator(oContext.getAllocator()), pageSize);
  this.partitionColumns = writer.getPartitionColumns();
  this.hasPartitions = partitionColumns != null && partitionColumns.size() > 0;
  this.extraMetaData.put(DRILL_VERSION_PROPERTY, DrillVersionInfo.getVersion());
  this.extraMetaData.put(WRITER_VERSION_PROPERTY, String.valueOf(ParquetWriter.WRITER_VERSION));
  this.storageStrategy = writer.getStorageStrategy() == null ? StorageStrategy.DEFAULT : writer.getStorageStrategy();
  this.cleanUpLocations = Lists.newArrayList();
}
Example 12: getCodecFactory
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public CodecFactory getCodecFactory() {
  return codecFactory;
}
Example 13: create
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
@Override
public ProducerOperator create(FragmentExecutionContext fragmentExecContext, final OperatorContext context, final ParquetSubScan config) throws ExecutionSetupException {
  final FileSystemStoragePlugin2 registry = (FileSystemStoragePlugin2) fragmentExecContext.getStoragePlugin(config.getPluginId());
  final FileSystemPlugin fsPlugin = registry.getFsPlugin();
  final FileSystemWrapper fs = registry.getFs();
  final Configuration conf = fsPlugin.getFsConf();
  conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
  conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
  conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
  final Stopwatch watch = Stopwatch.createStarted();
  boolean isAccelerator = config.getPluginId().getName().equals("__accelerator");
  final ParquetReaderFactory readerFactory = UnifiedParquetReader.getReaderFactory(context.getConfig());
  // TODO (AH) Fix implicit columns with mod time and global dictionaries
  final ImplicitFilesystemColumnFinder finder = new ImplicitFilesystemColumnFinder(context.getOptions(), fs, config.getColumns(), isAccelerator);
  // load global dictionaries, globalDictionaries must be closed by the last reader
  final GlobalDictionaries globalDictionaries = GlobalDictionaries.create(context, fs, config.getGlobalDictionaryEncodedColumns());
  final boolean vectorize = context.getOptions().getOption(ExecConstants.PARQUET_READER_VECTORIZE);
  final boolean autoCorrectCorruptDates = ((ParquetFileConfig) FileFormat.getForFile(config.getFormatSettings())).getAutoCorrectCorruptDates();
  final boolean readInt96AsTimeStamp = context.getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val;
  final boolean enableDetailedTracing = context.getOptions().getOption(ExecConstants.ENABLED_PARQUET_TRACING);
  final CodecFactory codec = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(context.getAllocator()), 0);
  final Map<String, GlobalDictionaryFieldInfo> globalDictionaryEncodedColumns = Maps.newHashMap();
  if (globalDictionaries != null) {
    for (GlobalDictionaryFieldInfo fieldInfo : config.getGlobalDictionaryEncodedColumns()) {
      globalDictionaryEncodedColumns.put(fieldInfo.getFieldName(), fieldInfo);
    }
  }
  final CompositeReaderConfig readerConfig = CompositeReaderConfig.getCompound(config.getSchema(), config.getColumns(), config.getPartitionColumns());
  final List<ParquetDatasetSplit> sortedSplits = Lists.newArrayList();
  final SingletonParquetFooterCache footerCache = new SingletonParquetFooterCache();
  for (DatasetSplit split : config.getSplits()) {
    sortedSplits.add(new ParquetDatasetSplit(split));
  }
  Collections.sort(sortedSplits);
  FluentIterable<RecordReader> readers = FluentIterable.from(sortedSplits).transform(new Function<ParquetDatasetSplit, RecordReader>() {
    @Override
    public RecordReader apply(ParquetDatasetSplit split) {
      final UnifiedParquetReader inner = new UnifiedParquetReader(
        context,
        readerFactory,
        finder.getRealFields(),
        config.getColumns(),
        globalDictionaryEncodedColumns,
        config.getConditions(),
        split.getSplitXAttr(),
        fs,
        footerCache.getFooter(fs, new Path(split.getSplitXAttr().getPath())),
        globalDictionaries,
        codec,
        autoCorrectCorruptDates,
        readInt96AsTimeStamp,
        vectorize,
        enableDetailedTracing
      );
      return readerConfig.wrapIfNecessary(context.getAllocator(), inner, split.getDatasetSplit());
    }
  });
  final ScanOperator scan = new ScanOperator(fragmentExecContext.getSchemaUpdater(), config, context, readers.iterator(), globalDictionaries);
  logger.debug("Took {} ms to create Parquet Scan SqlOperatorImpl.", watch.elapsed(TimeUnit.MILLISECONDS));
  return scan;
}
Example 14: ParquetRecordWriter
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config) throws OutOfMemoryException {
  this.conf = new Configuration(writer.getFsConf());
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(this.conf,
    new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");
  this.proxyUserUGI = ImpersonationUtil.createProxyUgi(writer.getUserName());
  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());
  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }
  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
}
Example 15: testPerformance
import org.apache.parquet.hadoop.CodecFactory; // import the required package/class
@Test
@Ignore
public void testPerformance(@Injectable final DrillbitContext bitContext,
                            @Injectable UserClientConnection connection) throws Exception {
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContext context = new FragmentContext(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  // new NonStrictExpectations() {
  //   {
  //     context.getAllocator(); result = BufferAllocator.getAllocator(DrillConfig.create());
  //   }
  // };
  final String fileName = "/tmp/parquet_test_performance.parquet";
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  //generateParquetFile(fileName, props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, new Path(fileName));
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs,
      CodecFactory.createDirectCodecFactory(dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0),
      f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = Stopwatch.createStarted();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    System.out.println(String.format("Time completed: %s. ", watch.elapsed(TimeUnit.MILLISECONDS)));
    rr.close();
  }
  allocator.close();
  System.out.println(String.format("Total row count %s", totalRowCount));
}