This article collects typical usage examples of the Java class org.apache.parquet.column.ColumnDescriptor. If you are wondering what the ColumnDescriptor class is for, how to use it, or what working examples look like, the curated code samples below may help.
The ColumnDescriptor class belongs to the org.apache.parquet.column package. A total of 15 code examples of the class are shown below, sorted by popularity by default.
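Before the individual examples, here is a minimal, self-contained sketch (not taken from the examples below; the class name and command-line argument are placeholders) of where ColumnDescriptor instances usually come from: the MessageType schema carried in a Parquet file's footer.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class ColumnDescriptorTour {
  public static void main(String[] args) throws Exception {
    // read only the footer; NO_FILTER keeps metadata for every row group
    final Path file = new Path(args[0]);
    final ParquetMetadata footer =
        ParquetFileReader.readFooter(new Configuration(), file, ParquetMetadataConverter.NO_FILTER);
    final MessageType schema = footer.getFileMetaData().getSchema();
    // the schema exposes one ColumnDescriptor per leaf (primitive) column
    final List<ColumnDescriptor> columns = schema.getColumns();
    for (ColumnDescriptor column : columns) {
      System.out.println(String.join(".", column.getPath())
          + " type=" + column.getType()
          + " maxDefinitionLevel=" + column.getMaxDefinitionLevel()
          + " maxRepetitionLevel=" + column.getMaxRepetitionLevel());
    }
  }
}
Examples 4, 8, and 9 below obtain their ColumnDescriptor lists in exactly this way before doing per-column work.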
Example 1: addColumn
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
final FSDataInputStream in;
if (useSingleStream) {
if (streams.isEmpty()) {
in = fs.open(path);
streams.add(in);
} else {
in = streams.get(0);
}
in.seek(metaData.getStartingPos());
columns.put(descriptor, new SingleStreamColumnChunkIncPageReader(metaData, descriptor, in));
} else {
// create new stream per column
in = fs.open(path);
streams.add(in);
in.seek(metaData.getStartingPos());
columns.put(descriptor, new ColumnChunkIncPageReader(metaData, descriptor, in));
}
}
Example 2: createGlobalDictionaries
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
* Builds global dictionaries for a parquet table's BINARY or FIXED_LEN_BYTE_ARRAY columns.
* It will remove existing dictionaries if present and create new ones.
* @param fs filesystem
* @param tableDir root directory for given table that has parquet files
* @param bufferAllocator memory allocator
* @return GlobalDictionariesInfo that has dictionary version, root path and columns along with path to dictionary files.
* @throws IOException
*/
public static GlobalDictionariesInfo createGlobalDictionaries(FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
final FileStatus[] statuses = fs.listStatus(tableDir, PARQUET_FILES_FILTER);
final Map<ColumnDescriptor, Path> globalDictionaries = Maps.newHashMap();
final Map<ColumnDescriptor, List<Dictionary>> allDictionaries = readLocalDictionaries(fs, statuses, bufferAllocator);
final long dictionaryVersion = getDictionaryVersion(fs, tableDir) + 1;
final Path tmpDictionaryRootDir = createTempRootDir(fs, tableDir, dictionaryVersion);
logger.debug("Building global dictionaries for columns {} with version {}", allDictionaries.keySet(), dictionaryVersion);
// Sort all local dictionaries and write them to dictionary files, with an index if needed
for (Map.Entry<ColumnDescriptor, List<Dictionary>> entry : allDictionaries.entrySet()) {
final ColumnDescriptor columnDescriptor = entry.getKey();
final Path dictionaryFile = dictionaryFilePath(tmpDictionaryRootDir, columnDescriptor);
logger.debug("Creating a new global dictionary for {} with version {}", columnDescriptor.toString(), dictionaryVersion);
createDictionaryFile(fs, dictionaryFile, columnDescriptor, entry.getValue(), null, bufferAllocator);
globalDictionaries.put(columnDescriptor, dictionaryFile);
}
final Path finalDictionaryRootDir = createDictionaryVersionedRootPath(fs, tableDir, dictionaryVersion, tmpDictionaryRootDir);
return new GlobalDictionariesInfo(globalDictionaries, finalDictionaryRootDir, dictionaryVersion);
}
Example 3: main
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public static void main(String []args) {
try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
final Path tableDir = new Path(args[0]);
final FileSystem fs = tableDir.getFileSystem(new Configuration());
if (fs.exists(tableDir) && fs.isDirectory(tableDir)) {
Map<ColumnDescriptor, Path> dictionaryEncodedColumns = createGlobalDictionaries(fs, tableDir, bufferAllocator).getColumnsToDictionaryFiles();
long version = getDictionaryVersion(fs, tableDir);
Path dictionaryRootDir = getDictionaryVersionedRootPath(fs, tableDir, version);
for (ColumnDescriptor columnDescriptor: dictionaryEncodedColumns.keySet()) {
final VectorContainer data = readDictionary(fs, dictionaryRootDir, columnDescriptor, bufferAllocator);
System.out.println("Dictionary for column [" + columnDescriptor.toString() + " size " + data.getRecordCount());
BatchPrinter.printBatch(data);
data.clear();
}
}
} catch (IOException ioe) {
logger.error("Failed ", ioe);
}
}
Example 4: readDictionaries
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
* Returns the per-row-group dictionaries for all binary columns in the given parquet file.
* @param fs filesystem object.
* @param filePath parquet file to scan
* @return pair of (dictionaries found for binary fields, set of binary fields that are not dictionary encoded).
* @throws IOException
*/
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
if (parquetMetadata.getBlocks().size() > 1) {
throw new IOException(
format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
parquetMetadata.getBlocks().size(), filePath));
}
final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
}
final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
try(final FSDataInputStream in = fs.open(filePath)) {
for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
if (isBinaryType(columnChunkMetaData.getType())) {
final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
// if first page is dictionary encoded then load dictionary, otherwise skip this column.
final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
} else {
columnsToSkip.add(column);
}
}
}
}
return new ImmutablePair<>(dictionaries, columnsToSkip);
}
Example 5: convertColumnDescriptor
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
* Converts {@link ColumnDescriptor} to {@link SchemaPath} and converts any parquet LOGICAL LIST to something
* the execution engine can understand (removes the extra 'list' and 'element' fields from the name)
*/
private static SchemaPath convertColumnDescriptor(final MessageType schema, final ColumnDescriptor columnDescriptor) {
List<String> path = Lists.newArrayList(columnDescriptor.getPath());
// go through the path and find all logical lists
int index = 0;
Type type = schema;
while (!type.isPrimitive()) { // don't bother checking the last element in the path as it is a primitive type
type = type.asGroupType().getType(path.get(index));
if (type.getOriginalType() == OriginalType.LIST && LogicalListL1Converter.isSupportedSchema(type.asGroupType())) {
// remove 'list'
type = type.asGroupType().getType(path.get(index+1));
path.remove(index+1);
// remove 'element'
type = type.asGroupType().getType(path.get(index+1));
path.remove(index+1);
}
index++;
}
String[] schemaColDesc = new String[path.size()];
path.toArray(schemaColDesc);
return SchemaPath.getCompoundPath(schemaColDesc);
}
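As a concrete illustration of the list handling above, this hedged sketch (the schema and class name are made up for demonstration) prints the raw leaf path of a standard three-level LIST column; convertColumnDescriptor would strip the intermediate 'list' and 'element' segments and keep only the outer field name.
import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ListPathDemo {
  public static void main(String[] args) {
    // standard three-level LIST layout: outer field, repeated 'list' group, inner 'element'
    final MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { optional group tags (LIST) { repeated group list { optional binary element (UTF8); } } }");
    final ColumnDescriptor tags = schema.getColumns().get(0);
    // the raw leaf path keeps every group level: [tags, list, element]
    System.out.println(Arrays.toString(tags.getPath()));
    // convertColumnDescriptor above would reduce this to the single-segment path [tags]
  }
}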
Example 6: ColumnReader
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
this.parentReader = parentReader;
this.columnDescriptor = descriptor;
this.columnChunkMetaData = columnChunkMetaData;
this.isFixedLength = fixedLength;
this.schemaElement = schemaElement;
this.valueVec = v;
this.pageReader = (parentReader.getSingleStream() != null)?
new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getHadoopPath(), columnChunkMetaData) :
new PageReader(this, parentReader.getFileSystem(), parentReader.getHadoopPath(), columnChunkMetaData);
if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
} else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
&& (valueVec instanceof TimeStampMilliVector || valueVec instanceof NullableTimeStampMilliVector)) {
// if an int96 column is being read as a Timestamp, this truncates the timestamp format used by Impala.
// dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
// needs to be set to the bit width of the resulting Arrow type; usually this matches the input length
dataTypeLengthInBits = 64;
} else {
dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
}
}
}
Example 7: testLocalDictionaries
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
@Test
public void testLocalDictionaries() throws IOException {
try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries1 =
LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook1.parquet"), codecFactory);
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries2 =
LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook2.parquet"), codecFactory);
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries3 =
LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook3.parquet"), codecFactory);
Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries4 =
LocalDictionariesReader.readDictionaries(fs, new Path(partitionDirPath, "phonebook4.parquet"), codecFactory);
assertEquals(2, dictionaries1.getKey().size()); // name and kind have dictionaries
assertEquals(1, dictionaries2.getKey().size());
assertEquals(1, dictionaries3.getKey().size());
assertEquals(1, dictionaries4.getKey().size());
assertEquals(0, dictionaries1.getValue().size());
assertEquals(1, dictionaries2.getValue().size()); // skip name
assertEquals(1, dictionaries3.getValue().size()); // skip name
assertEquals(1, dictionaries4.getValue().size()); // skip name
}
}
Example 8: read
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
@Test
public void read(String fileName) throws IOException
{
Path path = new Path(fileName);
Configuration conf = new Configuration();
conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
PageReadStore pageReadStore;
PageReader pageReader;
DataPage page;
while ((pageReadStore = reader.readNextRowGroup()) != null) {
for (ColumnDescriptor cd: metadata.getFileMetaData().getSchema().getColumns()) {
pageReader = pageReadStore.getPageReader(cd);
page = pageReader.readPage();
}
}
}
Example 9: loadParquetSchema
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
* Scan the Parquet footer, then map each Parquet column to the list of columns
* we want to read. Track those to be read.
*/
private void loadParquetSchema() {
// TODO - figure out how to deal with this better once we add nested reading, note also look where this map is used below
// store a map from column name to converted types if they are non-null
Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
// loop to add up the length of the fixed width columns and build the schema
for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
ParquetColumnMetadata columnMetadata = new ParquetColumnMetadata(column);
columnMetadata.resolveDrillType(schemaElements, options);
if (! fieldSelected(columnMetadata.field)) {
continue;
}
selectedColumnMetadata.add(columnMetadata);
}
}
Example 10: initialize
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void initialize(FileMetaData parquetFileMetadata,
Path file, List<BlockMetaData> blocks, Configuration configuration)
throws IOException {
// initialize a ReadContext for this file
Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
configuration, toSetMultiMap(fileMetadata), fileSchema));
this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
this.requestedSchema = readContext.getRequestedSchema();
this.fileSchema = parquetFileMetadata.getSchema();
this.file = file;
this.columnCount = requestedSchema.getPaths().size();
this.recordConverter = readSupport.prepareForRead(
configuration, fileMetadata, fileSchema, readContext);
this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
List<ColumnDescriptor> columns = requestedSchema.getColumns();
reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 11: initialize
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void initialize(MessageType fileSchema,
FileMetaData parquetFileMetadata,
Path file, List<BlockMetaData> blocks, Configuration configuration)
throws IOException {
// initialize a ReadContext for this file
Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
configuration, toSetMultiMap(fileMetadata), fileSchema));
this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
this.requestedSchema = readContext.getRequestedSchema();
this.fileSchema = fileSchema;
this.file = file;
this.columnCount = requestedSchema.getPaths().size();
this.recordConverter = readSupport.prepareForRead(
configuration, fileMetadata, fileSchema, readContext);
this.strictTypeChecking = true;
List<ColumnDescriptor> columns = requestedSchema.getColumns();
reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
Log.info("RecordReader initialized will read a total of " + total + " records.");
}
Example 12: showDetails
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
String name = Strings.repeat(".", depth) + type.getName();
OriginalType otype = type.getOriginalType();
Repetition rep = type.getRepetition();
PrimitiveTypeName ptype = type.getPrimitiveTypeName();
out.format("%s: %s %s", name, rep, ptype);
if (otype != null) out.format(" O:%s", otype);
if (container != null) {
cpath.add(type.getName());
String[] paths = cpath.toArray(new String[cpath.size()]);
cpath.remove(cpath.size() - 1);
ColumnDescriptor desc = container.getColumnDescription(paths);
int defl = desc.getMaxDefinitionLevel();
int repl = desc.getMaxRepetitionLevel();
out.format(" R:%d D:%d", repl, defl);
}
out.println();
}
Example 13: newValuesWriter
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
@Override
public ValuesWriter newValuesWriter(ColumnDescriptor descriptor) {
switch (descriptor.getType()) {
case BOOLEAN:
return getBooleanValuesWriter();
case FIXED_LEN_BYTE_ARRAY:
return getFixedLenByteArrayValuesWriter(descriptor);
case BINARY:
return getBinaryValuesWriter(descriptor);
case INT32:
return getInt32ValuesWriter(descriptor);
case INT64:
return getInt64ValuesWriter(descriptor);
case INT96:
return getInt96ValuesWriter(descriptor);
case DOUBLE:
return getDoubleValuesWriter(descriptor);
case FLOAT:
return getFloatValuesWriter(descriptor);
default:
throw new IllegalArgumentException("Unknown type " + descriptor.getType());
}
}
Example 14: dictionaryWriter
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
static DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, ParquetProperties properties, Encoding dictPageEncoding, Encoding dataPageEncoding) {
switch (path.getType()) {
case BOOLEAN:
throw new IllegalArgumentException("no dictionary encoding for BOOLEAN");
case BINARY:
return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
case INT32:
return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
case INT64:
return new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
case INT96:
return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), 12, dataPageEncoding, dictPageEncoding, properties.getAllocator());
case DOUBLE:
return new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
case FLOAT:
return new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
case FIXED_LEN_BYTE_ARRAY:
return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), path.getTypeLength(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
default:
throw new IllegalArgumentException("Unknown type " + path.getType());
}
}
Example 15: ColumnReaderImpl
import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
* creates a reader for triplets
* @param path the descriptor for the corresponding column
* @param pageReader the underlying store to read from
*/
public ColumnReaderImpl(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
this.path = checkNotNull(path, "path");
this.pageReader = checkNotNull(pageReader, "pageReader");
this.converter = checkNotNull(converter, "converter");
this.writerVersion = writerVersion;
DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
if (dictionaryPage != null) {
try {
this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
if (converter.hasDictionarySupport()) {
converter.setDictionary(dictionary);
}
} catch (IOException e) {
throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
}
} else {
this.dictionary = null;
}
this.totalValueCount = pageReader.getTotalValueCount();
if (totalValueCount <= 0) {
throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
}
consume();
}
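Note that application code does not usually call this constructor directly; in parquet-mr a reader is typically obtained per column via getColumnReader(ColumnDescriptor) on org.apache.parquet.column.ColumnReadStore, and the dictionary-page handling shown here is why a fresh reader is created for each column chunk rather than reused across columns.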