本文整理汇总了Java中org.apache.parquet.column.Dictionary类的典型用法代码示例。如果您正苦于以下问题：Java Dictionary类的具体用法？Java Dictionary怎么用？Java Dictionary使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
Dictionary类属于org.apache.parquet.column包，在下文中一共展示了Dictionary类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createGlobalDictionaries
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Builds a new version of the global dictionaries for a parquet table.
 * <p>
 * The local dictionaries of the files under {@code tableDir} accepted by {@code PARQUET_FILES_FILTER}
 * are read, one dictionary file per column is written below a temporary root directory, and that
 * directory is then passed to {@code createDictionaryVersionedRootPath} to obtain the final root.
 * The new version is the value returned by {@code getDictionaryVersion} plus one.
 *
 * @param fs filesystem
 * @param tableDir root directory of the table that holds the parquet files
 * @param bufferAllocator memory allocator
 * @return GlobalDictionariesInfo carrying the per-column dictionary file paths, the final dictionary root path and the dictionary version
 * @throws IOException propagated from the filesystem and dictionary helpers called here
 */
public static GlobalDictionariesInfo createGlobalDictionaries(FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
  final FileStatus[] parquetFiles = fs.listStatus(tableDir, PARQUET_FILES_FILTER);
  final Map<ColumnDescriptor, List<Dictionary>> localDictionariesByColumn = readLocalDictionaries(fs, parquetFiles, bufferAllocator);
  final long newVersion = getDictionaryVersion(fs, tableDir) + 1;
  final Path tmpRootDir = createTempRootDir(fs, tableDir, newVersion);
  logger.debug("Building global dictionaries for columns {} with version {}", localDictionariesByColumn.keySet(), newVersion);

  // One dictionary file per column, written below the temporary root.
  final Map<ColumnDescriptor, Path> dictionaryFiles = Maps.newHashMap();
  for (ColumnDescriptor column : localDictionariesByColumn.keySet()) {
    final Path columnFile = dictionaryFilePath(tmpRootDir, column);
    logger.debug("Creating a new global dictionary for {} with version {}", column.toString(), newVersion);
    createDictionaryFile(fs, columnFile, column, localDictionariesByColumn.get(column), null, bufferAllocator);
    dictionaryFiles.put(column, columnFile);
  }

  final Path versionedRootDir = createDictionaryVersionedRootPath(fs, tableDir, newVersion, tmpRootDir);
  return new GlobalDictionariesInfo(dictionaryFiles, versionedRootDir, newVersion);
}
示例2: readDictionaries
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Reads the dictionaries of the single row group of a parquet file, for the columns accepted by {@code isBinaryType}.
 * <p>
 * A column is reported as skipped when the first page of its column chunk is not a dictionary page.
 * A file that contains no row group yields an empty result rather than an unchecked exception.
 *
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory provides the decompressor matching each column chunk's codec
 * @return pair of dictionaries found for binary fields and set of binary fields which are not dictionary encoded.
 * @throws IOException if the file has more than one row group, or propagated from reading the file
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  if (parquetMetadata.getBlocks().isEmpty()) {
    // No row group means no column chunks: nothing to load and nothing to skip.
    // Without this guard get(0) below would throw IndexOutOfBoundsException.
    return new ImmutablePair<>(dictionaries, columnsToSkip);
  }

  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  // Column chunk metadata only exposes the column path, so index the schema's descriptors by path.
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  try(final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
示例3: testLocalDictionaries
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Reads the local dictionaries of the four phonebook files and checks, per file, how many
 * columns came back with a dictionary and how many were reported as skipped.
 */
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator allocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final CodecFactory codecs = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);

    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook1 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook1.parquet"), codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook2 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook2.parquet"), codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook3 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook3.parquet"), codecs);
    // The fourth file is read from the partition directory, not the table directory.
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook4 =
      LocalDictionariesReader.readDictionaries(fs, new Path(partitionDirPath, "phonebook4.parquet"), codecs);

    // Columns that came back with a dictionary.
    assertEquals(2, phonebook1.getKey().size()); // name and kind have dictionaries
    assertEquals(1, phonebook2.getKey().size());
    assertEquals(1, phonebook3.getKey().size());
    assertEquals(1, phonebook4.getKey().size());

    // Columns reported as not dictionary encoded.
    assertEquals(0, phonebook1.getValue().size());
    assertEquals(1, phonebook2.getValue().size()); // skip name
    assertEquals(1, phonebook3.getValue().size()); // skip name
    assertEquals(1, phonebook4.getValue().size()); // skip name
  }
}
示例4: readLocalDictionaries
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Collects, per column, the local dictionaries found in the given files.
 * <p>
 * A column reported as not dictionary encoded by any file is dropped from the result, and
 * dictionaries for it found in later files are ignored.
 *
 * @param fs filesystem the files are read from
 * @param statuses files to scan
 * @param allocator memory allocator backing the codec factory
 * @return map from column to the dictionaries gathered for it, one per file that had one
 * @throws IOException propagated from reading the dictionaries of a file
 */
private static Map<ColumnDescriptor, List<Dictionary>> readLocalDictionaries(FileSystem fs, FileStatus[] statuses, BufferAllocator allocator) throws IOException {
  final Set<ColumnDescriptor> excludedColumns = Sets.newHashSet(); // not dictionary encoded in at least one file
  final Map<ColumnDescriptor, List<Dictionary>> dictionariesByColumn = Maps.newHashMap();
  final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);

  for (FileStatus fileStatus : statuses) {
    logger.debug("Scanning file {}", fileStatus.getPath());
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> perFile =
      LocalDictionariesReader.readDictionaries(fs, fileStatus.getPath(), codecFactory);

    // Exclude the non dictionary encoded columns and discard what was gathered for them so far.
    excludedColumns.addAll(perFile.getRight());
    dictionariesByColumn.keySet().removeAll(perFile.getRight());

    for (final Map.Entry<ColumnDescriptor, Dictionary> found : perFile.getLeft().entrySet()) {
      final ColumnDescriptor column = found.getKey();
      if (excludedColumns.contains(column)) {
        continue;
      }
      List<Dictionary> gathered = dictionariesByColumn.get(column);
      if (gathered == null) {
        gathered = Lists.newArrayList();
        dictionariesByColumn.put(column, gathered);
      }
      gathered.add(found.getValue());
    }
  }

  logger.debug("Skipping columns {}", excludedColumns);
  return dictionariesByColumn;
}
示例5: buildIntegerGlobalDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges the int values of the given dictionaries, plus those of an optional existing
 * dictionary, into a single vector of distinct values in ascending order.
 *
 * @param dictionaries local dictionaries whose ids 0..getMaxId() are decoded as ints
 * @param existingDict previously built dictionary to merge in, or null if there is none
 * @param columnDescriptor column whose path gives the field its name
 * @param bufferAllocator allocator for the returned container
 * @return container holding one 32-bit signed int vector with the merged values
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field intField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Int(32, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final NullableIntVector target = container.addOrGet(intField);
  target.allocateNew();

  // The sorted set both removes duplicates and fixes the output order.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToInt(id));
    }
  }
  if (existingDict != null) {
    final NullableIntVector previous = existingDict.getValueAccessorById(NullableIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int written = 0;
  for (Integer value : merged) {
    target.getMutator().setSafe(written, value);
    written++;
  }
  target.getMutator().setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
示例6: buildLongGlobalDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges the long values of the given dictionaries, plus those of an optional existing
 * dictionary, into a single vector of distinct values in ascending order.
 *
 * @param dictionaries local dictionaries whose ids 0..getMaxId() are decoded as longs
 * @param existingDict previously built dictionary to merge in, or null if there is none
 * @param columnDescriptor column whose path gives the field its name
 * @param bufferAllocator allocator for the returned container
 * @return container holding one 64-bit signed int vector with the merged values
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field longField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Int(64, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final NullableBigIntVector target = container.addOrGet(longField);
  target.allocateNew();

  // The sorted set both removes duplicates and fixes the output order.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToLong(id));
    }
  }
  if (existingDict != null) {
    final NullableBigIntVector previous = existingDict.getValueAccessorById(NullableBigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int written = 0;
  for (Long value : merged) {
    target.getMutator().setSafe(written, value);
    written++;
  }
  target.getMutator().setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
示例7: buildDoubleGlobalDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges the double values of the given dictionaries, plus those of an optional existing
 * dictionary, into a single vector of distinct values in ascending order.
 *
 * @param dictionaries local dictionaries whose ids 0..getMaxId() are decoded as doubles
 * @param existingDict previously built dictionary to merge in, or null if there is none
 * @param columnDescriptor column whose path gives the field its name
 * @param bufferAllocator allocator for the returned container
 * @return container holding one double precision floating point vector with the merged values
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field doubleField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final NullableFloat8Vector target = container.addOrGet(doubleField);
  target.allocateNew();

  // The sorted set both removes duplicates and fixes the output order.
  final SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToDouble(id));
    }
  }
  if (existingDict != null) {
    final NullableFloat8Vector previous = existingDict.getValueAccessorById(NullableFloat8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int written = 0;
  for (Double value : merged) {
    target.getMutator().setSafe(written, value);
    written++;
  }
  target.getMutator().setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
示例8: buildFloatGlobalDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges the float values of the given dictionaries, plus those of an optional existing
 * dictionary, into a single vector of distinct values in ascending order.
 *
 * @param dictionaries local dictionaries whose ids 0..getMaxId() are decoded as floats
 * @param existingDict previously built dictionary to merge in, or null if there is none
 * @param columnDescriptor column whose path gives the field its name
 * @param bufferAllocator allocator for the returned container
 * @return container holding one single precision floating point vector with the merged values
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field floatField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final NullableFloat4Vector target = container.addOrGet(floatField);
  target.allocateNew();

  // The sorted set both removes duplicates and fixes the output order.
  final SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToFloat(id));
    }
  }
  if (existingDict != null) {
    final NullableFloat4Vector previous = existingDict.getValueAccessorById(NullableFloat4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int written = 0;
  for (Float value : merged) {
    target.getMutator().setSafe(written, value);
    written++;
  }
  target.getMutator().setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
示例9: buildBinaryGlobalDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges the binary values of the given dictionaries, plus those of an optional existing
 * dictionary, into a single vector of distinct values ordered by {@code Binary}'s natural ordering.
 *
 * @param dictionaries local dictionaries whose ids 0..getMaxId() are decoded as binaries
 * @param existingDict previously built dictionary to merge in, or null if there is none
 * @param columnDescriptor column whose path gives the field its name
 * @param bufferAllocator allocator for the returned container
 * @return container holding one variable length binary vector with the merged values
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field binaryField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Binary(), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final NullableVarBinaryVector target = container.addOrGet(binaryField);
  target.allocateNew();

  // The sorted set both removes duplicates and fixes the output order.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToBinary(id));
    }
  }
  if (existingDict != null) {
    final NullableVarBinaryVector previous = existingDict.getValueAccessorById(NullableVarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(Binary.fromConstantByteArray(previous.getAccessor().get(row)));
    }
  }

  int written = 0;
  for (Binary value : merged) {
    final byte[] bytes = value.getBytes();
    target.getMutator().setSafe(written, bytes, 0, bytes.length);
    written++;
  }
  target.getMutator().setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
示例10: readDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Reads the dictionary page described by {@code pageHeader} and initialises a dictionary from it.
 *
 * @param in open stream over the parquet file; it is repositioned to the offset reported by {@code pageHeader}
 * @param column column the dictionary belongs to
 * @param pageHeader page header of the dictionary page together with its offset
 * @param decompressor decompressor applied to the compressed page bytes
 * @return dictionary initialised from the decompressed page
 * @throws IOException if the stream ends before the whole compressed page has been read, or propagated from the stream or decompressor
 */
public static Dictionary readDictionary(FSDataInputStream in, ColumnDescriptor column, PageHeaderWithOffset pageHeader, BytesDecompressor decompressor) throws IOException {
  in.seek(pageHeader.getOffset());
  final byte[] data = new byte[pageHeader.getPageHeader().getCompressed_page_size()];
  // A single read() may return fewer bytes than requested without being at end of stream,
  // so keep reading until the buffer is full or the stream is exhausted.
  int read = 0;
  while (read < data.length) {
    final int count = in.read(data, read, data.length - read);
    if (count < 0) {
      break;
    }
    read += count;
  }
  if (read != data.length) {
    throw new IOException(format("Failed to read dictionary page, read %d bytes, expected %d", read, data.length));
  }
  final DictionaryPage dictionaryPage = new DictionaryPage(
    decompressor.decompress(BytesInput.from(data), pageHeader.getPageHeader().getUncompressed_page_size()),
    pageHeader.getPageHeader().getDictionary_page_header().getNum_values(),
    CONVERTER.getEncoding(pageHeader.getPageHeader().getDictionary_page_header().getEncoding()));
  return dictionaryPage.getEncoding().initDictionary(column, dictionaryPage);
}
示例11: main
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Prints the local dictionaries of the parquet file named by the first argument, followed by
 * the binary columns that are not dictionary encoded.
 *
 * @param args command line arguments; {@code args[0]} is the path of a parquet file on the local filesystem
 */
public static void main(String[] args) {
  if (args.length < 1) {
    // Without this check args[0] below would fail with ArrayIndexOutOfBoundsException.
    System.err.println("Expected one argument: path to a parquet file");
    return;
  }
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final FileSystem fs = FileSystem.getLocal(new Configuration());
    final Path filePath = new Path(args[0]);
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries = readDictionaries(fs, filePath, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry : dictionaries.getLeft().entrySet()) {
      printDictionary(entry.getKey(), entry.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + dictionaries.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
示例12: printDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Prints every entry of a dictionary to standard output as {@code id: value}, decoding the
 * value according to the column's primitive type.
 *
 * @param columnDescriptor column the dictionary belongs to; its type selects the decoder
 * @param localDictionary dictionary whose ids 0..getMaxId() are printed
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the highest valid id, so the bound is inclusive; '<' dropped the last entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // NOTE(review): the bytes are decoded with the platform default charset - confirm this is intended.
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        // Any other type: print nothing for this entry.
        break;
    }
  }
}
示例13: setDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front into a string value, indexed by dictionary id.
 *
 * @param dictionary dictionary whose ids 0..getMaxId() are decoded as binaries
 */
@Override
public void setDictionary(Dictionary dictionary)
{
    final int entryCount = dictionary.getMaxId() + 1;
    expandedDictionary = new Value[entryCount];
    for (int id = 0; id < entryCount; id++) {
        // getBytes() already hands back a copied array, so ValueFactory#newString need not copy again.
        final byte[] copied = dictionary.decodeToBinary(id).getBytes();
        expandedDictionary[id] = ValueFactory.newString(copied);
    }
}
示例14: setDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front into a decimal value, indexed by dictionary id.
 *
 * @param dictionary dictionary whose ids 0..getMaxId() are decoded as ints
 */
@Override
public void setDictionary(Dictionary dictionary)
{
    final int entryCount = dictionary.getMaxId() + 1;
    expandedDictionary = new Value[entryCount];
    for (int id = 0; id < entryCount; id++) {
        final int unscaled = dictionary.decodeToInt(id);
        expandedDictionary[id] = decimalFromLong(unscaled);
    }
}
示例15: setDictionary
import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front into a UTF-8 string, indexed by dictionary id.
 *
 * @param dictionary dictionary whose ids 0..getMaxId() are decoded as binaries
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  _dict = new String[entryCount];
  for (int id = 0; id < entryCount; id++) {
    _dict[id] = dictionary.decodeToBinary(id).toStringUsingUTF8();
  }
}