当前位置: 首页>>代码示例>>Java>>正文


Java Dictionary类代码示例

本文整理汇总了Java中org.apache.parquet.column.Dictionary的典型用法代码示例。如果您正苦于以下问题:Java Dictionary类的具体用法?Java Dictionary怎么用?Java Dictionary使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


Dictionary类属于org.apache.parquet.column包,在下文中一共展示了Dictionary类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createGlobalDictionaries

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Builds a global dictionary for parquet table for BINARY or FIXED_LEN_BYTE_ARRAY column types.
 * It will remove existing dictionaries if present and create new ones.
 *
 * <p>The new dictionary version is the current version plus one. Dictionary files are first written
 * under a temporary root directory and only afterwards turned into the versioned root path.
 *
 * @param fs filesystem
 * @param tableDir root directory for given table that has parquet files
 * @param bufferAllocator memory allocator
 * @return GlobalDictionariesInfo that has dictionary version, root path and columns along with path to dictionary files.
 * @throws IOException if listing, reading or writing on the filesystem fails
 */
public static GlobalDictionariesInfo createGlobalDictionaries(FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
  final FileStatus[] parquetFiles = fs.listStatus(tableDir, PARQUET_FILES_FILTER);
  final Map<ColumnDescriptor, List<Dictionary>> localDictionaries = readLocalDictionaries(fs, parquetFiles, bufferAllocator);
  final long nextVersion = getDictionaryVersion(fs, tableDir) + 1;
  final Path tempRootDir = createTempRootDir(fs, tableDir, nextVersion);
  logger.debug("Building global dictionaries for columns {} with version {}", localDictionaries.keySet(), nextVersion);

  // One dictionary file per column, built from all local dictionaries collected for that column.
  final Map<ColumnDescriptor, Path> dictionaryFiles = Maps.newHashMap();
  for (Map.Entry<ColumnDescriptor, List<Dictionary>> perColumn : localDictionaries.entrySet()) {
    final ColumnDescriptor column = perColumn.getKey();
    final List<Dictionary> columnDictionaries = perColumn.getValue();
    final Path targetFile = dictionaryFilePath(tempRootDir, column);
    logger.debug("Creating a new global dictionary for {} with version {}", column.toString(), nextVersion);
    // No existing dictionary is passed in (null): the global dictionary is rebuilt from scratch.
    createDictionaryFile(fs, targetFile, column, columnDictionaries, null, bufferAllocator);
    dictionaryFiles.put(column, targetFile);
  }

  final Path versionedRootDir = createDictionaryVersionedRootPath(fs, tableDir, nextVersion, tempRootDir);
  return new GlobalDictionariesInfo(dictionaryFiles, versionedRootDir, nextVersion);
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:29,代码来源:GlobalDictionaryBuilder.java

示例2: readDictionaries

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 *
 * <p>The file must contain at most one row group. A file without any row group yields an empty result
 * (no dictionaries, no skipped columns) instead of failing on the missing row group.
 *
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory used to obtain the decompressor matching each column chunk's codec
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException if the file has more than one row group or cannot be read
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  final int rowGroupCount = parquetMetadata.getBlocks().size();
  if (rowGroupCount > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        rowGroupCount, filePath));
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  if (rowGroupCount == 0) {
    // Nothing to scan: avoid IndexOutOfBoundsException from getBlocks().get(0) below.
    return new ImmutablePair<>(dictionaries, columnsToSkip);
  }

  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);

  // Column chunks are identified by path, so index the schema's descriptors by ColumnPath for lookup.
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();
  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  try(final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:40,代码来源:LocalDictionariesReader.java

示例3: testLocalDictionaries

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Reads the local dictionaries of the four phonebook parquet files and checks, per file, how many
 * columns came back with a dictionary (left side of the pair) and how many were skipped (right side).
 */
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator allocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final CodecFactory codecs = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);

    final Path phonebook1 = new Path(tableDirPath, "phonebook1.parquet");
    final Path phonebook2 = new Path(tableDirPath, "phonebook2.parquet");
    final Path phonebook3 = new Path(tableDirPath, "phonebook3.parquet");
    final Path phonebook4 = new Path(partitionDirPath, "phonebook4.parquet");

    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result1 = LocalDictionariesReader.readDictionaries(fs, phonebook1, codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result2 = LocalDictionariesReader.readDictionaries(fs, phonebook2, codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result3 = LocalDictionariesReader.readDictionaries(fs, phonebook3, codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result4 = LocalDictionariesReader.readDictionaries(fs, phonebook4, codecs);

    // Columns for which a dictionary was found.
    assertEquals(2, result1.getLeft().size()); // name and kind have dictionaries
    assertEquals(1, result2.getLeft().size());
    assertEquals(1, result3.getLeft().size());
    assertEquals(1, result4.getLeft().size());

    // Columns reported as not dictionary encoded.
    assertEquals(0, result1.getRight().size());
    assertEquals(1, result2.getRight().size()); // skip name
    assertEquals(1, result3.getRight().size()); // skip name
    assertEquals(1, result4.getRight().size()); // skip name
  }
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:25,代码来源:TestGlobalDictionaryBuilder.java

示例4: readLocalDictionaries

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Collects the local dictionaries of every given parquet file, grouped by column.
 *
 * <p>A column reported as not dictionary encoded by any file is dropped from the result, and
 * dictionaries for that column found in later files are ignored as well.
 *
 * @param fs filesystem
 * @param statuses parquet files to scan
 * @param allocator memory allocator backing the codec factory
 * @return per column, the list of local dictionaries read from the scanned files
 * @throws IOException if reading dictionaries from any file fails
 */
private static Map<ColumnDescriptor, List<Dictionary>> readLocalDictionaries(FileSystem fs, FileStatus[] statuses, BufferAllocator allocator) throws IOException{
  final Set<ColumnDescriptor> excludedColumns = Sets.newHashSet(); // These columns are not dictionary encoded in at least one file.
  final Map<ColumnDescriptor, List<Dictionary>> dictionariesByColumn = Maps.newHashMap();
  final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);

  for (FileStatus file : statuses) {
    final Path path = file.getPath();
    logger.debug("Scanning file {}", path);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> fileResult =
      LocalDictionariesReader.readDictionaries(fs, path, codecFactory);

    // Remember columns which are not dictionary encoded and discard anything gathered for them so far.
    final Set<ColumnDescriptor> notEncoded = fileResult.getRight();
    excludedColumns.addAll(notEncoded);
    dictionariesByColumn.keySet().removeAll(notEncoded);

    for (final Map.Entry<ColumnDescriptor, Dictionary> found : fileResult.getLeft().entrySet()) {
      final ColumnDescriptor column = found.getKey();
      if (excludedColumns.contains(column)) {
        continue;
      }
      List<Dictionary> collected = dictionariesByColumn.get(column);
      if (collected == null) {
        collected = Lists.newArrayList();
        dictionariesByColumn.put(column, collected);
      }
      collected.add(found.getValue());
    }
  }
  logger.debug("Skipping columns {}", excludedColumns);
  return dictionariesByColumn;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:29,代码来源:GlobalDictionaryBuilder.java

示例5: buildIntegerGlobalDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges int values from the given local dictionaries (and, if present, an existing global dictionary)
 * into a single vector holding each distinct value once, in ascending order.
 *
 * @param dictionaries local dictionaries to merge
 * @param existingDict existing global dictionary whose first vector holds int values, or null
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator memory allocator for the returned container
 * @return container with one nullable 32-bit signed int vector of sorted distinct values
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(32, true), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final NullableIntVector intVector = input.addOrGet(field);
  intVector.allocateNew();

  // A sorted set both de-duplicates and orders the merged values.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); id++) {
      merged.add(local.decodeToInt(id));
    }
  }
  if (existingDict != null) {
    final NullableIntVector previous = existingDict.getValueAccessorById(NullableIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int recordCount = 0;
  for (int value : merged) {
    intVector.getMutator().setSafe(recordCount, value);
    recordCount++;
  }
  intVector.getMutator().setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:28,代码来源:GlobalDictionaryBuilder.java

示例6: buildLongGlobalDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges long values from the given local dictionaries (and, if present, an existing global dictionary)
 * into a single vector holding each distinct value once, in ascending order.
 *
 * @param dictionaries local dictionaries to merge
 * @param existingDict existing global dictionary whose first vector holds long values, or null
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator memory allocator for the returned container
 * @return container with one nullable 64-bit signed int vector of sorted distinct values
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(64, true), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final NullableBigIntVector longVector = input.addOrGet(field);
  longVector.allocateNew();

  // A sorted set both de-duplicates and orders the merged values.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); id++) {
      merged.add(local.decodeToLong(id));
    }
  }
  if (existingDict != null) {
    final NullableBigIntVector previous = existingDict.getValueAccessorById(NullableBigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int recordCount = 0;
  for (long value : merged) {
    longVector.getMutator().setSafe(recordCount, value);
    recordCount++;
  }
  longVector.getMutator().setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:28,代码来源:GlobalDictionaryBuilder.java

示例7: buildDoubleGlobalDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges double values from the given local dictionaries (and, if present, an existing global dictionary)
 * into a single vector holding each distinct value once, in ascending order.
 *
 * @param dictionaries local dictionaries to merge
 * @param existingDict existing global dictionary whose first vector holds double values, or null
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator memory allocator for the returned container
 * @return container with one nullable double-precision vector of sorted distinct values
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final NullableFloat8Vector doubleVector = input.addOrGet(field);
  doubleVector.allocateNew();

  // A sorted set both de-duplicates and orders the merged values.
  final SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); id++) {
      merged.add(local.decodeToDouble(id));
    }
  }
  if (existingDict != null) {
    final NullableFloat8Vector previous = existingDict.getValueAccessorById(NullableFloat8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int recordCount = 0;
  for (double value : merged) {
    doubleVector.getMutator().setSafe(recordCount, value);
    recordCount++;
  }
  doubleVector.getMutator().setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:28,代码来源:GlobalDictionaryBuilder.java

示例8: buildFloatGlobalDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges float values from the given local dictionaries (and, if present, an existing global dictionary)
 * into a single vector holding each distinct value once, in ascending order.
 *
 * @param dictionaries local dictionaries to merge
 * @param existingDict existing global dictionary whose first vector holds float values, or null
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator memory allocator for the returned container
 * @return container with one nullable single-precision vector of sorted distinct values
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final NullableFloat4Vector floatVector = input.addOrGet(field);
  floatVector.allocateNew();

  // A sorted set both de-duplicates and orders the merged values.
  final SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); id++) {
      merged.add(local.decodeToFloat(id));
    }
  }
  if (existingDict != null) {
    final NullableFloat4Vector previous = existingDict.getValueAccessorById(NullableFloat4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.getAccessor().get(row));
    }
  }

  int recordCount = 0;
  for (float value : merged) {
    floatVector.getMutator().setSafe(recordCount, value);
    recordCount++;
  }
  floatVector.getMutator().setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:28,代码来源:GlobalDictionaryBuilder.java

示例9: buildBinaryGlobalDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Merges binary values from the given local dictionaries (and, if present, an existing global dictionary)
 * into a single vector holding each distinct value once, ordered by {@code Binary}'s natural ordering.
 *
 * @param dictionaries local dictionaries to merge
 * @param existingDict existing global dictionary whose first vector holds binary values, or null
 * @param columnDescriptor column the dictionary belongs to; its path names the output field
 * @param bufferAllocator memory allocator for the returned container
 * @return container with one nullable variable-length binary vector of sorted distinct values
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Binary(), null);
  final VectorContainer input = new VectorContainer(bufferAllocator);
  final NullableVarBinaryVector binaryVector = input.addOrGet(field);
  binaryVector.allocateNew();

  // A sorted set both de-duplicates and orders the merged values.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); id++) {
      merged.add(local.decodeToBinary(id));
    }
  }
  if (existingDict != null) {
    final NullableVarBinaryVector previous = existingDict.getValueAccessorById(NullableVarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      final byte[] existingValue = previous.getAccessor().get(row);
      merged.add(Binary.fromConstantByteArray(existingValue));
    }
  }

  int recordCount = 0;
  for (Binary value : merged) {
    final byte[] bytes = value.getBytes();
    binaryVector.getMutator().setSafe(recordCount, bytes, 0, bytes.length);
    recordCount++;
  }
  binaryVector.getMutator().setValueCount(recordCount);
  input.setRecordCount(recordCount);
  input.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return input;
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:29,代码来源:GlobalDictionaryBuilder.java

示例10: readDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Reads the dictionary page described by {@code pageHeader} from the stream and decodes it.
 *
 * <p>The stream is positioned at {@code pageHeader.getOffset()} and exactly the compressed page size
 * is read from there (presumably the start of the page payload; verify against the code producing
 * {@code PageHeaderWithOffset}). The bytes are then decompressed and turned into a {@link Dictionary}
 * using the encoding recorded in the dictionary page header.
 *
 * @param in open stream over the parquet file; its position is changed by this call
 * @param column column the dictionary belongs to
 * @param pageHeader header of the dictionary page together with its offset in the file
 * @param decompressor decompressor for the column chunk's codec
 * @return dictionary decoded from the page
 * @throws IOException if the stream ends before the whole page is read, or on any other read failure
 */
public static Dictionary readDictionary(FSDataInputStream in, ColumnDescriptor column, PageHeaderWithOffset pageHeader, BytesDecompressor decompressor) throws IOException {
  in.seek(pageHeader.getOffset());
  final byte[] data = new byte[pageHeader.getPageHeader().getCompressed_page_size()];
  // A single read() may legally return fewer bytes than requested, so keep reading until the
  // buffer is full or the stream is exhausted.
  int read = 0;
  while (read < data.length) {
    final int chunk = in.read(data, read, data.length - read);
    if (chunk < 0) {
      break;
    }
    read += chunk;
  }
  if (read != data.length) {
    throw new IOException(format("Failed to read dictionary page, read %d bytes, expected %d", read, data.length));
  }
  final DictionaryPage dictionaryPage = new DictionaryPage(
    decompressor.decompress(BytesInput.from(data), pageHeader.getPageHeader().getUncompressed_page_size()),
    pageHeader.getPageHeader().getDictionary_page_header().getNum_values(),
    CONVERTER.getEncoding(pageHeader.getPageHeader().getDictionary_page_header().getEncoding()));
  return dictionaryPage.getEncoding().initDictionary(column, dictionaryPage);
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:14,代码来源:LocalDictionariesReader.java

示例11: main

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Command line entry point: prints the local dictionaries of a parquet file on the local filesystem,
 * followed by the binary columns which are not dictionary encoded.
 *
 * @param args {@code args[0]} is the path of the parquet file to inspect
 */
public static void main(String[] args) {
  // Fail with a readable message instead of ArrayIndexOutOfBoundsException when the path is missing.
  if (args == null || args.length == 0) {
    System.err.println("Usage: LocalDictionariesReader <parquet file path>");
    return;
  }
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final FileSystem fs = FileSystem.getLocal(new Configuration());
    final Path filePath = new Path(args[0]);
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries = readDictionaries(fs, filePath, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry :  dictionaries.getLeft().entrySet()) {
      printDictionary(entry.getKey(), entry.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + dictionaries.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:15,代码来源:LocalDictionariesReader.java

示例12: printDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Prints every entry of a dictionary to standard output as {@code id: value}, decoding each value
 * according to the column's primitive type.
 *
 * <p>INT96, BINARY and FIXED_LEN_BYTE_ARRAY values are printed as a string built from the raw bytes
 * using the platform default charset. Types without a matching case print nothing.
 *
 * @param columnDescriptor column the dictionary belongs to; selects how values are decoded
 * @param localDictionary dictionary to print
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the largest valid id (inclusive), so the bound must be <= to include the last entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:30,代码来源:LocalDictionariesReader.java

示例13: setDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front into a string {@code Value}, indexed by dictionary id.
 *
 * @param dictionary dictionary whose binary entries are expanded
 */
@Override
public void setDictionary(Dictionary dictionary)
{
    final int entryCount = dictionary.getMaxId() + 1;
    expandedDictionary = new Value[entryCount];
    for (int id = 0; id < entryCount; id++) {
        // getBytes() returns a copied array, so ValueFactory#newString does not need to copy it again.
        expandedDictionary[id] = ValueFactory.newString(dictionary.decodeToBinary(id).getBytes());
    }
}
 
开发者ID:CyberAgent,项目名称:embulk-input-parquet_hadoop,代码行数:11,代码来源:ParquetStringConverter.java

示例14: setDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front: each int entry is converted with {@code decimalFromLong}
 * and stored at its dictionary id.
 *
 * @param dictionary dictionary whose int entries are expanded
 */
@Override
public void setDictionary(Dictionary dictionary)
{
    final int entryCount = dictionary.getMaxId() + 1;
    expandedDictionary = new Value[entryCount];
    for (int id = 0; id < entryCount; id++) {
        final int unscaled = dictionary.decodeToInt(id);
        expandedDictionary[id] = decimalFromLong(unscaled);
    }
}
 
开发者ID:CyberAgent,项目名称:embulk-input-parquet_hadoop,代码行数:9,代码来源:ParquetDecimalConverter.java

示例15: setDictionary

import org.apache.parquet.column.Dictionary; //导入依赖的package包/类
/**
 * Decodes every dictionary entry up front into a UTF-8 string, indexed by dictionary id.
 *
 * @param dictionary dictionary whose binary entries are expanded
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  _dict = new String[entryCount];
  for (int id = 0; id < entryCount; id++) {
    _dict[id] = dictionary.decodeToBinary(id).toStringUsingUTF8();
  }
}
 
开发者ID:h2oai,项目名称:h2o-3,代码行数:8,代码来源:ChunkConverter.java


注:本文中的org.apache.parquet.column.Dictionary类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。