

Java ColumnDescriptor Class Code Examples

This article collects typical usage examples of the Java class org.apache.parquet.column.ColumnDescriptor. If you are wondering what ColumnDescriptor is for, or how to use it in practice, the curated examples below should help.


The ColumnDescriptor class belongs to the org.apache.parquet.column package. Fifteen code examples of the class are shown below, ordered by popularity.
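As a quick orientation before the examples: a ColumnDescriptor describes one leaf column of a Parquet schema — its path, its primitive type, and its maximum repetition/definition levels. The minimal sketch below (the class name and schema string are invented for illustration; in practice the MessageType usually comes from a file footer) shows how descriptors are obtained and what they expose:

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnDescriptorTour {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message phonebook { required binary name (UTF8); optional int64 phone; }");
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      System.out.println(Arrays.toString(descriptor.getPath())  // leaf path, e.g. [name]
          + " type=" + descriptor.getType()                     // primitive type, e.g. BINARY
          + " maxDef=" + descriptor.getMaxDefinitionLevel()     // 0 for required, 1 for optional here
          + " maxRep=" + descriptor.getMaxRepetitionLevel());   // 0 when nothing on the path repeats
    }
  }
}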

Example 1: addColumn

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void addColumn(ColumnDescriptor descriptor, ColumnChunkMetaData metaData) throws IOException {
  final FSDataInputStream in;
  if (useSingleStream) {
    if (streams.isEmpty()) {
      in = fs.open(path);
      streams.add(in);
    } else {
      in = streams.get(0);
    }
    in.seek(metaData.getStartingPos());
    columns.put(descriptor, new SingleStreamColumnChunkIncPageReader(metaData, descriptor, in));
  } else {
    // create new stream per column
    in = fs.open(path);
    streams.add(in);
    in.seek(metaData.getStartingPos());
    columns.put(descriptor, new ColumnChunkIncPageReader(metaData, descriptor, in));
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 20, Source: ColumnChunkIncReadStore.java

Example 2: createGlobalDictionaries

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
 * Builds a global dictionary for a parquet table, for BINARY or FIXED_LEN_BYTE_ARRAY column types.
 * It will remove existing dictionaries if present and create new ones.
 * @param fs filesystem
 * @param tableDir root directory for given table that has parquet files
 * @param bufferAllocator memory allocator
 * @return GlobalDictionariesInfo that has dictionary version, root path and columns along with path to dictionary files.
 * @throws IOException
 */
public static GlobalDictionariesInfo createGlobalDictionaries(FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
  final FileStatus[] statuses = fs.listStatus(tableDir, PARQUET_FILES_FILTER);
  final Map<ColumnDescriptor, Path> globalDictionaries = Maps.newHashMap();
  final Map<ColumnDescriptor, List<Dictionary>> allDictionaries = readLocalDictionaries(fs, statuses, bufferAllocator);
  final long dictionaryVersion = getDictionaryVersion(fs, tableDir) + 1;
  final Path tmpDictionaryRootDir = createTempRootDir(fs, tableDir, dictionaryVersion);
  logger.debug("Building global dictionaries for columns {} with version {}", allDictionaries.keySet(), dictionaryVersion);

  // Sort all local dictionaries and write them to files, with an index if needed
  for (Map.Entry<ColumnDescriptor, List<Dictionary>> entry : allDictionaries.entrySet()) {
    final ColumnDescriptor columnDescriptor = entry.getKey();
    final Path dictionaryFile = dictionaryFilePath(tmpDictionaryRootDir, columnDescriptor);
    logger.debug("Creating a new global dictionary for {} with version {}", columnDescriptor.toString(), dictionaryVersion);
    createDictionaryFile(fs, dictionaryFile, columnDescriptor, entry.getValue(), null, bufferAllocator);
    globalDictionaries.put(columnDescriptor, dictionaryFile);
  }
  final Path finalDictionaryRootDir = createDictionaryVersionedRootPath(fs, tableDir, dictionaryVersion, tmpDictionaryRootDir);
  return new GlobalDictionariesInfo(globalDictionaries, finalDictionaryRootDir,  dictionaryVersion);
}
 
Developer: dremio, Project: dremio-oss, Lines: 29, Source: GlobalDictionaryBuilder.java

Example 3: main

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public static void main(String []args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final Path tableDir  = new Path(args[0]);
    final FileSystem fs = tableDir.getFileSystem(new Configuration());
    if (fs.exists(tableDir) && fs.isDirectory(tableDir)) {
      Map<ColumnDescriptor, Path> dictionaryEncodedColumns = createGlobalDictionaries(fs, tableDir, bufferAllocator).getColumnsToDictionaryFiles();
      long version = getDictionaryVersion(fs, tableDir);
      Path dictionaryRootDir = getDictionaryVersionedRootPath(fs, tableDir, version);
      for (ColumnDescriptor columnDescriptor: dictionaryEncodedColumns.keySet()) {
        final VectorContainer data = readDictionary(fs, dictionaryRootDir, columnDescriptor, bufferAllocator);
        System.out.println("Dictionary for column [" + columnDescriptor.toString() + " size " + data.getRecordCount());
        BatchPrinter.printBatch(data);
        data.clear();
      }
    }
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 20, Source: GlobalDictionaryBuilder.java

Example 4: readDictionaries

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
 * Returns the dictionaries for all binary columns in the given parquet file, which must contain a single row group.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory providing decompressors for the dictionary pages
 * @return pair of dictionaries found for binary fields and set of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CodecFactory codecFactory) throws IOException {
  final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(fs.getConf(), filePath, ParquetMetadataConverter.NO_FILTER);
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSDataInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Developer: dremio, Project: dremio-oss, Lines: 40, Source: LocalDictionariesReader.java

Example 5: convertColumnDescriptor

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
 * Converts {@link ColumnDescriptor} to {@link SchemaPath} and converts any parquet LOGICAL LIST to something
 * the execution engine can understand (removes the extra 'list' and 'element' fields from the name)
 */
private static SchemaPath convertColumnDescriptor(final MessageType schema, final ColumnDescriptor columnDescriptor) {
  List<String> path = Lists.newArrayList(columnDescriptor.getPath());

  // go through the path and find all logical lists
  int index = 0;
  Type type = schema;
  while (!type.isPrimitive()) { // don't bother checking the last element in the path as it is a primitive type
    type = type.asGroupType().getType(path.get(index));
    if (type.getOriginalType() == OriginalType.LIST && LogicalListL1Converter.isSupportedSchema(type.asGroupType())) {
      // remove 'list'
      type = type.asGroupType().getType(path.get(index+1));
      path.remove(index+1);
      // remove 'element'
      type = type.asGroupType().getType(path.get(index+1));
      path.remove(index+1);
    }
    index++;
  }

  String[] schemaColDesc = new String[path.size()];
  path.toArray(schemaColDesc);
  return SchemaPath.getCompoundPath(schemaColDesc);
}
 
Developer: dremio, Project: dremio-oss, Lines: 28, Source: ParquetRowiseReader.java
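To make the path shape concrete: a standard three-level Parquet LIST yields a raw column path of [tags, list, element], which the method above collapses to just tags. A small hedged sketch (schema and class name invented for illustration):

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ListPathDemo {
  public static void main(String[] args) {
    // Standard 3-level LIST layout: group (LIST) -> repeated group 'list' -> 'element'.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message m {"
        + " optional group tags (LIST) {"
        + "   repeated group list { optional binary element (UTF8); }"
        + " }"
        + "}");
    for (ColumnDescriptor cd : schema.getColumns()) {
      // Prints [tags, list, element]; convertColumnDescriptor above strips the
      // synthetic 'list' and 'element' levels, leaving the SchemaPath `tags`.
      System.out.println(Arrays.toString(cd.getPath()));
    }
  }
}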

Example 6: ColumnReader

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                       ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  this.parentReader = parentReader;
  this.columnDescriptor = descriptor;
  this.columnChunkMetaData = columnChunkMetaData;
  this.isFixedLength = fixedLength;
  this.schemaElement = schemaElement;
  this.valueVec =  v;
  this.pageReader = (parentReader.getSingleStream() != null)?
    new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getHadoopPath(), columnChunkMetaData) :
    new PageReader(this, parentReader.getFileSystem(), parentReader.getHadoopPath(), columnChunkMetaData);

  if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
    if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
    } else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
      && (valueVec instanceof TimeStampMilliVector || valueVec instanceof NullableTimeStampMilliVector)) {
      // if int 96 column is being read as a Timestamp, this truncates the time format used by Impala
      // dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
      // needs to be set to the bit width of the resulting Arrow type, usually this matches the input length
      dataTypeLengthInBits = 64;
    } else {
      dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
    }
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 27, Source: ColumnReader.java

Example 7: testLocalDictionaries

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator bufferAllocator = new RootAllocator(SabotConfig.getMaxDirectMemory())) {
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries1 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook1.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries2 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook2.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries3 =
      LocalDictionariesReader.readDictionaries(fs, new Path(tableDirPath, "phonebook3.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries4 =
      LocalDictionariesReader.readDictionaries(fs, new Path(partitionDirPath, "phonebook4.parquet"), codecFactory);

    assertEquals(2, dictionaries1.getKey().size()); // name and kind have dictionaries
    assertEquals(1, dictionaries2.getKey().size());
    assertEquals(1, dictionaries3.getKey().size());
    assertEquals(1, dictionaries4.getKey().size());

    assertEquals(0, dictionaries1.getValue().size());
    assertEquals(1, dictionaries2.getValue().size()); // skip name
    assertEquals(1, dictionaries3.getValue().size()); // skip name
    assertEquals(1, dictionaries4.getValue().size()); // skip name
  }
}
 
Developer: dremio, Project: dremio-oss, Lines: 25, Source: TestGlobalDictionaryBuilder.java

Example 8: read

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void read(String fileName) throws IOException
{
    Path path = new Path(fileName);
    Configuration conf = new Configuration();
    conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());

    ParquetMetadata metadata = ParquetFileReader.readFooter(conf, path, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, metadata.getFileMetaData(), path, metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());
    PageReadStore pageReadStore;
    PageReader pageReader;
    DataPage page;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor cd: metadata.getFileMetaData().getSchema().getColumns()) {
            pageReader = pageReadStore.getPageReader(cd);
            page = pageReader.readPage();
        }
    }
}
 
Developer: dbiir, Project: RealtimeAnalysis, Lines: 20, Source: ParquetFileReaderTest.java
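Example 8 discards each page after reading it. As a hedged variation, the fragment below (reusing the reader, pageReadStore, pageReader, and page variables declared above) tallies how many values each column stores; DataPage.getValueCount() is part of the public page API:

// Count the values stored per column instead of discarding each page.
while ((pageReadStore = reader.readNextRowGroup()) != null) {
    for (ColumnDescriptor cd : metadata.getFileMetaData().getSchema().getColumns()) {
        pageReader = pageReadStore.getPageReader(cd);
        long values = 0;
        while ((page = pageReader.readPage()) != null) {
            values += page.getValueCount(); // every DataPage reports its value count
        }
        System.out.println(String.join(".", cd.getPath()) + ": " + values + " values");
    }
}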

Example 9: loadParquetSchema

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
 * Scan the Parquet footer, then map each Parquet column to the list of columns
 * we want to read. Track those to be read.
 */

private void loadParquetSchema() {
  // TODO - figure out how to deal with this better once we add nested reading, note also look where this map is used below
  // store a map from column name to converted types if they are non-null
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

  // loop to add up the length of the fixed width columns and build the schema
  for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    ParquetColumnMetadata columnMetadata = new ParquetColumnMetadata(column);
    columnMetadata.resolveDrillType(schemaElements, options);
    if (! fieldSelected(columnMetadata.field)) {
      continue;
    }
    selectedColumnMetadata.add(columnMetadata);
  }
}
 
Developer: axbaretto, Project: drill, Lines: 21, Source: ParquetSchema.java

Example 10: initialize

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Developer: apache, Project: tajo, Lines: 24, Source: InternalParquetRecordReader.java

Example 11: initialize

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
public void initialize(MessageType fileSchema,
                       FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
          configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
          configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = true;
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  Log.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Developer: h2oai, Project: h2o-3, Lines: 24, Source: H2OInternalParquetReader.java

Example 12: showDetails

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  OriginalType otype = type.getOriginalType();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (otype != null) out.format(" O:%s", otype);

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[cpath.size()]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }
  out.println();
}
 
Developer: apache, Project: parquet-mr, Lines: 23, Source: MetadataUtils.java

Example 13: newValuesWriter

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
@Override
public ValuesWriter newValuesWriter(ColumnDescriptor descriptor) {
  switch (descriptor.getType()) {
    case BOOLEAN:
      return getBooleanValuesWriter();
    case FIXED_LEN_BYTE_ARRAY:
      return getFixedLenByteArrayValuesWriter(descriptor);
    case BINARY:
      return getBinaryValuesWriter(descriptor);
    case INT32:
      return getInt32ValuesWriter(descriptor);
    case INT64:
      return getInt64ValuesWriter(descriptor);
    case INT96:
      return getInt96ValuesWriter(descriptor);
    case DOUBLE:
      return getDoubleValuesWriter(descriptor);
    case FLOAT:
      return getFloatValuesWriter(descriptor);
    default:
      throw new IllegalArgumentException("Unknown type " + descriptor.getType());
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 24, Source: DefaultV2ValuesWriterFactory.java

Example 14: dictionaryWriter

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
static DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, ParquetProperties properties, Encoding dictPageEncoding, Encoding dataPageEncoding) {
  switch (path.getType()) {
    case BOOLEAN:
      throw new IllegalArgumentException("no dictionary encoding for BOOLEAN");
    case BINARY:
      return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT32:
      return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT64:
      return new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case INT96:
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), 12, dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case DOUBLE:
      return new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FLOAT:
      return new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    case FIXED_LEN_BYTE_ARRAY:
      return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(properties.getDictionaryPageSizeThreshold(), path.getTypeLength(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
    default:
      throw new IllegalArgumentException("Unknown type " + path.getType());
  }
}
 
Developer: apache, Project: parquet-mr, Lines: 23, Source: DefaultValuesWriterFactory.java
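For context, here is a hedged sketch of how the factory methods in Examples 13 and 14 are reached in practice: build ParquetProperties, initialize a ValuesWriterFactory with them, then request one writer per column. The class name and schema string are invented for illustration:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.factory.DefaultV2ValuesWriterFactory;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ValuesWriterDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message m { required binary name (UTF8); required int32 age; }");
    ParquetProperties props = ParquetProperties.builder()
        .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
        .withDictionaryEncoding(true)       // dictionary writers fall back to plain once the dictionary grows too large
        .withDictionaryPageSize(64 * 1024)  // the threshold consulted by dictionaryWriter above
        .build();
    ValuesWriterFactory factory = new DefaultV2ValuesWriterFactory();
    factory.initialize(props);
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      ValuesWriter writer = factory.newValuesWriter(descriptor); // dispatches on descriptor.getType()
      System.out.println(descriptor + " -> " + writer.getClass().getSimpleName());
    }
  }
}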

Example 15: ColumnReaderImpl

import org.apache.parquet.column.ColumnDescriptor; // import the required package/class
/**
 * creates a reader for triplets
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 */
public ColumnReaderImpl(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = checkNotNull(path, "path");
  this.pageReader = checkNotNull(pageReader, "pageReader");
  this.converter = checkNotNull(converter, "converter");
  this.writerVersion = writerVersion;
  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  } else {
    this.dictionary = null;
  }
  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
  consume();
}
 
Developer: apache, Project: parquet-mr, Lines: 30, Source: ColumnReaderImpl.java
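For context on what a "reader for triplets" means in use: each triplet is (repetition level, definition level, optional value), and record assembly drains a column roughly as in the hedged fragment below, which assumes a columnReader built as above and the same descriptor passed to its constructor:

// Hedged sketch, in the style of parquet-mr's record assembly.
long total = columnReader.getTotalValueCount();
for (long i = 0; i < total; i++) {
  int d = columnReader.getCurrentDefinitionLevel();
  if (d == descriptor.getMaxDefinitionLevel()) {
    columnReader.writeCurrentValueToConverter(); // defined: push the value into the PrimitiveConverter
  }
  // triplets with d < maxDefinitionLevel represent nulls and carry no stored value
  columnReader.consume(); // advance to the next triplet
}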


Note: The org.apache.parquet.column.ColumnDescriptor class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from projects contributed by open-source developers; copyright in the source code remains with the original authors. Consult each project's License before distributing or using the code, and do not republish without permission.