

Java DatasetDescriptor Class Code Examples

This article collects typical usage examples of the Java class com.cloudera.cdk.data.DatasetDescriptor. If you are wondering what DatasetDescriptor is for, how to use it, or what real-world code that uses it looks like, the curated examples below should help.


The DatasetDescriptor class belongs to the com.cloudera.cdk.data package. The sections below show 15 code examples of the class, sorted by popularity by default.
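
Before the individual examples, a quick orientation may help: a DatasetDescriptor describes a dataset's Avro schema, storage format, and optional partition strategy; it is assembled with DatasetDescriptor.Builder and then handed to a DatasetRepository. The sketch below is a minimal illustration built only from builder calls that appear in the examples that follow (schemaLiteral, format, partitionStrategy, create); the class name, method name, dataset name, and schema parameter are illustrative placeholders rather than part of the CDK API.

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetRepository;
import com.cloudera.cdk.data.Formats;
import com.cloudera.cdk.data.PartitionStrategy;
import org.apache.avro.generic.GenericData;

public class DatasetDescriptorSketch {

  // Assemble a descriptor from an Avro schema literal, pick a storage format,
  // optionally attach a partition strategy, then register it with a repository.
  // "repo", "users", and userSchemaJson are placeholders supplied by the caller.
  public static Dataset<GenericData.Record> createUsersDataset(
      DatasetRepository repo, String userSchemaJson) {

    PartitionStrategy strategy = new PartitionStrategy.Builder()
        .hash("username", 2)           // hash-partition records on the "username" field
        .build();

    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral(userSchemaJson) // alternatively .schema(...) or .schemaUri(...)
        .format(Formats.PARQUET)       // the examples below use Formats.AVRO or Formats.PARQUET
        .partitionStrategy(strategy)
        .build();

    return repo.create("users", descriptor);
  }
}

Most of the examples that follow use this create-or-update pattern, varying only how the schema is supplied (schema, schemaLiteral, schemaUri) and which format and partition strategy are chosen.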

Example 1: testGeneric

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera, Project: cdk, Lines: 20, Source: TestCrunchDatasets.java

Example 2: testGenericParquet

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera, Project: cdk, Lines: 20, Source: TestCrunchDatasets.java

Example 3: execute

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
  if (avroSchemaFile == null && avroSchemaReflectClass == null) {
    throw new IllegalArgumentException("One of cdk.avroSchemaFile or " +
        "cdk.avroSchemaReflectClass must be specified");
  }

  DatasetRepository repo = getDatasetRepository();

  DatasetDescriptor descriptor = repo.load(datasetName).getDescriptor();
  DatasetDescriptor.Builder descriptorBuilder =
      new DatasetDescriptor.Builder(descriptor);
  configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

  repo.update(datasetName, descriptorBuilder.build());
}
 
Developer: cloudera, Project: cdk, Lines: 17, Source: UpdateDatasetMojo.java

Example 4: execute

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
  if (avroSchemaFile == null && avroSchemaReflectClass == null) {
    throw new IllegalArgumentException("One of cdk.avroSchemaFile or " +
        "cdk.avroSchemaReflectClass must be specified");
  }

  DatasetRepository repo = getDatasetRepository();

  DatasetDescriptor.Builder descriptorBuilder = new DatasetDescriptor.Builder();
  configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

  if (format.equals(Formats.AVRO.getName())) {
    descriptorBuilder.format(Formats.AVRO);
  } else if (format.equals(Formats.PARQUET.getName())) {
    descriptorBuilder.format(Formats.PARQUET);
  } else {
    throw new MojoExecutionException("Unrecognized format: " + format);
  }

  if (partitionExpression != null) {
    descriptorBuilder.partitionStrategy(Accessor.getDefault().fromExpression(partitionExpression));
  }

  repo.create(datasetName, descriptorBuilder.build());
}
 
Developer: cloudera, Project: cdk, Lines: 27, Source: CreateDatasetMojo.java

Example 5: updateTableSchema

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  if (table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    table.setProperty(
        AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
        descriptor.getSchema().toString());
  } else if (table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (descriptor.getSchemaUrl() == null) {
      throw new MetadataProviderException("Cannot update " +
          AVRO_SCHEMA_URL_PROPERTY_NAME + " since descriptor schema URL is not set.");
    }
    table.setProperty(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        descriptor.getSchemaUrl().toExternalForm());
  } else {
    throw new MetadataProviderException("Cannot update Avro schema since neither " +
        AVRO_SCHEMA_LITERAL_PROPERTY_NAME + " nor " + AVRO_SCHEMA_URL_PROPERTY_NAME +
        " is set.");
  }
}
 
Developer: cloudera, Project: cdk, Lines: 20, Source: HiveUtils.java

Example 6: update

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@SuppressWarnings("deprecation")
@Override
public DatasetDescriptor update(String name, DatasetDescriptor descriptor) {
  Preconditions.checkArgument(name != null, "Name cannot be null");
  Preconditions.checkArgument(descriptor != null,
      "Descriptor cannot be null");

  if (!exists(name)) {
    throw new com.cloudera.cdk.data.NoSuchDatasetException("Table not found: " + name);
  }

  Table table = hcat.getTable(HiveUtils.DEFAULT_DB, name);
  HiveUtils.updateTableSchema(table, descriptor);
  hcat.alterTable(table);
  return descriptor;
}
 
Developer: cloudera, Project: cdk, Lines: 17, Source: HCatalogMetadataProvider.java

Example 7: testPartitionedSourceAndTarget

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> inputPart0 = inputDataset.getPartition(key, false);
  Dataset<Record> outputPart0 = outputDataset.getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Developer: cloudera, Project: cdk, Lines: 26, Source: TestCrunchDatasets.java

Example 8: newCompositeDataset

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@SuppressWarnings("unchecked")
private <E> RandomAccessDataset<E> newCompositeDataset(String name, String tableName,
    List<DatasetDescriptor> descriptors) {
  List<Class<SpecificRecord>> subEntityClasses = new ArrayList<Class<SpecificRecord>>();
  for (DatasetDescriptor descriptor : descriptors) {
    try {
      Class<SpecificRecord> subEntityClass = (Class<SpecificRecord>) Class
          .forName(descriptor.getSchema().getFullName());
      subEntityClasses.add(subEntityClass);
    } catch (ClassNotFoundException e) {
      throw new DatasetRepositoryException(e);
    }
  }
  Dao dao = SpecificAvroDao.buildCompositeDaoWithEntityManager(tablePool,
      tableName, subEntityClasses, schemaManager);
  return new DaoDataset<E>(name, dao, descriptors.get(0));
}
 
Developer: cloudera, Project: cdk, Lines: 18, Source: HBaseDatasetRepository.java

Example 9: beforeClass

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@BeforeClass
public static void beforeClass() throws Exception {
  HBaseTestUtils.getMiniCluster();
  // managed table should be created by HBaseDatasetRepository
  HBaseTestUtils.util.deleteTable(Bytes.toBytes(managedTableName));
  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(HBaseTestUtils.getConf()).build();
  String testGenericEntity = AvroUtils.inputStreamToString(
      HBaseDatasetRepositoryTest.class.getResourceAsStream("/TestGenericEntity.avsc"));
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  dataset = repo.create("testtable", descriptor);
  for (int i = 0; i < 10; i++) {
    dataset.put(HBaseDatasetRepositoryTest.createGenericEntity(i));
  }
}
 
Developer: cloudera, Project: cdk, Lines: 18, Source: HBaseDatasetReaderTest.java

Example 10: newFileWriter

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106
public static <E> DatasetWriter<E> newFileWriter(
    FileSystem fs, Path path, DatasetDescriptor descriptor) {
  // ensure the path exists
  try {
    fs.mkdirs(path);
  } catch (IOException ex) {
    throw new DatasetWriterException("Could not create path:" + path, ex);
  }

  final Format format = descriptor.getFormat();
  final Path file = new Path(path, uniqueFilename(descriptor.getFormat()));

  if (Formats.PARQUET.equals(format)) {
    return new ParquetFileSystemDatasetWriter(fs, file, descriptor.getSchema());
  } else if (Formats.AVRO.equals(format)) {
    return new FileSystemDatasetWriter.Builder()
        .fileSystem(fs)
        .path(file)
        .schema(descriptor.getSchema())
        .build();
  } else {
    throw new UnknownFormatException("Unknown format:" + format);
  }
}
 
Developer: cloudera, Project: cdk, Lines: 26, Source: FileSystemWriters.java

Example 11: load

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Override
public <E> Dataset<E> load(String name) {
  Preconditions.checkArgument(name != null, "Name can not be null");

  logger.debug("Loading dataset:{}", name);

  DatasetDescriptor descriptor = metadataProvider.load(name);

  FileSystemDataset<E> ds = new FileSystemDataset.Builder()
      .name(name)
      .configuration(conf)
      .descriptor(descriptor)
      .partitionKey(descriptor.isPartitioned() ?
          com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
          null)
      .build();

  logger.debug("Loaded dataset:{}", ds);

  return ds;
}
 
Developer: cloudera, Project: cdk, Lines: 22, Source: FileSystemDatasetRepository.java

Example 12: ensureExists

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
/**
 * Creates, if necessary, the location given by {@code descriptor}.
 *
 * @param conf A Configuration
 * @param descriptor A DatasetDescriptor
 */
static void ensureExists(
    DatasetDescriptor descriptor, Configuration conf) {
  Preconditions.checkArgument(descriptor.getLocation() != null,
      "Cannot get FileSystem for a descriptor with no location");
  final Path dataPath = new Path(descriptor.getLocation());

  final FileSystem fs = fsForPath(dataPath, conf);

  try {
    if (!fs.exists(dataPath)) {
      fs.mkdirs(dataPath);
    }
  } catch (IOException ex) {
    throw new DatasetRepositoryException("Cannot access data location", ex);
  }
}
 
Developer: cloudera, Project: cdk, Lines: 23, Source: FileSystemDatasetRepository.java

Example 13: AbstractRangeView

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
protected AbstractRangeView(Dataset<E> dataset) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.range = new MarkerRange(new MarkerComparator(
        descriptor.getPartitionStrategy()));
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    // use UNDEFINED, which handles inappropriate calls to range methods
    this.range = MarkerRange.UNDEFINED;
    this.keys = null; // not used
  }
}
 
Developer: cloudera, Project: cdk, Lines: 19, Source: AbstractRangeView.java

Example 14: testUnknownFormat

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Test(expected = UnknownFormatException.class)
public void testUnknownFormat() throws IOException {
  final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(STRING_SCHEMA)
      .format(Accessor.getDefault().newFormat("explode!"))
      .build();

  MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
      fileSystem, Lists.newArrayList(TEST_FILE), descriptor);

  try {
    reader.open();
  } finally {
    reader.close();
  }
}
 
Developer: cloudera, Project: cdk, Lines: 17, Source: TestMultiFileDatasetReader.java

Example 15: testWriteAndRead

import com.cloudera.cdk.data.DatasetDescriptor; // import the required package/class
@Test
public void testWriteAndRead() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder()
      .name("test")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schemaUri(USER_SCHEMA_URL)
          .format(format)
          .location(testDirectory)
          .build())
      .build();

  Assert.assertFalse("Dataset is not partitioned", ds.getDescriptor()
      .isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
 
Developer: cloudera, Project: cdk, Lines: 19, Source: TestFileSystemDataset.java


Note: The com.cloudera.cdk.data.DatasetDescriptor examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. Please consult each project's license before using or redistributing the code, and do not republish this article without permission.