本文整理汇总了Java中com.cloudera.cdk.data.DatasetDescriptor类的典型用法代码示例。如果您正苦于以下问题:Java DatasetDescriptor类的具体用法?Java DatasetDescriptor怎么用?Java DatasetDescriptor使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
DatasetDescriptor类属于com.cloudera.cdk.data包,在下文中一共展示了DatasetDescriptor类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testGeneric
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Round-trips generic Avro records through a Crunch MapReduce pipeline:
 * reads the "in" dataset as a source and appends every record to "out".
 */
@Test
public void testGeneric() throws IOException {
  Dataset<Record> source = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> target = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  // Two writer passes of five users each -> two input files, ten records total.
  writeTestUsers(source, 5, 0);
  writeTestUsers(source, 5, 5);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(source, GenericData.Record.class));
  pipeline.write(records, CrunchDatasets.asTarget(target), Target.WriteMode.APPEND);
  pipeline.run();
  checkTestUsers(target, 10);
}
示例2: testGenericParquet
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Same round-trip as testGeneric, but both datasets are stored in the
 * Parquet format rather than the default Avro data files.
 */
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> source = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> target = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  // Two writer passes of five users each -> two input files, ten records total.
  writeTestUsers(source, 5, 0);
  writeTestUsers(source, 5, 5);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(source, GenericData.Record.class));
  pipeline.write(records, CrunchDatasets.asTarget(target), Target.WriteMode.APPEND);
  pipeline.run();
  checkTestUsers(target, 10);
}
示例3: execute
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Mojo entry point: updates an existing dataset's schema.
 *
 * Loads the current descriptor, copies it into a builder, applies the
 * schema configured via cdk.avroSchemaFile / cdk.avroSchemaReflectClass,
 * and writes the rebuilt descriptor back to the repository.
 */
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
  // Exactly one schema source is required; fail fast if neither is set.
  if (avroSchemaFile == null && avroSchemaReflectClass == null) {
    throw new IllegalArgumentException("One of cdk.avroSchemaFile or " +
        "cdk.avroSchemaReflectClass must be specified");
  }
  DatasetRepository repo = getDatasetRepository();
  DatasetDescriptor current = repo.load(datasetName).getDescriptor();
  // Copy-constructor builder preserves all settings other than the schema.
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder(current);
  configureSchema(builder, avroSchemaFile, avroSchemaReflectClass);
  repo.update(datasetName, builder.build());
}
示例4: execute
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Mojo entry point: creates a new dataset.
 *
 * Builds a descriptor from the configured schema, format, and optional
 * partition expression, then creates the dataset in the repository.
 *
 * @throws MojoExecutionException if the configured format is not a
 *     recognized format name
 */
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
  // Exactly one schema source is required; fail fast if neither is set.
  if (avroSchemaFile == null && avroSchemaReflectClass == null) {
    throw new IllegalArgumentException("One of cdk.avroSchemaFile or " +
        "cdk.avroSchemaReflectClass must be specified");
  }
  DatasetRepository repo = getDatasetRepository();
  DatasetDescriptor.Builder descriptorBuilder = new DatasetDescriptor.Builder();
  configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);
  // Constant-first equals: a null format now reports "Unrecognized format"
  // instead of throwing a NullPointerException.
  if (Formats.AVRO.getName().equals(format)) {
    descriptorBuilder.format(Formats.AVRO);
  } else if (Formats.PARQUET.getName().equals(format)) {
    descriptorBuilder.format(Formats.PARQUET);
  } else {
    throw new MojoExecutionException("Unrecognized format: " + format);
  }
  if (partitionExpression != null) {
    descriptorBuilder.partitionStrategy(Accessor.getDefault().fromExpression(partitionExpression));
  }
  repo.create(datasetName, descriptorBuilder.build());
}
示例5: updateTableSchema
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Writes {@code descriptor}'s Avro schema into the Hive table's properties,
 * using whichever schema property (literal or URL) the table already has.
 *
 * @throws MetadataProviderException if the table stores a schema URL but the
 *     descriptor has none, or if the table has neither schema property
 */
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  if (table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    // Table embeds the schema inline: overwrite it with the full schema JSON.
    table.setProperty(
        AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
        descriptor.getSchema().toString());
    return;
  }
  if (table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    // Table references the schema by URL: the descriptor must supply one.
    if (descriptor.getSchemaUrl() == null) {
      throw new MetadataProviderException("Cannot update " +
          AVRO_SCHEMA_URL_PROPERTY_NAME + " since descriptor schema URL is not set.");
    }
    table.setProperty(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        descriptor.getSchemaUrl().toExternalForm());
    return;
  }
  throw new MetadataProviderException("Cannot update Avro schema since neither " +
      AVRO_SCHEMA_LITERAL_PROPERTY_NAME + " nor " + AVRO_SCHEMA_URL_PROPERTY_NAME +
      " is set.");
}
示例6: update
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Replaces the schema of an existing Hive-backed dataset.
 *
 * @return the descriptor that was applied
 * @throws com.cloudera.cdk.data.NoSuchDatasetException if no table with
 *     the given name exists
 */
@SuppressWarnings("deprecation")
@Override
public DatasetDescriptor update(String name, DatasetDescriptor descriptor) {
  Preconditions.checkArgument(name != null, "Name cannot be null");
  Preconditions.checkArgument(descriptor != null,
      "Descriptor cannot be null");
  if (!exists(name)) {
    throw new com.cloudera.cdk.data.NoSuchDatasetException("Table not found: " + name);
  }
  // Push the new schema into the Hive table metadata and persist it.
  Table hiveTable = hcat.getTable(HiveUtils.DEFAULT_DB, name);
  HiveUtils.updateTableSchema(hiveTable, descriptor);
  hcat.alterTable(hiveTable);
  return descriptor;
}
示例7: testPartitionedSourceAndTarget
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Verifies that a single partition can be used as both a Crunch source and
 * target: only the records hashed into partition 0 are copied.
 */
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  // Both datasets hash-partition on username into two buckets.
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> source = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> target = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  writeTestUsers(source, 10);
  // Read partition 0 only (autoCreate=false); create it on the target side.
  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> sourcePart = source.getPartition(key, false);
  Dataset<Record> targetPart = target.getPartition(key, true);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(sourcePart, GenericData.Record.class));
  pipeline.write(records, CrunchDatasets.asTarget(targetPart), Target.WriteMode.APPEND);
  pipeline.run();
  // Half of the ten users hash into bucket 0.
  Assert.assertEquals(5, datasetSize(targetPart));
}
示例8: newCompositeDataset
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Builds a composite random-access dataset over a single HBase table,
 * resolving each sub-entity's SpecificRecord class from the full name of
 * its Avro schema.
 *
 * @throws DatasetRepositoryException if a schema's class is not on the classpath
 */
@SuppressWarnings("unchecked")
private <E> RandomAccessDataset<E> newCompositeDataset(String name, String tableName,
    List<DatasetDescriptor> descriptors) {
  List<Class<SpecificRecord>> entityClasses =
      new ArrayList<Class<SpecificRecord>>(descriptors.size());
  for (DatasetDescriptor d : descriptors) {
    // The generated SpecificRecord class shares the schema's full name.
    String className = d.getSchema().getFullName();
    try {
      entityClasses.add((Class<SpecificRecord>) Class.forName(className));
    } catch (ClassNotFoundException e) {
      throw new DatasetRepositoryException(e);
    }
  }
  Dao dao = SpecificAvroDao.buildCompositeDaoWithEntityManager(tablePool,
      tableName, entityClasses, schemaManager);
  // NOTE(review): the composite dataset is keyed off the first descriptor.
  return new DaoDataset<E>(name, dao, descriptors.get(0));
}
示例9: beforeClass
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * One-time setup: starts the mini HBase cluster, creates the test dataset
 * from the generic-entity schema resource, and seeds it with ten entities.
 */
@BeforeClass
public static void beforeClass() throws Exception {
  HBaseTestUtils.getMiniCluster();
  // The managed table must not pre-exist; HBaseDatasetRepository creates it.
  HBaseTestUtils.util.deleteTable(Bytes.toBytes(managedTableName));
  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(HBaseTestUtils.getConf()).build();
  String schemaJson = AvroUtils.inputStreamToString(
      HBaseDatasetRepositoryTest.class.getResourceAsStream("/TestGenericEntity.avsc"));
  dataset = repo.create("testtable", new DatasetDescriptor.Builder()
      .schemaLiteral(schemaJson)
      .build());
  // Seed ten generic entities for the tests to read back.
  for (int i = 0; i < 10; i++) {
    dataset.put(HBaseDatasetRepositoryTest.createGenericEntity(i));
  }
}
示例10: newFileWriter
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Creates a writer for a new uniquely-named file under {@code path}, using
 * the format declared by {@code descriptor} (Parquet or Avro).
 *
 * @param fs the FileSystem to write to
 * @param path the directory that will contain the new file; created if absent
 * @param descriptor supplies the format and schema for the writer
 * @return a DatasetWriter for the new file
 * @throws DatasetWriterException if the directory cannot be created
 * @throws UnknownFormatException if the descriptor's format is neither
 *     Parquet nor Avro
 */
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106
public static <E> DatasetWriter<E> newFileWriter(
    FileSystem fs, Path path, DatasetDescriptor descriptor) {
  // ensure the path exists
  try {
    fs.mkdirs(path);
  } catch (IOException ex) {
    // Fixed message: space after the colon so the path reads correctly.
    throw new DatasetWriterException("Could not create path: " + path, ex);
  }
  final Format format = descriptor.getFormat();
  // Reuse the cached format instead of calling descriptor.getFormat() again.
  final Path file = new Path(path, uniqueFilename(format));
  if (Formats.PARQUET.equals(format)) {
    return new ParquetFileSystemDatasetWriter(fs, file, descriptor.getSchema());
  } else if (Formats.AVRO.equals(format)) {
    return new FileSystemDatasetWriter.Builder()
        .fileSystem(fs)
        .path(file)
        .schema(descriptor.getSchema())
        .build();
  } else {
    throw new UnknownFormatException("Unknown format: " + format);
  }
}
示例11: load
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Loads a dataset by name from the metadata provider and wraps it in a
 * filesystem-backed dataset instance.
 *
 * @throws IllegalArgumentException if {@code name} is null
 */
@Override
public <E> Dataset<E> load(String name) {
  Preconditions.checkArgument(name != null, "Name can not be null");
  logger.debug("Loading dataset:{}", name);
  DatasetDescriptor descriptor = metadataProvider.load(name);
  FileSystemDataset<E> dataset = new FileSystemDataset.Builder()
      .name(name)
      .configuration(conf)
      .descriptor(descriptor)
      // Partitioned datasets start from an empty root partition key;
      // unpartitioned ones get none.
      .partitionKey(descriptor.isPartitioned()
          ? com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey()
          : null)
      .build();
  logger.debug("Loaded dataset:{}", dataset);
  return dataset;
}
示例12: ensureExists
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Creates the data directory at {@code descriptor}'s location if it does
 * not already exist.
 *
 * @param descriptor A DatasetDescriptor whose location must be non-null
 * @param conf A Configuration used to resolve the location's FileSystem
 * @throws DatasetRepositoryException if the location cannot be accessed
 */
static void ensureExists(
    DatasetDescriptor descriptor, Configuration conf) {
  Preconditions.checkArgument(descriptor.getLocation() != null,
      "Cannot get FileSystem for a descriptor with no location");
  final Path dataPath = new Path(descriptor.getLocation());
  final FileSystem fs = fsForPath(dataPath, conf);
  try {
    if (fs.exists(dataPath)) {
      return; // already present; nothing to do
    }
    fs.mkdirs(dataPath);
  } catch (IOException ex) {
    throw new DatasetRepositoryException("Cannot access data location", ex);
  }
}
示例13: AbstractRangeView
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Initializes a range view over {@code dataset}.
 *
 * Partitioned datasets get a real marker range plus per-thread storage
 * keys; unpartitioned datasets use the UNDEFINED range, which handles
 * inappropriate calls to range methods.
 */
protected AbstractRangeView(Dataset<E> dataset) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (!descriptor.isPartitioned()) {
    this.range = MarkerRange.UNDEFINED;
    this.keys = null; // not used for unpartitioned datasets
  } else {
    this.range = new MarkerRange(new MarkerComparator(
        descriptor.getPartitionStrategy()));
    // Each thread materializes markers into its own reusable StorageKey.
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  }
}
示例14: testUnknownFormat
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Opening a reader for a descriptor whose format is not a known Format
 * implementation must throw UnknownFormatException.
 */
@Test(expected = UnknownFormatException.class)
public void testUnknownFormat() throws IOException {
  // "explode!" is not a registered format name.
  final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(STRING_SCHEMA)
      .format(Accessor.getDefault().newFormat("explode!"))
      .build();
  MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
      fileSystem, Lists.newArrayList(TEST_FILE), descriptor);
  // open() is expected to throw; close in finally so nothing leaks.
  try {
    reader.open();
  } finally {
    reader.close();
  }
}
示例15: testWriteAndRead
import com.cloudera.cdk.data.DatasetDescriptor; //导入依赖的package包/类
/**
 * Round-trips ten users through an unpartitioned filesystem dataset built
 * directly against the test directory.
 */
@Test
public void testWriteAndRead() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri(USER_SCHEMA_URL)
      .format(format)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> dataset = new FileSystemDataset.Builder()
      .name("test")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .build();
  Assert.assertFalse("Dataset is not partitioned", dataset.getDescriptor()
      .isPartitioned());
  // Write then immediately read back and validate.
  writeTestUsers(dataset, 10);
  checkTestUsers(dataset, 10);
}