This article collects typical usage examples of the Java class org.apache.crunch.Pipeline. If you are unsure what the Pipeline class does or how to use it, the curated examples below may help.
The Pipeline class belongs to the org.apache.crunch package. Fifteen code examples are shown below, ordered by popularity by default.
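Before the examples, here is a minimal sketch of the basic Pipeline life cycle (create, read, write, run). It is not taken from the examples below; MyPipelineApp is a hypothetical class name and the input/output paths passed via args are placeholders.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.hadoop.conf.Configuration;

public class MyPipelineApp {
  public static void main(String[] args) {
    // Create a MapReduce-backed pipeline anchored to this class and the default configuration.
    Pipeline pipeline = new MRPipeline(MyPipelineApp.class, new Configuration());
    // Reads are lazy; no job runs until run() or done() is called.
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Write the collection back out as text.
    pipeline.writeTextFile(lines, args[1]);
    // done() executes any outstanding jobs and cleans up temporary output.
    PipelineResult result = pipeline.done();
    System.exit(result.succeeded() ? 0 : 1);
  }
}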
Example 1: testGeneric
import org.apache.crunch.Pipeline; // import the required package/class
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example 2: testGenericParquet
import org.apache.crunch.Pipeline; // import the required package/class
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example 3: loadKeyedRecords
import org.apache.crunch.Pipeline; // import the required package/class
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
    loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
        Set<String> samples)
    throws IOException {
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> adamRecords =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);
  // The data are now loaded into ADAM variant objects; convert to keyed SpecificRecords
  ADAMToKeyedSpecificRecordFn converter =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> specificPType = Avros.specifics(converter.getSpecificRecordType());
  return adamRecords.parallelDo("Convert to keyed SpecificRecords",
      converter, Avros.tableOf(KEY_PTYPE, specificPType));
}
Example 4: readVariants
import org.apache.crunch.Pipeline; // import the required package/class
private static PCollection<Pair<Variant, Collection<Genotype>>>
    readVariants(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, String sampleGroup) throws IOException {
  PCollection<Pair<Variant, Collection<Genotype>>> adamRecords;
  if (inputFormat.equals("VCF")) {
    TableSource<LongWritable, VariantContextWritable> vcfSource =
        From.formattedFile(
            inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    PType<Pair<Variant, Collection<Genotype>>> adamPType =
        Avros.pairs(Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
            Avros.collections(Avros.specifics(Genotype.class)));
    adamRecords =
        vcfRecords.parallelDo("VCF to ADAM Variant", new VCFToADAMVariantFn(), adamPType);
  } else if (inputFormat.equals("AVRO")) {
    throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
  } else if (inputFormat.equals("PARQUET")) {
    throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
  } else {
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }
  return adamRecords;
}
Example 5: testPartitionedSourceAndTarget
import org.apache.crunch.Pipeline; // import the required package/class
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> inputPart0 = inputDataset.getPartition(key, false);
  Dataset<Record> outputPart0 = outputDataset.getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example 6: startMapReduce
import org.apache.crunch.Pipeline; // import the required package/class
@Override
public void startMapReduce(String taskName) {
  String outputDirectory = propertyConfig.getProperty("sqoop.task." + taskName + ".toJobConfig.outputDirectory");
  String hadoopAddress = propertyConfig.getProperty("sqoop.task." + taskName + ".tolink.linkConfig.uri");
  Pipeline pipeline = new MRPipeline(CrunchMapReduce.class, new Configuration());
  // The snippet ends here: the pipeline and the Parquet source/target class are set up,
  // but no read or write is wired in yet.
  Class<AvroParquetFileSourceTarget> avroParquetFileSourceTargetClass = AvroParquetFileSourceTarget.class;
}
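As a rough sketch only, the job might continue by reading Parquet records from the source URI and writing them to the output directory with the same AvroParquetFileSourceTarget class the snippet references. The schema variable and the way the two property values are used as paths are assumptions, not part of the original code.

// Hypothetical continuation of startMapReduce; 'schema' is an assumed Avro Schema
// describing the records, and both paths are placeholders derived from the properties above.
PCollection<GenericData.Record> records = pipeline.read(
    new AvroParquetFileSourceTarget<GenericData.Record>(new Path(hadoopAddress), Avros.generics(schema)));
pipeline.write(records,
    new AvroParquetFileSourceTarget<GenericData.Record>(new Path(outputDirectory), Avros.generics(schema)));
pipeline.done();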
Example 7: run
import org.apache.crunch.Pipeline; // import the required package/class
public int run(String[] args) throws Exception {
  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath); //<2>
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo( //<3>
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(),
          Avros.pairs(Avros.longs(), Avros.ints())));
  fooTable = fooTable.filter(new FooFilter(fooValMax)); //<4>

  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy = //<5>
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
      .join(fooTable, barTable, JoinType.INNER_JOIN);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));

  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
Example 8: loadKeyedRecords
import org.apache.crunch.Pipeline; // import the required package/class
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
    loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
        Set<String> samples)
    throws IOException {
  PCollection<Variant> variants = readVariants(inputFormat, inputPath,
      conf, pipeline, sampleGroup);
  GA4GHToKeyedSpecificRecordFn converter =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> specificPType = Avros.specifics(converter
      .getSpecificRecordType());
  return variants.parallelDo("Convert to keyed SpecificRecords",
      converter, Avros.tableOf(KEY_PTYPE, specificPType));
}
Example 9: readVariants
import org.apache.crunch.Pipeline; // import the required package/class
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  PCollection<Variant> variants;
  if (inputFormat.equals("VCF")) {
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> vcfSource =
        From.formattedFile(
            inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    variants = vcfRecords.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  } else if (inputFormat.equals("AVRO")) {
    variants = pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  } else if (inputFormat.equals("PARQUET")) {
    @SuppressWarnings("unchecked")
    Source<Variant> source =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    variants = pipeline.read(source);
  } else {
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }
  return variants;
}
Example 10: loadPartitionedVariants
import org.apache.crunch.Pipeline; // import the required package/class
/**
 * Load and partition variants.
 * key = (contig, pos, sample_group); value = Variant/Call Avro object
 * @param inputFormat the format of the input data (VCF, AVRO, or PARQUET)
 * @param inputPath the input data path
 * @param conf the Hadoop configuration
 * @param pipeline the Crunch pipeline
 * @param variantsOnly whether to ignore samples and only load variants
 * @param flatten whether to flatten the data types
 * @param sampleGroup an identifier for the group of samples being loaded
 * @param samples the samples to include
 * @param redistribute whether to repartition the data by locus/sample group
 * @param segmentSize the number of base pairs in each segment partition
 * @param numReducers the number of reducers to use
 * @return the keyed variant or call records
 * @throws IOException if an I/O error is encountered during loading
 */
public PTable<String, SpecificRecord> loadPartitionedVariants(
    String inputFormat, Path inputPath, Configuration conf,
    Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
    Set<String> samples, boolean redistribute, long segmentSize, int numReducers)
    throws IOException {
  PTable<Tuple3<String, Long, String>, SpecificRecord> locusSampleKeyedRecords =
      loadKeyedRecords(inputFormat, inputPath, conf, pipeline, variantsOnly, flatten,
          sampleGroup, samples);
  // execute a DISTRIBUTE BY operation if requested
  PTable<Tuple3<String, Long, String>, SpecificRecord> sortedRecords;
  if (redistribute) {
    // partitionKey(chr, chrSeg, sampleGroup), Pair(secondaryKey/pos, originalDatum)
    PTableType<Tuple3<String, Long, String>,
        Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> reKeyedPType =
        Avros.tableOf(Avros.triples(Avros.strings(), Avros.longs(), Avros.strings()),
            Avros.pairs(Avros.longs(),
                Avros.pairs(locusSampleKeyedRecords.getKeyType(),
                    locusSampleKeyedRecords.getValueType())));
    PTable<Tuple3<String, Long, String>,
        Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> reKeyed =
        locusSampleKeyedRecords.parallelDo("Re-keying for redistribution",
            new ReKeyDistributeByFn(segmentSize), reKeyedPType);
    // repartition and sort by pos
    sortedRecords = SecondarySort.sortAndApply(
        reKeyed, new UnKeyForDistributeByFn(),
        locusSampleKeyedRecords.getPTableType(), numReducers);
  } else {
    // input data assumed to be already globally sorted
    sortedRecords = locusSampleKeyedRecords;
  }
  // generate the partition keys
  return sortedRecords.mapKeys("Generate partition keys",
      new LocusSampleToPartitionFn(segmentSize, sampleGroup), Avros.strings());
}
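For reference, here is a hedged illustration of how this method might be invoked; every argument value below (paths, sample group, segment size, reducer count) is a placeholder assumption, and Example 13 shows an actual call site driven by command-line options.

// Hypothetical invocation of loadPartitionedVariants; all literal values are placeholders.
Pipeline pipeline = new MRPipeline(GA4GHVariantsLoader.class, new Configuration());
VariantsLoader loader = new GA4GHVariantsLoader();
PTable<String, SpecificRecord> keyed = loader.loadPartitionedVariants(
    "VCF",                            // inputFormat
    new Path("/data/variants.vcf"),   // inputPath (placeholder)
    pipeline.getConfiguration(),      // conf
    pipeline,
    false,                            // variantsOnly
    false,                            // flatten
    "sample_group_1",                 // sampleGroup (placeholder)
    null,                             // samples (null set, as in Example 13 when none are given)
    true,                             // redistribute
    1000000L,                         // segmentSize
    10);                              // numReducers
pipeline.write(keyed, new AvroParquetPathPerKeyTarget(new Path("/output")), Target.WriteMode.APPEND);
pipeline.done();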
Example 11: inputPairs
import org.apache.crunch.Pipeline; // import the required package/class
protected final <V extends RealVector> PCollection<Pair<Integer, V>> inputPairs(
    Pipeline p,
    String inputKey,
    PType<V> ptype) {
  // Read (index, vector) pairs from the Avro input registered under inputKey.
  PType<Pair<Integer, V>> inputType = Avros.pairs(Avros.ints(), ptype);
  return p.read(avroInput(inputKey, inputType));
}
Example 12: run
import org.apache.crunch.Pipeline; // import the required package/class
public void run() {
  // TODO: Switch to parameterized views.
  View<ExampleEvent> view = Datasets.load(ScheduledReportApp.EXAMPLE_DS_URI,
      ExampleEvent.class);
  RefinableView<GenericRecord> target = Datasets.load(ScheduledReportApp.REPORT_DS_URI,
      GenericRecord.class);

  // Get the view into which this report will be written.
  DateTime dateTime = getNominalTime().toDateTime(DateTimeZone.UTC);
  View<GenericRecord> output = target
      .with("year", dateTime.getYear())
      .with("month", dateTime.getMonthOfYear())
      .with("day", dateTime.getDayOfMonth())
      .with("hour", dateTime.getHourOfDay())
      .with("minute", dateTime.getMinuteOfHour());

  Pipeline pipeline = getPipeline();
  PCollection<ExampleEvent> events = pipeline.read(CrunchDatasets.asSource(view));
  PTable<Long, ExampleEvent> eventsByUser = events.by(new GetEventId(), Avros.longs());

  // Count of events by user ID.
  PTable<Long, Long> userEventCounts = eventsByUser.keys().count();
  PCollection<GenericData.Record> report = userEventCounts.parallelDo(
      new ToUserReport(),
      Avros.generics(SCHEMA));

  pipeline.write(report, CrunchDatasets.asTarget(output));
  pipeline.run();
}
Example 13: run
import org.apache.crunch.Pipeline; // import the required package/class
@Override
public int run(String[] args) throws Exception {
  JCommander jc = new JCommander(this);
  try {
    jc.parse(args);
  } catch (ParameterException e) {
    jc.usage();
    return 1;
  }
  if (paths == null || paths.size() != 2) {
    jc.usage();
    return 1;
  }
  String inputPathString = paths.get(0);
  String outputPathString = paths.get(1);

  Configuration conf = getConf();
  Path inputPath = new Path(inputPathString);
  Path outputPath = new Path(outputPathString);
  outputPath = outputPath.getFileSystem(conf).makeQualified(outputPath);

  Pipeline pipeline = new MRPipeline(getClass(), conf);

  VariantsLoader variantsLoader;
  if (dataModel.equals("GA4GH")) {
    variantsLoader = new GA4GHVariantsLoader();
  } else if (dataModel.equals("ADAM")) {
    variantsLoader = new ADAMVariantsLoader();
  } else {
    jc.usage();
    return 1;
  }

  Set<String> sampleSet = samples == null ? null :
      Sets.newLinkedHashSet(Splitter.on(',').split(samples));

  PTable<String, SpecificRecord> partitionKeyedRecords =
      variantsLoader.loadPartitionedVariants(inputFormat, inputPath, conf, pipeline,
          variantsOnly, flatten, sampleGroup, sampleSet, redistribute, segmentSize,
          numReducers);

  if (FileUtils.sampleGroupExists(outputPath, conf, sampleGroup)) {
    if (overwrite) {
      FileUtils.deleteSampleGroup(outputPath, conf, sampleGroup);
    } else {
      LOG.error("Sample group already exists: " + sampleGroup);
      return 1;
    }
  }

  pipeline.write(partitionKeyedRecords, new AvroParquetPathPerKeyTarget(outputPath),
      Target.WriteMode.APPEND);
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
Example 14: inputVectors
import org.apache.crunch.Pipeline; // import the required package/class
protected final <V extends RealVector> PCollection<V> inputVectors(Pipeline p, String inputKey, PType<V> ptype) {
  // Reuse inputPairs (Example 11) and keep only the vector values.
  return PTables.asPTable(inputPairs(p, inputKey, ptype)).values();
}
Example 15: run
import org.apache.crunch.Pipeline; // import the required package/class
@Override
public int run(String[] args) throws Exception {
  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our Crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from Avro files in inputFolder
  PCollection<ByteBuffer> records =
      pipeline.read(From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input
  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  } else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}