This page collects typical usage examples of the Java class com.google.cloud.dataflow.sdk.transforms.DoFn. If you have been wondering what the DoFn class is for and how it is used in practice, the curated examples below may help.
The DoFn class belongs to the com.google.cloud.dataflow.sdk.transforms package. 15 code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
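Before the numbered examples, here is a minimal, self-contained sketch of the pattern they all share: subclass DoFn&lt;InputT, OutputT&gt;, override processElement, and emit output through the ProcessContext. The Greeter class and the names collection are illustrative placeholders, not taken from any example below.

import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.PCollection;

// Hypothetical DoFn: prefixes every input string with a greeting.
static class Greeter extends DoFn<String, String> {
  private static final long serialVersionUID = 1L;
  @Override
  public void processElement(ProcessContext c) throws Exception {
    // Read the input element, transform it, emit zero or more outputs.
    c.output("Hello, " + c.element());
  }
}

// Applying it to a PCollection<String> named 'names':
// PCollection<String> greetings = names.apply(ParDo.of(new Greeter()));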
Example 1: processElement
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
DoFn<KV<String, Wrapper>, KV<String, WorkflowArgs>>.ProcessContext c) throws Exception {
LOG.info("Combining args");
Wrapper value = c.element().getValue();
WorkflowArgs retval = null;
// Iterate in order
for (WorkflowArgs wa : value.map.values()) {
// Modify a copy
if (retval == null) {
retval = new WorkflowArgs(wa);
// Find differences and merge
} else {
retval.gatherArgs(wa);
}
}
c.output(KV.of(c.element().getKey(), retval));
}
Example 2: setupDataInput
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
List<KV<String, TSProto>> data) {
// Build a PCollection from the in-memory test data and assign event timestamps
PCollection<KV<String, TSProto>> tsData =
pipeline.apply("ReadData", Create.of(data))
.apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {
@Override
public void processElement(ProcessContext c) throws Exception {
c.outputWithTimestamp(c.element(),
new DateTime(c.element().getValue().getTime()).toInstant());
}
})).setName("Assign TimeStamps");
return tsData;
}
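The timestamps assigned by outputWithTimestamp above are what downstream windowing keys off. A sketch of the typical next step, assuming fixed one-minute windows (the window size is an assumption, not part of this example):

// Requires com.google.cloud.dataflow.sdk.transforms.windowing.{Window, FixedWindows}
// and org.joda.time.Duration.
PCollection<KV<String, TSProto>> windowed = tsData.apply(
    Window.<KV<String, TSProto>>into(FixedWindows.of(Duration.standardMinutes(1))));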
Example 3: apply
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public PCollection<KV<GATKRead, ReferenceBases>> apply(PCollection<KV<ReferenceBases, Iterable<GATKRead>>> input) {
return input.apply(ParDo.of(new DoFn<KV<ReferenceBases, Iterable<GATKRead>>, KV<GATKRead, ReferenceBases>>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c) throws Exception {
// Each element of the PCollection is a set of reads keyed by a reference shard
// The shard MUST have all of the reference bases for ALL of the reads. If not
// it's an error.
final ReferenceBases shard = c.element().getKey();
final Iterable<GATKRead> reads = c.element().getValue();
// For every read, find the subset of the reference that matches it according to our referenceWindowFunction
for (GATKRead r : reads) {
final ReferenceBases subset = shard.getSubset(referenceWindowFunction.apply(r));
c.output(KV.of(r, subset));
}
}
})).setName("GroupReadWithRefBases");
}
Example 4: flattenShards
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
* Goes from full shards to separated, individual (read,context pairs).
* The advantage is you don't need to know about shards anymore.
* This comes at the expense of copying the reference data instead of being able to have a pointer to in-shard data.
*/
public static DoFn<ContextShard,KV<GATKRead,ReadContextData>> flattenShards(SerializableFunction<GATKRead, SimpleInterval> contextFn) {
return new DoFnWLog<ContextShard, KV<GATKRead, ReadContextData>>("flattenShards") {
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c) throws Exception {
ContextShard shard = c.element();
for (int i=0; i<shard.reads.size(); i++) {
GATKRead read = shard.reads.get(i);
ReadContextData rc = shard.readContext.get(i);
ReadContextData rcd = new ReadContextData( rc.getOverlappingReferenceBases().getSubset(contextFn.apply(read)), rc.getOverlappingVariants());
c.output(KV.of(read,rcd));
}
}
};
}
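Because flattenShards returns a bare DoFn rather than a full PTransform, callers wrap it in ParDo themselves. A sketch, assuming shards is a PCollection&lt;ContextShard&gt; and contextFn is the caller's reference-window function:

// Assumption: 'shards' and 'contextFn' are defined by the surrounding pipeline.
PCollection<KV<GATKRead, ReadContextData>> flattened =
    shards.apply(ParDo.of(flattenShards(contextFn)));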
Example 5: apply
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
* Filter out reads we don't want.
*/
@Override
public PCollection<GATKRead> apply(PCollection<GATKRead> in) {
return in.apply(ParDo
.named(readFilter.getClass().getSimpleName())
.of(new DoFn<GATKRead, GATKRead>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(DoFn<GATKRead,GATKRead>.ProcessContext c) throws Exception {
GATKRead read = c.element();
if (readFilter.test(read)) {
c.output(read);
}
}
}));
}
Example 6: of
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
* Recalibration report on GCS/HDFS -> PCollection of a single BaseRecalOutput.
* The loading is done at the worker.
*
* @param pipeline the pipeline, with authentication information.
* @param GCSFileName the path to the recalibration report. Must start with "gs://"
*/
public static PCollection<BaseRecalOutput> of(final Pipeline pipeline, String GCSFileName) {
return pipeline.apply("calibration report name", Create.of(GCSFileName))
.apply(ParDo.of(new DoFn<String, BaseRecalOutput>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(ProcessContext c) {
final String fname = c.element();
File dest = IOUtils.createTempFile("temp-BaseRecal-", ".tmp");
try {
BucketUtils.copyFile(fname, c.getPipelineOptions(), dest.getPath());
} catch (IOException x) {
throw new GATKException("Unable to download recalibration table from '" + fname + "'.", x);
}
c.output(new BaseRecalOutput(dest));
}
}).named("ingest calibration report"));
}
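Invoking the helper is a one-liner. A sketch, assuming a pipeline with GCS credentials and a hypothetical report path (the bucket and file name below are placeholders):

// Hypothetical path; must start with "gs://" as documented above.
PCollection<BaseRecalOutput> recalOutput = of(pipeline, "gs://my-bucket/recal-report.table");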
Example 7: keyReadsByName
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
* Makes keys for read pairs. To be grouped by in the next step.
*/
static PTransform<PCollection<? extends GATKRead>, PCollection<KV<String, GATKRead>>> keyReadsByName(final PCollectionView<SAMFileHeader> headerPcolView) {
return ParDo
.named("key reads by name")
.withSideInputs(headerPcolView)
.of(new DoFn<GATKRead, KV<String, GATKRead>>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(final ProcessContext context) throws Exception {
final GATKRead record = context.element();
if (ReadUtils.readHasMappedMate(record)) {
final SAMFileHeader h = context.sideInput(headerPcolView);
final String key = ReadsKey.keyForRead(h, record);
final KV<String, GATKRead> kv = KV.of(key, record);
context.output(kv);
}
}
});
}
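As the Javadoc says, these keys exist to be grouped on in the next step. A sketch of that grouping, assuming reads and headerPcolView come from the surrounding pipeline:

// Assumption: 'reads' is a PCollection<GATKRead> from earlier in the pipeline.
PCollection<KV<String, Iterable<GATKRead>>> readsByName =
    reads.apply(keyReadsByName(headerPcolView))
         .apply(GroupByKey.<String, GATKRead>create());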
Example 8: makeKeysForFragments
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
* Groups reads by keys - keys are tuples of (library, contig, position, orientation).
*/
static PTransform<PCollection<? extends GATKRead>, PCollection<KV<String, GATKRead>>> makeKeysForFragments(final PCollectionView<SAMFileHeader> headerPcolView) {
return ParDo
.named("make keys for reads")
.withSideInputs(headerPcolView)
.of(new DoFn<GATKRead, KV<String, GATKRead>>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(final ProcessContext context) throws Exception {
final GATKRead record = context.element().copy();
record.setIsDuplicate(false);
final SAMFileHeader h = context.sideInput(headerPcolView);
final String key = ReadsKey.keyForFragment(h, record);
final KV<String, GATKRead> kv = KV.of(key, record);
context.output(kv);
}
});
}
Example 9: finalizeMetrics
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
private PTransform<PCollection<? extends KV<String, DuplicationMetrics>>, PCollection<KV<String, DuplicationMetrics>>> finalizeMetrics() {
return ParDo
.named("finalize metrics")
.of(new DoFn<KV<String, DuplicationMetrics>, KV<String, DuplicationMetrics>>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement(final ProcessContext context) throws Exception {
DuplicationMetrics metrics = context.element().getValue().copy();
// Divide these by 2 because they are counted for each read
// when they should be counted by pair.
metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;
metrics.calculateDerivedMetrics();
if (metrics.ESTIMATED_LIBRARY_SIZE == null) {
metrics.ESTIMATED_LIBRARY_SIZE = 0L;
}
final KV<String, DuplicationMetrics> kv = KV.of(context.element().getKey(), metrics);
context.output(kv);
}
});
}
Example 10: testGATKReadCoding
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Test(dataProvider = "reads")
public void testGATKReadCoding( final List<GATKRead> reads ) {
// The simplest way to figure out if a class is coded correctly is to create a PCollection
// of that type and see if it matches the List version.
final Pipeline p = GATKTestPipeline.create();
DataflowUtils.registerGATKCoders(p);
// Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer
// a coder properly in the case where the List contains a mix of different GATKRead implementations.
final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder()));
DataflowAssert.that(dataflowReads).containsInAnyOrder(reads);
final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads.apply(ParDo.of(new DoFn<GATKRead, GATKRead>() {
private static final long serialVersionUID = 1L;
@Override
public void processElement( ProcessContext c ) throws Exception {
c.output(c.element());
}
})).setCoder(new GATKReadCoder());
DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads);
p.run();
}
Example 11: window
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
private static <T, W extends BoundedWindow> TransformEvaluator<Window.Bound<T>> window() {
return new TransformEvaluator<Window.Bound<T>>() {
@Override
public void evaluate(Window.Bound<T> transform, EvaluationContext context) {
@SuppressWarnings("unchecked")
JavaRDDLike<WindowedValue<T>, ?> inRDD =
(JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform);
WindowFn<? super T, W> windowFn = WINDOW_FG.get("windowFn", transform);
if (windowFn instanceof GlobalWindows) {
context.setOutputRDD(transform, inRDD);
} else {
@SuppressWarnings("unchecked")
DoFn<T, T> addWindowsDoFn = new AssignWindowsDoFn<>(windowFn);
DoFnFunction<T, T> dofn =
new DoFnFunction<>(addWindowsDoFn, context.getRuntimeContext(), null);
context.setOutputRDD(transform, inRDD.mapPartitions(dofn));
}
}
};
}
Example 12: processElement
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
DoFn<KV<String, List<TSAggValueProto>>, KV<String, TSAggValueProto>>.ProcessContext c)
throws Exception {
for (TSAggValueProto candle : c.element().getValue()) {
c.output(KV.of(c.element().getKey(), candle));
}
}
Example 13: processElement
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
throws Exception {
c.output(KV.of(c.element().getKey(), TSAggValueProto.newBuilder(c.element().getValue())
.setCloseTime(c.window().maxTimestamp().getMillis()).build()));
}
Author: GoogleCloudPlatform | Project: data-timeseries-java | Lines: 10 | Source: EmbedWindowTimeIntoAggregateDoFn.java
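One subtlety: in this SDK, c.window() is only legal inside processElement when the DoFn opts into window access by implementing DoFn.RequiresWindowAccess. A sketch of the class declaration this example implies (the body is the processElement shown above):

class EmbedWindowTimeIntoAggregateDoFn
    extends DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>
    implements DoFn.RequiresWindowAccess {
  // processElement as shown above
}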
Example 14: run
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
public static void run() {
DataflowPipelineOptions options = PipelineOptionsFactory.create()
.as(DataflowPipelineOptions.class);
options.setRunner(BlockingDataflowPipelineRunner.class);
options.setProject("chrome-oven-144308");
options.setFilesToStage(
detectClassPathResourcesToStage(
DataflowPipelineRunner.class.getClassLoader()
)
);
options.setStagingLocation("gs://dataflow-chrome-oven-144308/stagingForScheduledPipeline");
Pipeline p = Pipeline.create(options);
System.out.println("get here 0");
p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
.apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
@Override
public void processElement(ProcessContext c) {
System.out.println("get here 1");
for (String word : c.element().split("[^a-zA-Z']+")) {
if (!word.isEmpty()) {
c.output(word);
}
}
}
}))
.apply(Count.<String>perElement())
.apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Long>, String>() {
@Override
public String apply(KV<String, Long> input) {
System.out.println("get here 3");
return input.getKey() + ": " + input.getValue();
}
}))
.apply(TextIO.Write.to("gs://dataflow-chrome-oven-144308/scheduled"));
p.run();
}
Example 15: processElement
import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(DoFn<Entity, TableRow>.ProcessContext context) throws Exception {
System.out.println("Processing table row");
Entity e = context.element();
TableRow row = entityToRow(new BQSchema().getSchema(), e);
System.out.println("key is" + getKeyString(e));
row.put("_key", getKeyString(e));
context.output(row);
}