Java DoFn Class Code Examples

This article collects typical usage examples of the Java class com.google.cloud.dataflow.sdk.transforms.DoFn. If you are wondering what the DoFn class does, how to use it, or what working code looks like, the curated examples below should help.


The DoFn class belongs to the com.google.cloud.dataflow.sdk.transforms package. Fifteen code examples of the DoFn class are shown below, ordered by popularity.
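
Before diving into the examples, here is a minimal, self-contained sketch of the usual pattern: subclass DoFn, override processElement, and hand the instance to ParDo. It targets the pre-Beam Dataflow SDK 1.x API used throughout this page; the class name and input values are placeholders.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class DoFnSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // A DoFn<InputT, OutputT> maps each input element to zero or more outputs.
    PCollection<Integer> lengths = p
        .apply(Create.of("a", "bb", "ccc"))
        .apply(ParDo.named("ComputeLength").of(new DoFn<String, Integer>() {
          private static final long serialVersionUID = 1L;

          @Override
          public void processElement(ProcessContext c) throws Exception {
            // c.element() is the current input; c.output() emits a result.
            c.output(c.element().length());
          }
        }));

    p.run();
  }
}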

Example 1: processElement

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
    DoFn<KV<String, Wrapper>, KV<String, WorkflowArgs>>.ProcessContext c) throws Exception {

  LOG.info("Combining args");

  Wrapper value = c.element().getValue();
  WorkflowArgs retval = null;

  // Iterate in order
  for (WorkflowArgs wa : value.map.values()) {

    // Modify a copy
    if (retval == null) {
      retval = new WorkflowArgs(wa);
    // Find differences and merge
    } else {
      retval.gatherArgs(wa);
    }
  }
  c.output(KV.of(c.element().getKey(), retval));
}
 
Developer: googlegenomics, Project: dockerflow, Lines: 23, Source: DockerDo.java

Example 2: setupDataInput

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {


  // Build a PCollection from the in-memory test data and attach an event
  // timestamp (taken from each TSProto) to every element.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Developer: GoogleCloudPlatform, Project: data-timeseries-java, Lines: 21, Source: FXTimeSeriesPipelineSRGTests.java
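
Example 2 attaches event timestamps via outputWithTimestamp; the natural next step is to window the timestamped elements before any grouping or aggregation. A short sketch of that follow-up, assuming the tsData collection from above (the one-minute window size is an arbitrary choice):

import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import org.joda.time.Duration;

// Assign each timestamped element to a one-minute fixed window so that
// downstream GroupByKey/Combine steps operate per window.
PCollection<KV<String, TSProto>> windowed =
    tsData.apply("FixedWindows",
        Window.<KV<String, TSProto>>into(FixedWindows.of(Duration.standardMinutes(1))));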

Example 3: apply

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public PCollection<KV<GATKRead, ReferenceBases>> apply(PCollection<KV<ReferenceBases, Iterable<GATKRead>>> input) {
    return input.apply(ParDo.of(new DoFn<KV<ReferenceBases, Iterable<GATKRead>>, KV<GATKRead, ReferenceBases>>() {
        private static final long serialVersionUID = 1L;
        @Override
        public void processElement(ProcessContext c) throws Exception {
            // Each element of the PCollection is a set of reads keyed by a reference shard
            // The shard MUST have all of the reference bases for ALL of the reads. If not
            // it's an error.
            final ReferenceBases shard = c.element().getKey();
            final Iterable<GATKRead> reads = c.element().getValue();
            // For every read, find the subset of the reference that matches it according to our referenceWindowFunction
            for (GATKRead r : reads) {
                final ReferenceBases subset = shard.getSubset(referenceWindowFunction.apply(r));
                c.output(KV.of(r, subset));
            }
        }
    })).setName("GroupReadWithRefBases");
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 20, Source: PairReadWithRefBases.java

Example 4: flattenShards

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
 * Goes from full shards to separate, individual (read, context) pairs.
 * The advantage is you don't need to know about shards anymore.
 * This comes at the expense of copying the reference data instead of being able to have a pointer to in-shard data.
 */
public static DoFn<ContextShard,KV<GATKRead,ReadContextData>> flattenShards(SerializableFunction<GATKRead, SimpleInterval> contextFn) {
    return new DoFnWLog<ContextShard, KV<GATKRead, ReadContextData>>("flattenShards") {
        private static final long serialVersionUID = 1L;
        @Override
        public void processElement(ProcessContext c) throws Exception {
            ContextShard shard = c.element();
            for (int i=0; i<shard.reads.size(); i++) {
                GATKRead read = shard.reads.get(i);
                ReadContextData rc = shard.readContext.get(i);
                ReadContextData rcd = new ReadContextData( rc.getOverlappingReferenceBases().getSubset(contextFn.apply(read)), rc.getOverlappingVariants());
                c.output(KV.of(read,rcd));
            }
        }
    };
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 21, Source: AddContextDataToReadOptimized.java
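
A brief usage sketch for the factory above, wiring the returned DoFn into a ParDo. The shards collection and contextFn function are placeholders for values a caller would already have:

// Hypothetical caller: shards is a PCollection<ContextShard>, and contextFn
// maps each read to the reference interval of context it needs.
PCollection<KV<GATKRead, ReadContextData>> perRead =
    shards.apply(ParDo
        .named("flattenShards")
        .of(AddContextDataToReadOptimized.flattenShards(contextFn)));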

Example 5: apply

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
 * Filter out reads we don't want.
 */
@Override
public PCollection<GATKRead> apply(PCollection<GATKRead> in) {
    return in.apply(ParDo
            .named(readFilter.getClass().getSimpleName())
            .of(new DoFn<GATKRead, GATKRead>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void processElement(DoFn<GATKRead,GATKRead>.ProcessContext c) throws Exception {
                    GATKRead read = c.element();
                    if (readFilter.test(read)) {
                        c.output(read);
                    }
                }
            }));
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 19, Source: DataflowReadFilter.java

Example 6: of

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
 * Recalibration report on GCS/HDFS -> PCollection of a single BaseRecalOutput.
 * The loading is done at the worker.
 *
 * @param pipeline the pipeline, with authentication information.
 * @param GCSFileName the path to the recalibration report. Must start with "gs://"
 */
public static PCollection<BaseRecalOutput> of(final Pipeline pipeline, String GCSFileName) {
    return pipeline.apply("calibration report name", Create.of(GCSFileName))
            .apply(ParDo.of(new DoFn<String, BaseRecalOutput>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void processElement(ProcessContext c) {
                    final String fname = c.element();
                    File dest = IOUtils.createTempFile("temp-BaseRecal-", ".tmp");
                    try {
                        BucketUtils.copyFile(fname, c.getPipelineOptions(), dest.getPath());
                    } catch (IOException x) {
                        throw new GATKException("Unable to download recalibration table from '" + fname + "'.", x);
                    }
                    c.output(new BaseRecalOutput(dest));
                }

            }).named("ingest calibration report"));
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 26, Source: BaseRecalOutputSource.java

Example 7: keyReadsByName

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
 * Makes keys for read pairs, which the next step groups by.
 */
static PTransform<PCollection<? extends GATKRead>, PCollection<KV<String, GATKRead>>> keyReadsByName(final PCollectionView<SAMFileHeader> headerPcolView) {
    return ParDo
            .named("key reads by name")
            .withSideInputs(headerPcolView)
            .of(new DoFn<GATKRead, KV<String, GATKRead>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(final ProcessContext context) throws Exception {
                    final GATKRead record = context.element();
                    if (ReadUtils.readHasMappedMate(record)) {
                        final SAMFileHeader h = context.sideInput(headerPcolView);
                        final String key = ReadsKey.keyForRead(h, record);
                        final KV<String, GATKRead> kv = KV.of(key, record);
                        context.output(kv);
                    }
                }
            });
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 23, Source: MarkDuplicatesDataflowUtils.java
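
The transform above reads the SAM header from a side input. A sketch of how such a PCollectionView is typically produced with View.asSingleton (headerPcol, a single-element PCollection<SAMFileHeader>, is a placeholder):

import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.values.PCollectionView;

// Turn a single-element PCollection into a view usable as a ParDo side input.
PCollectionView<SAMFileHeader> headerPcolView =
    headerPcol.apply(View.<SAMFileHeader>asSingleton());

// Then key the reads with the transform from this example.
PCollection<KV<String, GATKRead>> keyed =
    reads.apply(keyReadsByName(headerPcolView));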

Example 8: makeKeysForFragments

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
/**
 * Groups reads by keys - keys are tuples of (library, contig, position, orientation).
 */
static PTransform<PCollection<? extends GATKRead>, PCollection<KV<String, GATKRead>>> makeKeysForFragments(final PCollectionView<SAMFileHeader> headerPcolView) {
    return ParDo
            .named("make keys for reads")
            .withSideInputs(headerPcolView)
            .of(new DoFn<GATKRead, KV<String, GATKRead>>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void processElement(final ProcessContext context) throws Exception {
                    final GATKRead record = context.element().copy();
                    record.setIsDuplicate(false);
                    final SAMFileHeader h = context.sideInput(headerPcolView);
                    final String key = ReadsKey.keyForFragment(h, record);
                    final KV<String, GATKRead> kv = KV.of(key, record);
                    context.output(kv);
                }
            });
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 21, Source: MarkDuplicatesDataflowUtils.java

Example 9: finalizeMetrics

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
private PTransform<PCollection<? extends KV<String, DuplicationMetrics>>, PCollection<KV<String, DuplicationMetrics>>> finalizeMetrics() {
    return ParDo
        .named("finalize metrics")
        .of(new DoFn<KV<String, DuplicationMetrics>, KV<String, DuplicationMetrics>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void processElement(final ProcessContext context) throws Exception {
                    DuplicationMetrics metrics = context.element().getValue().copy();
                    // Divide these by 2 because they are counted for each read
                    // when they should be counted by pair.
                    metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
                    metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;

                    metrics.calculateDerivedMetrics();
                    if (metrics.ESTIMATED_LIBRARY_SIZE == null) {
                        metrics.ESTIMATED_LIBRARY_SIZE = 0L;
                    }
                    final KV<String, DuplicationMetrics> kv = KV.of(context.element().getKey(), metrics);
                    context.output(kv);
                }
            });
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 24, Source: MarkDuplicatesDataflowUtils.java

Example 10: testGATKReadCoding

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Test(dataProvider = "reads")
public void testGATKReadCoding( final List<GATKRead> reads ) {
    // The simplest way to figure out if a class is coded correctly is to create a PCollection
    // of that type and see if it matches the List version.
    final Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    // Need to explicitly set the coder to GATKReadCoder, otherwise Create fails to infer
    // a coder properly in the case where the List contains a mix of different GATKRead implementations.
    final PCollection<GATKRead> dataflowReads = p.apply(Create.of(reads).withCoder(new GATKReadCoder()));
    DataflowAssert.that(dataflowReads).containsInAnyOrder(reads);

    final PCollection<GATKRead> dataflowReadsAfterTransform = dataflowReads.apply(ParDo.of(new DoFn<GATKRead, GATKRead>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void processElement( ProcessContext c ) throws Exception {
            c.output(c.element());
        }
    })).setCoder(new GATKReadCoder());
    DataflowAssert.that(dataflowReadsAfterTransform).containsInAnyOrder(reads);

    p.run();
}
 
Developer: broadinstitute, Project: gatk-dataflow, Lines: 25, Source: GATKReadCoderUnitTest.java
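
The test depends on DataflowUtils.registerGATKCoders to make GATKRead encodable. Whether that helper does exactly this is an assumption, but registering a custom coder with the SDK's CoderRegistry generally looks like:

// Associate a type with its Coder so the SDK can infer coders for
// PCollections of that type (a sketch; the actual GATK registration may differ).
p.getCoderRegistry().registerCoder(GATKRead.class, GATKReadCoder.class);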

Example 11: window

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
private static <T, W extends BoundedWindow> TransformEvaluator<Window.Bound<T>> window() {
  return new TransformEvaluator<Window.Bound<T>>() {
    @Override
    public void evaluate(Window.Bound<T> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDDLike<WindowedValue<T>, ?> inRDD =
          (JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform);
      WindowFn<? super T, W> windowFn = WINDOW_FG.get("windowFn", transform);
      if (windowFn instanceof GlobalWindows) {
        context.setOutputRDD(transform, inRDD);
      } else {
        @SuppressWarnings("unchecked")
        DoFn<T, T> addWindowsDoFn = new AssignWindowsDoFn<>(windowFn);
        DoFnFunction<T, T> dofn =
                new DoFnFunction<>(addWindowsDoFn, context.getRuntimeContext(), null);
        context.setOutputRDD(transform, inRDD.mapPartitions(dofn));
      }
    }
  };
}
 
Developer: shakamunyi, Project: spark-dataflow, Lines: 21, Source: TransformTranslator.java

Example 12: processElement

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
    DoFn<KV<String, List<TSAggValueProto>>, KV<String, TSAggValueProto>>.ProcessContext c)
    throws Exception {
  for (TSAggValueProto candle : c.element().getValue()) {

    c.output(KV.of(c.element().getKey(), candle));

  }

}
 
Developer: GoogleCloudPlatform, Project: data-timeseries-java, Lines: 12, Source: FlattenKVIterableDoFn.java

Example 13: processElement

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(
    DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
    throws Exception {

  c.output(KV.of(c.element().getKey(), TSAggValueProto.newBuilder(c.element().getValue())
      .setCloseTime(c.window().maxTimestamp().getMillis()).build()));

}
 
Developer: GoogleCloudPlatform, Project: data-timeseries-java, Lines: 10, Source: EmbedWindowTimeIntoAggregateDoFn.java

Example 14: run

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
public static void run() {
  DataflowPipelineOptions options = PipelineOptionsFactory.create()
      .as(DataflowPipelineOptions.class);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  options.setProject("chrome-oven-144308");
  options.setFilesToStage(
      detectClassPathResourcesToStage(
          DataflowPipelineRunner.class.getClassLoader()
      )
  );
  options.setStagingLocation("gs://dataflow-chrome-oven-144308/stagingForScheduledPipeline");

  Pipeline p = Pipeline.create(options);

  System.out.println("get here 0");
  p.apply(TextIO.Read.from("gs://dataflow-samples/shakespeare/*"))
      .apply(ParDo.named("ExtractWords").of(new DoFn<String, String>() {
        @Override
        public void processElement(ProcessContext c) {
          System.out.println("get here 1");
          for (String word : c.element().split("[^a-zA-Z']+")) {
            if (!word.isEmpty()) {
              c.output(word);
            }
          }
        }
      }))
      .apply(Count.<String>perElement())
      .apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Long>, String>() {
        @Override
        public String apply(KV<String, Long> input) {
          System.out.println("get here 3");
          return input.getKey() + ": " + input.getValue();
        }
      }))

      .apply(TextIO.Write.to("gs://dataflow-chrome-oven-144308/scheduled"));

  p.run();
}
 
Developer: viktort, Project: appengine-cron-example, Lines: 41, Source: ScheduledMinimalWordCount.java

Example 15: processElement

import com.google.cloud.dataflow.sdk.transforms.DoFn; // import the required package/class
@Override
public void processElement(DoFn<Entity, TableRow>.ProcessContext context) throws Exception {
  System.out.println("Processing table row");
  Entity e = context.element();

  TableRow row = entityToRow(new BQSchema().getSchema(), e);
  System.out.println("key is" + getKeyString(e));
  row.put("_key", getKeyString(e));
  
  context.output(row);
}
 
Developer: cobookman, Project: DatastoreToGCS, Lines: 12, Source: BQBackup.java


Note: The com.google.cloud.dataflow.sdk.transforms.DoFn class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their original authors, who retain copyright; consult each project's license before using or redistributing the code, and do not republish without permission.