Java Create Class Code Examples

This article collects typical usage examples of the Java class com.google.cloud.dataflow.sdk.transforms.Create. If you are wondering what the Create class is for, how to use it, or what real code that uses it looks like, the curated class examples below should help.


The Create class belongs to the com.google.cloud.dataflow.sdk.transforms package. Fifteen code examples of the class are presented below, sorted by popularity by default.
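
Before the examples, here is a minimal, self-contained sketch of what Create does. It assumes the Dataflow SDK 1.x API; the element values are purely illustrative, and the default direct runner is used:

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.PCollection;

import java.util.Arrays;

public class CreateMinimalExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // Create.of turns an in-memory collection into a PCollection; setCoder is
    // needed whenever the SDK cannot infer a coder for the element type.
    PCollection<String> lines =
        p.apply(Create.of(Arrays.asList("apple", "banana", "cherry")))
            .setCoder(StringUtf8Coder.of());
    p.run();
  }
}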

Example 1: loadArtistCreditsByKey

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@org.junit.Test
public void loadArtistCreditsByKey() {
  DirectPipeline p = DirectPipeline.createForTest();
  Long[] artistCreditIds = {634509L, 846332L};
  PCollection<String> text = p.apply(Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of());
  PCollection<KV<Long, MusicBrainzDataObject>> artistCredits = MusicBrainzTransforms.loadTableFromText(text, "artist_credit_name", "artist_credit");
  // Extract just the keys so we can verify which artist_credit ids were loaded.
  PCollection<Long> artistCreditIdPCollection =
      artistCredits.apply(MapElements.via((KV<Long, MusicBrainzDataObject> kv) -> kv.getKey())
              // Java 8 lambdas lose their generic types to erasure, so the
              // output type has to be declared explicitly.
              .withOutputType(new TypeDescriptor<Long>() {
              })
      );
  DataflowAssert.that(artistCreditIdPCollection).containsInAnyOrder(artistCreditIds);
  // Run the pipeline so the assertion above is actually evaluated.
  p.run();
}
 
Developer: GoogleCloudPlatform | Project: bigquery-etl-dataflow-sample | Lines: 17 | Source: MusicBrainzTransformsTest.java
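
The .withOutputType(new TypeDescriptor<Long>() {}) call in Example 1 is required because Java 8 lambdas erase their generic types. A SimpleFunction subclass carries its type information, so the same key extraction can be written without it; a sketch assuming the Dataflow SDK 1.x MapElements API:

PCollection<Long> ids = artistCredits.apply(
    MapElements.via(new SimpleFunction<KV<Long, MusicBrainzDataObject>, Long>() {
      @Override
      public Long apply(KV<Long, MusicBrainzDataObject> kv) {
        // The output type Long is inferred from the SimpleFunction signature.
        return kv.getKey();
      }
    }));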

Example 2: loadArtistsWithMapping

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@org.junit.Test
public void loadArtistsWithMapping() {

  DirectPipeline p = DirectPipeline.createForTest();

  PCollection<String> artistText = p.apply("artist", Create.of(artistLinesOfJson)).setCoder(StringUtf8Coder.of());
  Map<String, PCollectionView<Map<Long, String>>> maps = new HashMap<>();
  PCollection<String> areaMapText = p.apply("area", Create.of(areaLinesOfJson)).setCoder(StringUtf8Coder.of());
  // Load the area lookup table as a side-input map keyed by id.
  PCollectionView<Map<Long, String>> areamap = MusicBrainzTransforms.loadMapFromText(areaMapText, "id", "area");
  maps.put("area", areamap);
  PCollection<KV<Long, MusicBrainzDataObject>> loadedArtists = MusicBrainzTransforms.loadTableFromText(artistText, "artist", "id", maps);

  PCollection<String> areas = loadedArtists.apply("areaLabels", MapElements.via((KV<Long, MusicBrainzDataObject> row) -> {
    return (String) row.getValue().getColumnValue("area");
  }).withOutputType(new TypeDescriptor<String>() {
  }));

  DataflowAssert.that(areas).satisfies((areaLabels) -> {
    List<String> theList = new ArrayList<>();
    areaLabels.forEach(theList::add);
    // A bare "assert" only fires when the JVM runs with -ea; Assert.assertTrue
    // would fail unconditionally and is the more robust choice in a test.
    assert (theList.contains("Canada"));
    return null;
  });

  // Run the pipeline so the satisfies() check above actually executes.
  p.run();
}
 
Developer: GoogleCloudPlatform | Project: bigquery-etl-dataflow-sample | Lines: 27 | Source: MusicBrainzTransformsTest.java
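
loadMapFromText returns a PCollectionView, which downstream steps read as a side input rather than a main input. A sketch of how such a view is typically consumed inside a ParDo, assuming the Dataflow SDK 1.x side-input API and assuming, for illustration, artists loaded without the mapping so the "area" column still holds the numeric id:

PCollection<String> areaNames = loadedArtists.apply(
    ParDo.withSideInputs(areamap).of(
        new DoFn<KV<Long, MusicBrainzDataObject>, String>() {
          @Override
          public void processElement(ProcessContext c) {
            // The side-input view materializes as an in-memory Map on each worker.
            Map<Long, String> areaLookup = c.sideInput(areamap);
            Long areaId = (Long) c.element().getValue().getColumnValue("area");
            c.output(areaLookup.get(areaId));
          }
        }));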

Example 3: setupDataInput

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {


  // Stamp each element with the event time carried in its TSProto value, so
  // downstream event-time windowing sees realistic timestamps.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Developer: GoogleCloudPlatform | Project: data-timeseries-java | Lines: 21 | Source: FXTimeSeriesPipelineSRGTests.java
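
Because each element now carries an event-time timestamp, the returned PCollection can be windowed downstream. A sketch of a fixed-window step, assuming the Dataflow SDK 1.x windowing API (the one-minute width is an illustrative choice):

PCollection<KV<String, TSProto>> windowed = tsData.apply(
    "FixedWindows",
    // One-minute event-time windows based on the timestamps assigned above.
    Window.<KV<String, TSProto>>into(FixedWindows.of(Duration.standardMinutes(1))));

Here Window and FixedWindows come from com.google.cloud.dataflow.sdk.transforms.windowing, and Duration from org.joda.time.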

Example 4: add

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Takes the variants, groups them into shards of size "bigShardSize" at the client, then
 * runs subdivideAndFillReads then fillContext. See their documentation for more information.
 */
public static PCollection<ContextShard> add(
        Pipeline pipeline,
        List<SimpleInterval> intervalsOfInterest, int bigShardSize, List<Variant> variants,
        String bam, int outputShardSize, int margin, final ReadFilter optFilter,
        final ReferenceMultiSource refSource
) throws IOException {

    List<SimpleInterval> shardedIntervals = IntervalUtils.cutToShards(intervalsOfInterest, bigShardSize);
    ArrayList<AddContextDataToReadOptimized.ContextShard> shards = AddContextDataToReadOptimized.fillVariants(shardedIntervals, variants, margin);
    PCollection<AddContextDataToReadOptimized.ContextShard> shardsPCol = pipeline.apply(Create.of(shards));
    return shardsPCol
            // big shards of variants -> smaller shards with variants, reads. We take the opportunity to filter the reads as close to the source as possible.
            .apply(ParDo.named("subdivideAndFillReads").of(AddContextDataToReadOptimized.subdivideAndFillReads(bam, outputShardSize, margin, optFilter)))
                    // add ref bases to the shards.
            .apply(ParDo.named("fillContext").of(AddContextDataToReadOptimized.fillContext(refSource)));

}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 22 | Source: AddContextDataToReadOptimized.java
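
A sketch of how add might be invoked. Every argument value here is hypothetical (path, shard sizes, margin), and the pass-through filter assumes GATK's ReadFilterLibrary.ALLOW_ALL_READS is available; any ReadFilter would do:

List<SimpleInterval> intervalsOfInterest =
    Arrays.asList(new SimpleInterval("1", 1, 1000000));
PCollection<AddContextDataToReadOptimized.ContextShard> shards =
    AddContextDataToReadOptimized.add(
        pipeline,
        intervalsOfInterest,
        1000000,                           // bigShardSize: 1Mb shards cut at the client
        variants,                          // List<Variant> already loaded in the driver
        "gs://my-bucket/sample.bam",       // hypothetical BAM location
        5000,                              // outputShardSize for the subdivided shards
        1000,                              // margin of extra bases around each shard
        ReadFilterLibrary.ALLOW_ALL_READS, // optFilter: keep every read
        refSource);                        // ReferenceMultiSource set up elsewhere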

Example 5: writeToFile

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Takes a few reads and writes them to a BAM file.
 * The reads don't have to be sorted initially; the BAM file will be.
 * All the reads must fit into a single worker's memory, so this won't go well if you have too many.
 *
 * @param pipeline the pipeline to add this operation to.
 * @param reads  the reads to write (they don't need to be sorted).
 * @param header the header that corresponds to the reads.
 * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
 * @param parquet whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies when writing to Hadoop
 */
public static void writeToFile(
        Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath,
        final boolean parquet) {
    if ( BucketUtils.isHadoopUrl(destPath) ||
            pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
        writeToHadoop(pipeline, reads, header, destPath, parquet);
    } else {
        PCollectionView<Iterable<GATKRead>> iterableView =
                reads.apply(View.<GATKRead>asIterable());

        PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));

        dummy.apply(ParDo.named("save to BAM file")
                        .withSideInputs(iterableView)
                        .of(new SaveToBAMFile(header, iterableView))
        );
    }
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 30 | Source: SmallBamWriter.java
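
A sketch of invoking writeToFile at the end of a pipeline; the output path is a hypothetical GCS location, and reads/header are assumed to come from earlier pipeline steps:

// reads:  a PCollection<GATKRead> produced earlier in the pipeline
// header: the SAMFileHeader that matches those reads
SmallBamWriter.writeToFile(pipeline, reads, header, "gs://my-bucket/output.bam", false);
pipeline.run();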

Example 6: of

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Recalibration report on GCS/HDFS -> PCollection of a single BaseRecalOutput.
 * The loading is done at the worker.
 *
 * @param pipeline the pipeline, with authentication information.
 * @param GCSFileName the path to the recalibration report. Must start with "gs://"
 */
public static PCollection<BaseRecalOutput> of(final Pipeline pipeline, String GCSFileName) {
    return pipeline.apply("calibration report name", Create.of(GCSFileName))
            .apply(ParDo.of(new DoFn<String, BaseRecalOutput>() {
                private static final long serialVersionUID = 1L;
                @Override
                public void processElement(ProcessContext c) {
                    final String fname = c.element();
                    File dest = IOUtils.createTempFile("temp-BaseRecal-", ".tmp");
                    try {
                        BucketUtils.copyFile(fname, c.getPipelineOptions(), dest.getPath());
                    } catch (IOException x) {
                        throw new GATKException("Unable to download recalibration table from '" + fname + "'.", x);
                    }
                    c.output(new BaseRecalOutput(dest));
                }

            }).named("ingest calibration report"));
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 26 | Source: BaseRecalOutputSource.java
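
Since the resulting PCollection holds exactly one element by construction, callers typically collapse it into a singleton side-input view, as Example 7 below does. A sketch with a hypothetical report path, assuming the Dataflow SDK 1.x View API:

PCollection<BaseRecalOutput> recal =
    BaseRecalOutputSource.of(pipeline, "gs://my-bucket/recal-report.grp");
// asSingleton() requires exactly one element, which holds here by construction.
PCollectionView<BaseRecalOutput> recalView =
    recal.apply(View.<BaseRecalOutput>asSingleton());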

Example 7: setupPipeline

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Override
protected void setupPipeline(Pipeline pipeline) {
    if (readArguments.getReadFilesNames().size()>1) {
        throw new UserException("Sorry, we only support a single input file for now.");
    }
    final String filename = readArguments.getReadFilesNames().get(0);
    final ReadsDataflowSource readsSource = new ReadsDataflowSource(filename, pipeline);
    final SAMFileHeader header = readsSource.getHeader();
    final PCollectionView<SAMFileHeader> headerView = pipeline.apply(Create.of(header)).apply(View.asSingleton());
    final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary();
    final List<SimpleInterval> intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(sequenceDictionary)
            : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final PCollectionView<BaseRecalOutput> recalInfoSingletonView = BaseRecalOutputSource.loadFileOrRemote(pipeline, BQSR_RECAL_FILE_NAME).apply(View.asSingleton());
    final PCollection<GATKRead> output = readsSource.getReadPCollection(intervals, ValidationStringency.SILENT, false)
            .apply(new ApplyBQSRTransform(headerView, recalInfoSingletonView, bqsrOpts));
    intermediateRemoteBam = OUTPUT;
    if (needsIntermediateCopy()) {
        // The user specified remote execution and provided a local file name. So we're going to have to save to remote storage as a go-between.
        // Note that this may require more permissions
        intermediateRemoteBam = BucketUtils.randomRemotePath(stagingLocation, "temp-applyBqsr-output-", ".bam");
        logger.info("Staging results at " + intermediateRemoteBam);
    }
    SmallBamWriter.writeToFile(pipeline, output, header, intermediateRemoteBam);
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 25 | Source: ApplyBQSRDataflow.java

Example 8: transformExampleTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "refAndInts")
public void transformExampleTest(Integer[] integers, Set<KV<ReferenceShard, Integer>> s) {
    Pipeline p = GATKTestPipeline.create();
    p.getCoderRegistry().registerCoder(ReferenceShard.class, ReferenceShard.CODER);

    List<Integer> ints = Lists.newArrayList(integers);
    PCollection<Integer> pInts = p.apply(Create.of(ints));

    // Note that the Transform needs to be a full (named) class, not an anonymous one,
    // so you can't test a PTransform by defining it inline within the test.
    PCollection<KV<ReferenceShard, Integer>> keyedInts =
            pInts.apply(new TransformExample());
    List<KV<ReferenceShard, Integer>> e = Lists.newArrayList(s);
    DataflowAssert.that(keyedInts).containsInAnyOrder(e);

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 18 | Source: SampleUnitTest.java
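
The named-class requirement matters because an anonymous PTransform written inline would capture the enclosing test instance, which generally is not serializable. A minimal named transform shaped like the TransformExample used above; the keying logic is a placeholder, not the project's actual implementation:

static class TransformExample
    extends PTransform<PCollection<Integer>, PCollection<KV<ReferenceShard, Integer>>> {
  private static final long serialVersionUID = 1L;

  @Override
  public PCollection<KV<ReferenceShard, Integer>> apply(PCollection<Integer> input) {
    return input.apply(ParDo.of(new DoFn<Integer, KV<ReferenceShard, Integer>>() {
      private static final long serialVersionUID = 1L;

      @Override
      public void processElement(ProcessContext c) {
        // Placeholder keying: pin every element to one shard.
        c.output(KV.of(new ReferenceShard(0, "1"), c.element()));
      }
    }));
  }
}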

Example 9: fullRemoveDupesTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "dupedPairedReadsAndVariants")
public void fullRemoveDupesTest(List<KV<GATKRead, Variant>> dupes, List<KV<UUID, UUID>> kvUUIDUUID,
                                        List<KV<UUID, Iterable<UUID>>> kvUUIDiUUID, List<KV<UUID, Iterable<Variant>>> kvUUIDiVariant,
                                        List<KV<GATKRead, Iterable<Variant>>> finalExpected) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    PCollection<KV<GATKRead, Variant>> pKVs = DataflowTestUtils.pCollectionCreateAndVerify(p, dupes,
            KvCoder.of(new GATKReadCoder(), new VariantCoder()));

    PCollection<KV<GATKRead, Iterable<Variant>>> result = pKVs.apply(new RemoveDuplicateReadVariantPairs());
    PCollection<KV<GATKRead, Iterable<Variant>>> pFinalExpected = p.apply(Create.of(finalExpected).withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder()))));
    DataflowTestUtils.keyIterableValueMatcher(result, pFinalExpected);

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 17 | Source: RemoveDuplicateReadVariantPairsUnitTest.java

Example 10: refBasesTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void refBasesTest(List<KV<ReferenceShard, Iterable<GATKRead>>> kvRefShardiReads,
                         List<SimpleInterval> intervals, List<KV<ReferenceBases, Iterable<GATKRead>>> kvRefBasesiReads) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(intervals, ReferenceWindowFunctions.IDENTITY_FUNCTION);

    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput = p.apply("pInput.Create", Create.of(kvRefShardiReads).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));

    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> kvpCollection = RefBasesFromAPI.getBasesForShard(pInput, mockSource);
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> pkvRefBasesiReads = p.apply("pkvRefBasesiReads.Create", Create.of(kvRefBasesiReads).withCoder(KvCoder.of(SerializableCoder.of(ReferenceBases.class), IterableCoder.of(new GATKReadCoder()))));
    DataflowTestUtils.keyIterableValueMatcher(kvpCollection, pkvRefBasesiReads);

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 17 | Source: RefBasesFromAPIUnitTest.java

Example 11: noReadsTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void noReadsTest(List<KV<ReferenceShard, Iterable<GATKRead>>> kvRefShardiReads,
                         List<SimpleInterval> intervals, List<KV<ReferenceBases, Iterable<GATKRead>>> kvRefBasesiReads) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(intervals, ReferenceWindowFunctions.IDENTITY_FUNCTION);

    List<KV<ReferenceShard, Iterable<GATKRead>>> noReads = Arrays.asList(
            KV.of(new ReferenceShard(0, "1"), Lists.newArrayList()));

    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput = p.apply(Create.of(noReads).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));

    // We expect an exception to be thrown if there is a problem. If no error is thrown, it's fine.
    RefBasesFromAPI.getBasesForShard(pInput, mockSource);

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 19 | Source: RefBasesFromAPIUnitTest.java

Example 12: testRefBasesFromAPIWithCustomWindowFunction

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "RefBasesFromAPIWithCustomWindowFunctionTestData")
public void testRefBasesFromAPIWithCustomWindowFunction( final KV<ReferenceShard, Iterable<GATKRead>> inputShard, final SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction, final SimpleInterval expectedReferenceInterval ) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    // Assuming everything works properly, we only need a mock response to the expected reference interval,
    // since that is what should end up being queried in ReferenceDataflowSource. If we later try to query an
    // unexpected interval, we'll detect it via an Assert.
    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(Arrays.asList(expectedReferenceInterval), referenceWindowFunction);

    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput = p.apply("pInput.Create", Create.of(inputShard).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> actualResult = RefBasesFromAPI.getBasesForShard(pInput, mockSource);

    DataflowAssert.that(actualResult).satisfies((Iterable<KV<ReferenceBases, Iterable<GATKRead>>> input) -> {
        for (KV<ReferenceBases, Iterable<GATKRead>> kvPair : input) {
            Assert.assertNotNull(kvPair.getKey(), "Null ReferenceBases in KV pair indicates that reference query in ReferenceDataflowSource used an unexpected/incorrect interval (mock ReferenceDataflowSource returned null)");
            Assert.assertEquals(kvPair.getKey().getInterval(), expectedReferenceInterval, "Wrong interval for ReferenceBases object after applying RefBasesFromAPI");
        }
        return null;
    });

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 24 | Source: RefBasesFromAPIUnitTest.java

Example 13: testPairReadsWithRefBasesWithCustomWindowFunction

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "PairReadsWithRefBasesWithCustomWindowFunctionTestData")
public void testPairReadsWithRefBasesWithCustomWindowFunction( final KV<ReferenceBases, Iterable<GATKRead>> input, final SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction, final byte[] expectedBases, final SimpleInterval expectedInterval ) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> pInput = p.apply(Create.of(input)
            .withCoder(KvCoder.of(SerializableCoder.of(ReferenceBases.class), IterableCoder.of(new GATKReadCoder()))));

    PCollection<KV<GATKRead, ReferenceBases>> result = pInput.apply(new PairReadWithRefBases(referenceWindowFunction));

    DataflowAssert.that(result).satisfies((Iterable<KV<GATKRead, ReferenceBases>> resultElements) -> {
        for ( KV<GATKRead, ReferenceBases> kvPair : resultElements ) {
            Assert.assertNotNull(kvPair.getKey(), "Null read in transform result");
            Assert.assertNotNull(kvPair.getValue(), "Null ReferenceBases object paired with read");
            Assert.assertEquals(kvPair.getValue().getBases(), expectedBases, "Wrong bases in ReferenceBases object paired with read");
            Assert.assertEquals(kvPair.getValue().getInterval(), expectedInterval, "Wrong interval in ReferenceBases object paired with read");
        }
        return null;
    });

    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 23 | Source: PairReadsWithRefBasesUnitTest.java

Example 14: addContextDataTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void addContextDataTest(List<GATKRead> reads, List<Variant> variantList,
                               List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData,
                               List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder());
    PCollection<KV<GATKRead, ReferenceBases>> pReadRef = DataflowTestUtils.pCollectionCreateAndVerify(p, kvReadRefBases,
            KvCoder.of(new GATKReadCoder(), SerializableCoder.of(ReferenceBases.class)));

    PCollection<KV<GATKRead, Iterable<Variant>>> pReadVariants =
            p.apply(Create.of(kvReadiVariant).withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder()))));

    PCollection<KV<GATKRead, ReadContextData>> joinedResults = AddContextDataToRead.join(pReads, pReadRef, pReadVariants);
    PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData = p.apply(Create.of(kvReadContextData).withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(joinedResults, pkvReadContextData);
    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 20 | Source: AddContextDataToReadUnitTest.java

Example 15: fullTest

import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void fullTest(List<GATKRead> reads, List<Variant> variantList,
                  List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData,
                  List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);

    PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder());

    PCollection<Variant> pVariant = p.apply(Create.of(variantList));
    VariantsDataflowSource mockVariantsSource = mock(VariantsDataflowSource.class);

    when(mockVariantsSource.getAllVariants()).thenReturn(pVariant);

    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (SimpleInterval i : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);

    PCollection<KV<GATKRead, ReadContextData>> result = AddContextDataToRead.add(pReads, mockSource, mockVariantsSource);
    PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData = p.apply(Create.of(kvReadContextData).withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(result, pkvReadContextData);
    p.run();
}
 
Developer: broadinstitute | Project: gatk-dataflow | Lines: 26 | Source: AddContextDataToReadUnitTest.java


Note: The com.google.cloud.dataflow.sdk.transforms.Create examples in this article were compiled from open-source projects hosted on GitHub and similar platforms. The code snippets remain the copyright of their original authors; consult the corresponding project's license before reusing or redistributing them.