This page collects typical usage examples of the Java class com.google.cloud.dataflow.sdk.transforms.Create. If you are unsure what the Create class does, how to use it, or what it looks like in real code, the curated examples below should help.
The Create class belongs to the com.google.cloud.dataflow.sdk.transforms package. Fifteen code examples of the Create class are shown below, sorted by popularity by default.
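Before the collected examples, here is a minimal, self-contained sketch of the most common Create pattern, materializing an in-memory list as a bounded PCollection. The class name, step name, and element values are illustrative only and are not taken from any of the projects below.
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.PCollection;
import java.util.Arrays;
import java.util.List;

public class CreateQuickStart {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        // Create cannot always infer a coder for the element type, so this sketch
        // (like most of the examples below) sets one explicitly with setCoder(...).
        List<String> values = Arrays.asList("alpha", "beta", "gamma");
        PCollection<String> items =
            p.apply("CreateItems", Create.of(values)).setCoder(StringUtf8Coder.of());

        p.run();
    }
}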
Example 1: loadArtistCreditsByKey
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@org.junit.Test
public void loadArtistCreditsByKey() {
    DirectPipeline p = DirectPipeline.createForTest();
    Long[] artistCreditIds = {634509L, 846332L};
    PCollection<String> text = p.apply(Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of());
    PCollection<KV<Long, MusicBrainzDataObject>> artistCredits =
        MusicBrainzTransforms.loadTableFromText(text, "artist_credit_name", "artist_credit");
    PCollection<Long> artistCreditIdPCollection =
        artistCredits.apply(MapElements.via((KV<Long, MusicBrainzDataObject> kv) -> {
                Long k = kv.getKey();
                return k;
            })
            .withOutputType(new TypeDescriptor<Long>() {
            }));
    DataflowAssert.that(artistCreditIdPCollection).containsInAnyOrder(634509L, 846332L);
}
Author: GoogleCloudPlatform | Project: bigquery-etl-dataflow-sample | Source: MusicBrainzTransformsTest.java
Example 2: loadArtistsWithMapping
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@org.junit.Test
public void loadArtistsWithMapping() {
    DirectPipeline p = DirectPipeline.createForTest();
    PCollection<String> artistText = p.apply("artist", Create.of(artistLinesOfJson)).setCoder(StringUtf8Coder.of());
    Map<String, PCollectionView<Map<Long, String>>> maps = new HashMap<>();
    PCollection<String> areaMapText = p.apply("area", Create.of(areaLinesOfJson)).setCoder(StringUtf8Coder.of());
    PCollectionView<Map<Long, String>> areamap = MusicBrainzTransforms.loadMapFromText(areaMapText, "id", "area");
    maps.put("area", areamap);
    PCollection<KV<Long, MusicBrainzDataObject>> loadedArtists =
        MusicBrainzTransforms.loadTableFromText(artistText, "artist", "id", maps);
    PCollection<String> areas = loadedArtists.apply("areaLabels", MapElements.via((KV<Long, MusicBrainzDataObject> row) -> {
        return (String) row.getValue().getColumnValue("area");
    }).withOutputType(new TypeDescriptor<String>() {
    }));
    DataflowAssert.that(areas).satisfies((areaLabels) -> {
        List<String> theList = new ArrayList<>();
        areaLabels.forEach(theList::add);
        assert (theList.contains("Canada"));
        return null;
    });
}
Author: GoogleCloudPlatform | Project: bigquery-etl-dataflow-sample | Source: MusicBrainzTransformsTest.java
Example 3: setupDataInput
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
        List<KV<String, TSProto>> data) {
    // Create the PCollection from the in-memory test data and assign each element
    // an event timestamp taken from its TSProto time field.
    PCollection<KV<String, TSProto>> tsData =
        pipeline.apply("ReadData", Create.of(data))
            .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {
                @Override
                public void processElement(ProcessContext c) throws Exception {
                    c.outputWithTimestamp(c.element(),
                        new DateTime(c.element().getValue().getTime()).toInstant());
                }
            })).setName("Assign TimeStamps");
    return tsData;
}
Example 4: add
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Takes the variants, groups them into shards of size "bigShardSize" at the client, then
 * runs subdivideAndFillReads then fillContext. See their documentation for more information.
 */
public static PCollection<ContextShard> add(
        Pipeline pipeline,
        List<SimpleInterval> intervalsOfInterest, int bigShardSize, List<Variant> variants,
        String bam, int outputShardSize, int margin, final ReadFilter optFilter,
        final ReferenceMultiSource refSource
) throws IOException {
    List<SimpleInterval> shardedIntervals = IntervalUtils.cutToShards(intervalsOfInterest, bigShardSize);
    ArrayList<AddContextDataToReadOptimized.ContextShard> shards =
        AddContextDataToReadOptimized.fillVariants(shardedIntervals, variants, margin);
    PCollection<AddContextDataToReadOptimized.ContextShard> shardsPCol = pipeline.apply(Create.of(shards));
    return shardsPCol
        // big shards of variants -> smaller shards with variants and reads. We take the opportunity
        // to filter the reads as close to the source as possible.
        .apply(ParDo.named("subdivideAndFillReads").of(AddContextDataToReadOptimized.subdivideAndFillReads(bam, outputShardSize, margin, optFilter)))
        // add ref bases to the shards.
        .apply(ParDo.named("fillContext").of(AddContextDataToReadOptimized.fillContext(refSource)));
}
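As a usage illustration only, a hypothetical call site for the method above is sketched below; the shard sizes, the BAM path, and the variables holding the pipeline, intervals, variants, read filter, and reference source are placeholders rather than values from the original project.
// Hypothetical call site; all argument values below are illustrative placeholders.
PCollection<AddContextDataToReadOptimized.ContextShard> contextShards =
    AddContextDataToReadOptimized.add(
        pipeline,                         // an already-constructed Dataflow Pipeline
        intervalsOfInterest,              // List<SimpleInterval> covering the region to process
        1_000_000,                        // bigShardSize: coarse shards cut on the client
        variants,                         // List<Variant> already loaded in memory
        "gs://example-bucket/sample.bam", // placeholder BAM location
        5_000,                            // outputShardSize for the finer worker-side shards
        1_000,                            // margin in bases around each shard
        readFilter,                       // ReadFilter applied as reads are loaded
        refSource);                       // ReferenceMultiSource used by fillContext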
Example 5: writeToFile
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Takes a few Reads and writes them to a BAM file.
 * The Reads don't have to be sorted initially; the BAM file will be.
 * All the reads must fit into a single worker's memory, so this won't go well if you have too many.
 *
 * @param pipeline the pipeline to add this operation to.
 * @param reads    the reads to write (they don't need to be sorted).
 * @param header   the header that corresponds to the reads.
 * @param destPath the GCS or local path to write to (must start with "gs://" if writing to GCS).
 * @param parquet  whether to write out BAM or Parquet data (BDG AlignmentRecords); only applies when writing to Hadoop.
 */
public static void writeToFile(
        Pipeline pipeline, PCollection<GATKRead> reads, final SAMFileHeader header, final String destPath,
        final boolean parquet) {
    if (BucketUtils.isHadoopUrl(destPath) ||
            pipeline.getRunner().getClass().equals(SparkPipelineRunner.class)) {
        writeToHadoop(pipeline, reads, header, destPath, parquet);
    } else {
        PCollectionView<Iterable<GATKRead>> iterableView =
            reads.apply(View.<GATKRead>asIterable());
        PCollection<String> dummy = pipeline.apply("output file name", Create.<String>of(destPath));
        dummy.apply(ParDo.named("save to BAM file")
            .withSideInputs(iterableView)
            .of(new SaveToBAMFile(header, iterableView))
        );
    }
}
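For context, a hypothetical call to this helper is sketched below, assuming the caller lives in the same writer class; the reads variable, header, destination path, and parquet flag are placeholders.
// Hypothetical caller, assumed to be in the same class as writeToFile;
// the destination path and parquet flag are placeholders.
writeToFile(pipeline, alignedReads, header, "gs://example-bucket/output/reads.bam", false);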
Example 6: of
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
/**
 * Recalibration report on GCS/HDFS -> PCollection of a single BaseRecalOutput.
 * The loading is done at the worker.
 *
 * @param pipeline    the pipeline, with authentication information.
 * @param GCSFileName the path to the recalibration report. Must start with "gs://".
 */
public static PCollection<BaseRecalOutput> of(final Pipeline pipeline, String GCSFileName) {
    return pipeline.apply("calibration report name", Create.of(GCSFileName))
        .apply(ParDo.of(new DoFn<String, BaseRecalOutput>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void processElement(ProcessContext c) {
                final String fname = c.element();
                File dest = IOUtils.createTempFile("temp-BaseRecal-", ".tmp");
                try {
                    BucketUtils.copyFile(fname, c.getPipelineOptions(), dest.getPath());
                } catch (IOException x) {
                    throw new GATKException("Unable to download recalibration table from '" + fname + "'.", x);
                }
                c.output(new BaseRecalOutput(dest));
            }
        }).named("ingest calibration report"));
}
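A hypothetical usage of this factory method follows, assuming the caller sits in the same source class; the report path is a placeholder and, per the javadoc above, must start with "gs://".
// Hypothetical caller in the same class; the report path is a placeholder.
PCollection<BaseRecalOutput> recalInfo = of(pipeline, "gs://example-bucket/recal/report.grp");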
Example 7: setupPipeline
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Override
protected void setupPipeline(Pipeline pipeline) {
    if (readArguments.getReadFilesNames().size() > 1) {
        throw new UserException("Sorry, we only support a single input file for now.");
    }
    final String filename = readArguments.getReadFilesNames().get(0);
    final ReadsDataflowSource readsSource = new ReadsDataflowSource(filename, pipeline);
    final SAMFileHeader header = readsSource.getHeader();
    final PCollectionView<SAMFileHeader> headerView = pipeline.apply(Create.of(header)).apply(View.asSingleton());
    final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary();
    final List<SimpleInterval> intervals = intervalArgumentCollection.intervalsSpecified()
        ? intervalArgumentCollection.getIntervals(sequenceDictionary)
        : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final PCollectionView<BaseRecalOutput> recalInfoSingletonView =
        BaseRecalOutputSource.loadFileOrRemote(pipeline, BQSR_RECAL_FILE_NAME).apply(View.asSingleton());
    final PCollection<GATKRead> output = readsSource.getReadPCollection(intervals, ValidationStringency.SILENT, false)
        .apply(new ApplyBQSRTransform(headerView, recalInfoSingletonView, bqsrOpts));
    intermediateRemoteBam = OUTPUT;
    if (needsIntermediateCopy()) {
        // The user specified remote execution and provided a local file name, so we have to stage
        // the results in remote storage as a go-between. Note that this may require more permissions.
        intermediateRemoteBam = BucketUtils.randomRemotePath(stagingLocation, "temp-applyBqsr-output-", ".bam");
        logger.info("Staging results at " + intermediateRemoteBam);
    }
    SmallBamWriter.writeToFile(pipeline, output, header, intermediateRemoteBam);
}
Example 8: transformExampleTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "refAndInts")
public void transformExampleTest(Integer[] integers, Set<KV<ReferenceShard, Integer>> s) {
    Pipeline p = GATKTestPipeline.create();
    p.getCoderRegistry().registerCoder(ReferenceShard.class, ReferenceShard.CODER);
    List<Integer> ints = Lists.newArrayList(integers);
    PCollection<Integer> pInts = p.apply(Create.of(ints));
    // Note that we needed to make the Transform a full class (not anonymous),
    // so you can't test a PTransform by writing it within a test.
    PCollection<KV<ReferenceShard, Integer>> keyedInts =
        pInts.apply(new TransformExample());
    List<KV<ReferenceShard, Integer>> e = Lists.newArrayList(s);
    DataflowAssert.that(keyedInts).containsInAnyOrder(e);
    p.run();
}
Example 9: fullRemoveDupesTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "dupedPairedReadsAndVariants")
public void fullRemoveDupesTest(List<KV<GATKRead, Variant>> dupes, List<KV<UUID, UUID>> kvUUIDUUID,
        List<KV<UUID, Iterable<UUID>>> kvUUIDiUUID, List<KV<UUID, Iterable<Variant>>> kvUUIDiVariant,
        List<KV<GATKRead, Iterable<Variant>>> finalExpected) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    PCollection<KV<GATKRead, Variant>> pKVs = DataflowTestUtils.pCollectionCreateAndVerify(p, dupes,
        KvCoder.of(new GATKReadCoder(), new VariantCoder()));
    PCollection<KV<GATKRead, Iterable<Variant>>> result = pKVs.apply(new RemoveDuplicateReadVariantPairs());
    PCollection<KV<GATKRead, Iterable<Variant>>> pFinalExpected =
        p.apply(Create.of(finalExpected).withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder()))));
    DataflowTestUtils.keyIterableValueMatcher(result, pFinalExpected);
    p.run();
}
Example 10: refBasesTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void refBasesTest(List<KV<ReferenceShard, Iterable<GATKRead>>> kvRefShardiReads,
        List<SimpleInterval> intervals, List<KV<ReferenceBases, Iterable<GATKRead>>> kvRefBasesiReads) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(intervals, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput =
        p.apply("pInput.Create", Create.of(kvRefShardiReads).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> kvpCollection = RefBasesFromAPI.getBasesForShard(pInput, mockSource);
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> pkvRefBasesiReads =
        p.apply("pkvRefBasesiReads.Create", Create.of(kvRefBasesiReads).withCoder(KvCoder.of(SerializableCoder.of(ReferenceBases.class), IterableCoder.of(new GATKReadCoder()))));
    DataflowTestUtils.keyIterableValueMatcher(kvpCollection, pkvRefBasesiReads);
    p.run();
}
Example 11: noReadsTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void noReadsTest(List<KV<ReferenceShard, Iterable<GATKRead>>> kvRefShardiReads,
        List<SimpleInterval> intervals, List<KV<ReferenceBases, Iterable<GATKRead>>> kvRefBasesiReads) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(intervals, ReferenceWindowFunctions.IDENTITY_FUNCTION);
    List<KV<ReferenceShard, Iterable<GATKRead>>> noReads = Arrays.asList(
        KV.of(new ReferenceShard(0, "1"), Lists.newArrayList()));
    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput =
        p.apply(Create.of(noReads).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));
    // We expect an exception to be thrown if there is a problem. If no error is thrown, it's fine.
    RefBasesFromAPI.getBasesForShard(pInput, mockSource);
    p.run();
}
Example 12: testRefBasesFromAPIWithCustomWindowFunction
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "RefBasesFromAPIWithCustomWindowFunctionTestData")
public void testRefBasesFromAPIWithCustomWindowFunction( final KV<ReferenceShard, Iterable<GATKRead>> inputShard,
        final SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction,
        final SimpleInterval expectedReferenceInterval ) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    // Assuming everything works properly, we only need a mock response to the expected reference interval,
    // since that is what should end up being queried in ReferenceDataflowSource. If we later try to query an
    // unexpected interval, we'll detect it via an Assert.
    final ReferenceMultiSource mockSource = createMockReferenceDataflowSource(Arrays.asList(expectedReferenceInterval), referenceWindowFunction);
    PCollection<KV<ReferenceShard, Iterable<GATKRead>>> pInput =
        p.apply("pInput.Create", Create.of(inputShard).withCoder(KvCoder.of(ReferenceShard.CODER, IterableCoder.of(new GATKReadCoder()))));
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> actualResult = RefBasesFromAPI.getBasesForShard(pInput, mockSource);
    DataflowAssert.that(actualResult).satisfies((Iterable<KV<ReferenceBases, Iterable<GATKRead>>> input) -> {
        for (KV<ReferenceBases, Iterable<GATKRead>> kvPair : input) {
            Assert.assertNotNull(kvPair.getKey(), "Null ReferenceBases in KV pair indicates that reference query in ReferenceDataflowSource used an unexpected/incorrect interval (mock ReferenceDataflowSource returned null)");
            Assert.assertEquals(kvPair.getKey().getInterval(), expectedReferenceInterval, "Wrong interval for ReferenceBases object after applying RefBasesFromAPI");
        }
        return null;
    });
    p.run();
}
Example 13: testPairReadsWithRefBasesWithCustomWindowFunction
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "PairReadsWithRefBasesWithCustomWindowFunctionTestData")
public void testPairReadsWithRefBasesWithCustomWindowFunction( final KV<ReferenceBases, Iterable<GATKRead>> input,
        final SerializableFunction<GATKRead, SimpleInterval> referenceWindowFunction,
        final byte[] expectedBases, final SimpleInterval expectedInterval ) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    PCollection<KV<ReferenceBases, Iterable<GATKRead>>> pInput = p.apply(Create.of(input)
        .withCoder(KvCoder.of(SerializableCoder.of(ReferenceBases.class), IterableCoder.of(new GATKReadCoder()))));
    PCollection<KV<GATKRead, ReferenceBases>> result = pInput.apply(new PairReadWithRefBases(referenceWindowFunction));
    DataflowAssert.that(result).satisfies((Iterable<KV<GATKRead, ReferenceBases>> resultElements) -> {
        for ( KV<GATKRead, ReferenceBases> kvPair : resultElements ) {
            Assert.assertNotNull(kvPair.getKey(), "Null read in transform result");
            Assert.assertNotNull(kvPair.getValue(), "Null ReferenceBases object paired with read");
            Assert.assertEquals(kvPair.getValue().getBases(), expectedBases, "Wrong bases in ReferenceBases object paired with read");
            Assert.assertEquals(kvPair.getValue().getInterval(), expectedInterval, "Wrong interval in ReferenceBases object paired with read");
        }
        return null;
    });
    p.run();
}
Example 14: addContextDataTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void addContextDataTest(List<GATKRead> reads, List<Variant> variantList,
        List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData,
        List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder());
    PCollection<KV<GATKRead, ReferenceBases>> pReadRef = DataflowTestUtils.pCollectionCreateAndVerify(p, kvReadRefBases,
        KvCoder.of(new GATKReadCoder(), SerializableCoder.of(ReferenceBases.class)));
    PCollection<KV<GATKRead, Iterable<Variant>>> pReadVariants =
        p.apply(Create.of(kvReadiVariant).withCoder(KvCoder.of(new GATKReadCoder(), IterableCoder.of(new VariantCoder()))));
    PCollection<KV<GATKRead, ReadContextData>> joinedResults = AddContextDataToRead.join(pReads, pReadRef, pReadVariants);
    PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData =
        p.apply(Create.of(kvReadContextData).withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(joinedResults, pkvReadContextData);
    p.run();
}
Example 15: fullTest
import com.google.cloud.dataflow.sdk.transforms.Create; // import the required package/class
@Test(dataProvider = "bases")
public void fullTest(List<GATKRead> reads, List<Variant> variantList,
        List<KV<GATKRead, ReferenceBases>> kvReadRefBases, List<KV<GATKRead, ReadContextData>> kvReadContextData,
        List<SimpleInterval> intervals, List<KV<GATKRead, Iterable<Variant>>> kvReadiVariant) throws IOException {
    Pipeline p = GATKTestPipeline.create();
    DataflowUtils.registerGATKCoders(p);
    PCollection<GATKRead> pReads = DataflowTestUtils.pCollectionCreateAndVerify(p, reads, new GATKReadCoder());
    PCollection<Variant> pVariant = p.apply(Create.of(variantList));
    VariantsDataflowSource mockVariantsSource = mock(VariantsDataflowSource.class);
    when(mockVariantsSource.getAllVariants()).thenReturn(pVariant);
    ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
    for (SimpleInterval i : intervals) {
        when(mockSource.getReferenceBases(any(PipelineOptions.class), eq(i))).thenReturn(FakeReferenceSource.bases(i));
    }
    when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
    PCollection<KV<GATKRead, ReadContextData>> result = AddContextDataToRead.add(pReads, mockSource, mockVariantsSource);
    PCollection<KV<GATKRead, ReadContextData>> pkvReadContextData =
        p.apply(Create.of(kvReadContextData).withCoder(KvCoder.of(new GATKReadCoder(), new ReadContextDataCoder())));
    DataflowTestUtils.keyReadContextDataMatcher(result, pkvReadContextData);
    p.run();
}