This article collects typical usage examples of the Java method org.apache.beam.sdk.Pipeline.apply. If you have been wondering what Pipeline.apply does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the enclosing class, org.apache.beam.sdk.Pipeline.
The following shows 15 code examples of the Pipeline.apply method, sorted by popularity by default.
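Before the project examples, here is a minimal, self-contained sketch (not drawn from any of the projects below; the class name and transform names are illustrative only) of how Pipeline.apply is typically used: the first apply attaches a root transform such as Create or a source read to the Pipeline and returns a PCollection, and every later step is applied to that PCollection rather than to the Pipeline itself.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class PipelineApplySketch {

  public static void main(String[] args) {
    // Runner and other options come from the command line; with no runner set,
    // Beam falls back to the direct runner (if it is on the classpath).
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline p = Pipeline.create(options);

    // Pipeline.apply attaches a root transform to the pipeline and returns its output PCollection.
    PCollection<String> words = p.apply("Create words", Create.of("foo", "bar", "foo"));

    // Later steps are applied to PCollections rather than to the Pipeline itself.
    PCollection<Long> total = words.apply("Count all elements", Count.<String>globally());

    // In a real pipeline 'total' would feed further transforms or a sink.
    p.run().waitUntilFinish();
  }
}
The String argument is the transform's name in the pipeline graph; most of the examples below use this overload so that each step shows up with a readable name.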
Example 1: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  PCollection<UntypedOccurrence> verbatimRecords = p.apply(
      "Read Avro", AvroIO.read(UntypedOccurrence.class).from("demo/output/data*"));
  verbatimRecords.apply("Write file per Genus",
      AvroIO.write(UntypedOccurrence.class)
          .to("demo/output-split/data*") // a prefix is required here, but is overwritten by the dynamic destinations below
          .to(new GenusDynamicAvroDestinations(
              FileSystems.matchNewResource("demo/output-split/data*", true))));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 2: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  conf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
  conf.setStrings("mapreduce.input.fileinputformat.inputdir", "hdfs://ha-nn/tmp/dwca-lep5.zip");
  conf.setClass("key.class", Text.class, Object.class);
  conf.setClass("value.class", ExtendedRecord.class, Object.class);
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);
  PCollection<KV<Text, ExtendedRecord>> rawRecords =
      p.apply("Read DwC-A", HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(conf));
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert to Avro", ParDo.of(fromExtendedRecordKVP()));
  verbatimRecords.apply(
      "Write Avro files", AvroIO.write(UntypedOccurrence.class).to("hdfs://ha-nn/tmp/dwca-lep5.avro"));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 3: testCheckingForSuccessWhenPAssertFails
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Tests that when we just see a tentative failure for a {@link PAssert} it is considered a
 * conclusive failure.
 */
@Test
public void testCheckingForSuccessWhenPAssertFails() throws Exception {
  DataflowPipelineJob job =
      spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(
          buildJobMetrics(generateMockMetrics(false /* success */, true /* tentative */)));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  doReturn(State.DONE).when(job).getState();
  assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.of(false)));
}
Example 4: testCheckingForSuccessSkipsNonTentativeMetrics
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testCheckingForSuccessSkipsNonTentativeMetrics() throws Exception {
  DataflowPipelineJob job = spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(
          buildJobMetrics(generateMockMetrics(true /* success */, false /* tentative */)));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  runner.updatePAssertCount(p);
  doReturn(State.RUNNING).when(job).getState();
  assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.<Boolean>absent()));
}
Example 5: cacheCandidatesUpdaterTest
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void cacheCandidatesUpdaterTest() throws Exception {
  SparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setRunner(TestSparkRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));
  // first read
  pCollection.apply(Count.<String>globally());
  // second read
  // as we access the same PCollection two times, the Spark runner does optimization and so
  // will cache the RDD representing this PCollection
  pCollection.apply(Count.<String>globally());
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
}
Example 6: testEquals
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testEquals() {
  Pipeline p = TestPipeline.create();
  PCollection<String> first = p.apply("Meta", Create.of("foo", "bar"));
  PCollection<String> second = p.apply("Pythonic", Create.of("spam, ham"));
  PCollection<String> third = p.apply("Syntactic", Create.of("eggs", "baz"));
  EqualsTester tester = new EqualsTester();
  // tester.addEqualityGroup(PCollectionList.empty(p), PCollectionList.empty(p));
  // tester.addEqualityGroup(PCollectionList.of(first).and(second));
  // Constructors should all produce equivalent lists
  tester.addEqualityGroup(
      PCollectionList.of(first).and(second).and(third),
      PCollectionList.of(first).and(second).and(third),
      // PCollectionList.<String>empty(p).and(first).and(second).and(third),
      // PCollectionList.of(ImmutableList.of(first, second, third)),
      // PCollectionList.of(first).and(ImmutableList.of(second, third)),
      PCollectionList.of(ImmutableList.of(first, second)).and(third));
  // Order is considered
  tester.addEqualityGroup(PCollectionList.of(first).and(third).and(second));
  tester.addEqualityGroup(PCollectionList.empty(TestPipeline.create()));
  tester.testEquals();
}
Example 7: testWithInvalidContext
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
private void testWithInvalidContext(JavaSparkContext jsc) {
  SparkContextOptions options = getSparkContextOptions(jsc);
  Pipeline p = Pipeline.create(options);
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  inputWords.apply(new WordCount.CountWords())
      .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  try {
    p.run().waitUntilFinish();
    fail("Should throw an exception when The provided Spark context is null or stopped");
  } catch (RuntimeException e) {
    assert(e.getMessage().contains(PROVIDED_CONTEXT_EXCEPTION));
  }
}
Example 8: testParDoChaining
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testParDoChaining() throws Exception {
  Pipeline p = Pipeline.create();
  long numElements = 1000;
  PCollection<Long> input = p.apply(GenerateSequence.from(0).to(numElements));
  PAssert.thatSingleton(input.apply("Count", Count.<Long>globally())).isEqualTo(numElements);
  ApexPipelineOptions options = PipelineOptionsFactory.as(ApexPipelineOptions.class);
  DAG dag = TestApexRunner.translate(p, options);
  String[] expectedThreadLocal = { "/CreateActual/FilterActuals/Window.Assign" };
  Set<String> actualThreadLocal = Sets.newHashSet();
  for (DAG.StreamMeta sm : dag.getAllStreamsMeta()) {
    DAG.OutputPortMeta opm = sm.getSource();
    if (sm.getLocality() == Locality.THREAD_LOCAL) {
      String name = opm.getOperatorMeta().getName();
      String prefix = "PAssert$";
      if (name.startsWith(prefix)) {
        // remove the non-deterministic prefix
        name = name.substring(prefix.length() + 1);
      }
      actualThreadLocal.add(name);
    }
  }
  Assert.assertThat(actualThreadLocal, Matchers.hasItems(expectedThreadLocal));
}
Example 9: testStreamingOnSuccessMatcherWhenPipelineFails
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Tests that when a streaming pipeline terminates in the FAILED state, the {@link
 * TestPipelineOptions#setOnSuccessMatcher(SerializableMatcher) on success matcher} is not
 * invoked.
 */
@Test
public void testStreamingOnSuccessMatcherWhenPipelineFails() throws Exception {
  options.setStreaming(true);
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  final DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.FAILED);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");
  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  options.as(TestPipelineOptions.class).setOnSuccessMatcher(new TestFailureMatcher());
  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenReturn(State.FAILED);
  expectedException.expect(RuntimeException.class);
  runner.run(p, mockRunner);
  // If the onSuccessMatcher were invoked, it would have crashed here with AssertionError
}
Example 10: testAssertionFailure
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testAssertionFailure() throws Exception {
  ApexPipelineOptions options = PipelineOptionsFactory.create()
      .as(ApexPipelineOptions.class);
  options.setRunner(TestApexRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<Integer> pcollection = pipeline
      .apply(Create.of(1, 2, 3, 4));
  PAssert.that(pcollection).containsInAnyOrder(2, 1, 4, 3, 7);
  Throwable exc = runExpectingAssertionFailure(pipeline);
  Pattern expectedPattern = Pattern.compile(
      "Expected: iterable over \\[((<4>|<7>|<3>|<2>|<1>)(, )?){5}\\] in any order");
  // A loose pattern, but should get the job done.
  assertTrue(
      "Expected error message from PAssert with substring matching "
          + expectedPattern
          + " but the message was \""
          + exc.getMessage()
          + "\"",
      expectedPattern.matcher(exc.getMessage()).find());
}
Example 11: filterAlreadyProcessedDocuments
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Removes exact duplicates and already processed documents (looked up in BigQuery and passed in
 * as a side input) from the content to be indexed, folding the removed items into the
 * not-to-index collection.
 *
 * @param contentToIndexNotSkipped content that is a candidate for indexing
 * @param contentNotToIndexSkipped content that has already been marked as skipped
 * @param pipeline the pipeline to attach the side input read to
 * @param options used to decide whether the already-processed-documents query should run
 * @return the content split into items to index and items not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
    PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
    Pipeline pipeline, IndexerPipelineOptions options) {
  PCollection<KV<String, Long>> alreadyProcessedDocs = null;
  if (!options.getWriteTruncate()) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
    alreadyProcessedDocs = pipeline
        .apply("Get already processed Documents", BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetDocumentHashFn()));
  } else {
    Map<String, Long> map = new HashMap<String, Long>();
    alreadyProcessedDocs = pipeline
        .apply("Create empty side input of Docs",
            Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())));
  }
  final PCollectionView<Map<String, Long>> alreadyProcessedDocsSideInput =
      alreadyProcessedDocs.apply(View.<String, Long>asMap());
  PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
      .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
      .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
      .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
          .withSideInputs(alreadyProcessedDocsSideInput)
          .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
              TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection
  PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
  PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
  // Merge the sets of items that are dupes or skipped
  PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
  ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
  return content;
}
Example 12: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  // Read the DwC-A using our custom reader
  PCollection<ExtendedRecord> rawRecords = p.apply(
      "Read from Darwin Core Archive", DwCAIO.Read.withPaths("demo/dwca.zip", "demo/target/tmp"));
  // Convert the ExtendedRecord into an UntypedOccurrence record
  DoFn<ExtendedRecord, UntypedOccurrence> fn = BeamFunctions.beamify(FunctionFactory.untypedOccurrenceBuilder());
  // TODO: Explore the generics as to why the coder registry does not find it and we need to set the coder explicitly
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert the objects into untyped DwC style records", ParDo.of(fn))
      .setCoder(AvroCoder.of(UntypedOccurrence.class));
  // Write the result as an Avro file
  verbatimRecords.apply(
      "Save the records as Avro", AvroIO.write(UntypedOccurrence.class).to("demo/output/data"));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 13: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrenceLowerCase.class, TypedOccurrence.class, ExtendedRecord.class);
  // Read Avro files
  PCollection<UntypedOccurrenceLowerCase> verbatimRecords = p.apply(
      "Read Avro files", AvroIO.read(UntypedOccurrenceLowerCase.class).from(SOURCE_PATH));
  // Convert the objects (interpretation)
  PCollection<TypedOccurrence> interpreted = verbatimRecords.apply(
      "Interpret occurrence records", ParDo.of(BeamFunctions.beamify(FunctionFactory.interpretOccurrenceLowerCase())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Do the nub lookup
  PCollection<TypedOccurrence> matched = interpreted.apply(
      "Align to backbone using species/match", ParDo.of(
          BeamFunctions.beamify(FunctionFactory.gbifSpeciesMatch())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Write the file to SOLR
  final SolrIO.ConnectionConfiguration conn = SolrIO.ConnectionConfiguration
      .create(SOLR_HOST);
  PCollection<SolrInputDocument> inputDocs = matched.apply(
      "Convert to SOLR", ParDo.of(new SolrDocBuilder()));
  inputDocs.apply(SolrIO.write().to("beam-demo1").withConnectionConfiguration(conn));
  // instruct the writer to use a provided document ID
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 14: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrenceLowerCase.class, TypedOccurrence.class, ExtendedRecord.class);
  // Read Avro files
  PCollection<UntypedOccurrenceLowerCase> verbatimRecords = p.apply(
      "Read Avro files", AvroIO.read(UntypedOccurrenceLowerCase.class).from(SOURCE_PATH));
  // Convert the objects (interpretation)
  PCollection<TypedOccurrence> interpreted = verbatimRecords.apply(
      "Interpret occurrence records", ParDo.of(BeamFunctions.beamify(FunctionFactory.interpretOccurrenceLowerCase())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Do the nub lookup
  PCollection<TypedOccurrence> matched = interpreted.apply(
      "Align to backbone using species/match", ParDo.of(
          BeamFunctions.beamify(FunctionFactory.gbifSpeciesMatch())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Convert to JSON
  PCollection<String> json = matched.apply(
      "Convert to JSON", ParDo.of(BeamFunctions.asJson(TypedOccurrence.class)));
  // Write the file to ES
  ElasticsearchIO.ConnectionConfiguration conn = ElasticsearchIO.ConnectionConfiguration
      .create(ES_HOSTS, ES_INDEX, ES_TYPE);
  // Index in ES
  json.apply(ElasticsearchIO.write().withConnectionConfiguration(conn).withMaxBatchSize(BATCH_SIZE));
  // instruct the writer to use a provided document ID
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 15: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  // Read the DwC-A using our custom reader
  PCollection<ExtendedRecord> rawRecords = p.apply(
      "Read from Darwin Core Archive", DwCAIO.Read.withPaths("/tmp/dwca-s-bryophytes-v4.1.zip", "demo/target/tmp"));
  // Convert the ExtendedRecord into an UntypedOccurrence record
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert the objects into untyped DwC style records",
      ParDo.of(BeamFunctions.beamify(FunctionFactory.untypedOccurrenceBuilder())))
      .setCoder(AvroCoder.of(UntypedOccurrence.class));
  // Write the file to SOLR
  final SolrIO.ConnectionConfiguration conn = SolrIO.ConnectionConfiguration
      .create(SOLR_HOSTS);
  PCollection<SolrInputDocument> inputDocs = verbatimRecords.apply(
      "Convert to SOLR", ParDo.of(new SolrDocBuilder()));
  inputDocs.apply(SolrIO.write().to("beam-demo1").withConnectionConfiguration(conn));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}