This article collects typical usage examples of the Java method org.apache.beam.sdk.Pipeline.apply. If you have been wondering what Pipeline.apply does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the enclosing class, org.apache.beam.sdk.Pipeline.
The following shows 15 code examples of the Pipeline.apply method, sorted by popularity by default.
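Before the project examples, here is a minimal, self-contained sketch (not drawn from any of the projects below; the class name and transform names are illustrative only) of how Pipeline.apply is typically used: the first apply attaches a root transform such as Create or a source read to the Pipeline and returns a PCollection, and every later step is applied to that PCollection rather than to the Pipeline itself.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class PipelineApplySketch {

  public static void main(String[] args) {
    // Runner and other options come from the command line; with no runner set,
    // Beam falls back to the direct runner (if it is on the classpath).
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline p = Pipeline.create(options);

    // Pipeline.apply attaches a root transform to the pipeline and returns its output PCollection.
    PCollection<String> words = p.apply("Create words", Create.of("foo", "bar", "foo"));

    // Later steps are applied to PCollections rather than to the Pipeline itself.
    PCollection<Long> total = words.apply("Count all elements", Count.<String>globally());

    // In a real pipeline 'total' would feed further transforms or a sink.
    p.run().waitUntilFinish();
  }
}
The String argument is the transform's name in the pipeline graph; most of the examples below use this overload so that each step shows up with a readable name.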
Example 1: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  PCollection<UntypedOccurrence> verbatimRecords = p.apply(
      "Read Avro", AvroIO.read(UntypedOccurrence.class).from("demo/output/data*"));
  verbatimRecords.apply("Write file per Genus",
      AvroIO.write(UntypedOccurrence.class)
          .to("demo/output-split/data*") // a prefix is required here, but is overwritten by the dynamic destinations below
          .to(new GenusDynamicAvroDestinations(
              FileSystems.matchNewResource("demo/output-split/data*", true))));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 2: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  conf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
  conf.setStrings("mapreduce.input.fileinputformat.inputdir", "hdfs://ha-nn/tmp/dwca-lep5.zip");
  conf.setClass("key.class", Text.class, Object.class);
  conf.setClass("value.class", ExtendedRecord.class, Object.class);
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);
  PCollection<KV<Text, ExtendedRecord>> rawRecords =
      p.apply("Read DwC-A", HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(conf));
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert to Avro", ParDo.of(fromExtendedRecordKVP()));
  verbatimRecords.apply(
      "Write Avro files", AvroIO.write(UntypedOccurrence.class).to("hdfs://ha-nn/tmp/dwca-lep5.avro"));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 3: testCheckingForSuccessWhenPAssertFails
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Tests that when we just see a tentative failure for a {@link PAssert} it is considered a
 * conclusive failure.
 */
@Test
public void testCheckingForSuccessWhenPAssertFails() throws Exception {
  DataflowPipelineJob job =
      spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(
          buildJobMetrics(generateMockMetrics(false /* success */, true /* tentative */)));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  doReturn(State.DONE).when(job).getState();
  assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.of(false)));
}
Example 4: testCheckingForSuccessSkipsNonTentativeMetrics
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testCheckingForSuccessSkipsNonTentativeMetrics() throws Exception {
  DataflowPipelineJob job = spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(
          buildJobMetrics(generateMockMetrics(true /* success */, false /* tentative */)));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  runner.updatePAssertCount(p);
  doReturn(State.RUNNING).when(job).getState();
  assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.<Boolean>absent()));
}
Example 5: cacheCandidatesUpdaterTest
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void cacheCandidatesUpdaterTest() throws Exception {
  SparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setRunner(TestSparkRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));
  // first read
  pCollection.apply(Count.<String>globally());
  // second read
  // as we access the same PCollection two times, the Spark runner does optimization and so
  // will cache the RDD representing this PCollection
  pCollection.apply(Count.<String>globally());
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
}
Example 6: testEquals
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testEquals() {
  Pipeline p = TestPipeline.create();
  PCollection<String> first = p.apply("Meta", Create.of("foo", "bar"));
  PCollection<String> second = p.apply("Pythonic", Create.of("spam, ham"));
  PCollection<String> third = p.apply("Syntactic", Create.of("eggs", "baz"));
  EqualsTester tester = new EqualsTester();
  // tester.addEqualityGroup(PCollectionList.empty(p), PCollectionList.empty(p));
  // tester.addEqualityGroup(PCollectionList.of(first).and(second));
  // Constructors should all produce equivalent lists
  tester.addEqualityGroup(
      PCollectionList.of(first).and(second).and(third),
      PCollectionList.of(first).and(second).and(third),
      // PCollectionList.<String>empty(p).and(first).and(second).and(third),
      // PCollectionList.of(ImmutableList.of(first, second, third)),
      // PCollectionList.of(first).and(ImmutableList.of(second, third)),
      PCollectionList.of(ImmutableList.of(first, second)).and(third));
  // Order is considered
  tester.addEqualityGroup(PCollectionList.of(first).and(third).and(second));
  tester.addEqualityGroup(PCollectionList.empty(TestPipeline.create()));
  tester.testEquals();
}
Example 7: testWithInvalidContext
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
private void testWithInvalidContext(JavaSparkContext jsc) {
  SparkContextOptions options = getSparkContextOptions(jsc);
  Pipeline p = Pipeline.create(options);
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  inputWords.apply(new WordCount.CountWords())
      .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  try {
    p.run().waitUntilFinish();
    fail("Should throw an exception when The provided Spark context is null or stopped");
  } catch (RuntimeException e) {
    assert(e.getMessage().contains(PROVIDED_CONTEXT_EXCEPTION));
  }
}
Example 8: testParDoChaining
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testParDoChaining() throws Exception {
  Pipeline p = Pipeline.create();
  long numElements = 1000;
  PCollection<Long> input = p.apply(GenerateSequence.from(0).to(numElements));
  PAssert.thatSingleton(input.apply("Count", Count.<Long>globally())).isEqualTo(numElements);
  ApexPipelineOptions options = PipelineOptionsFactory.as(ApexPipelineOptions.class);
  DAG dag = TestApexRunner.translate(p, options);
  String[] expectedThreadLocal = { "/CreateActual/FilterActuals/Window.Assign" };
  Set<String> actualThreadLocal = Sets.newHashSet();
  for (DAG.StreamMeta sm : dag.getAllStreamsMeta()) {
    DAG.OutputPortMeta opm = sm.getSource();
    if (sm.getLocality() == Locality.THREAD_LOCAL) {
      String name = opm.getOperatorMeta().getName();
      String prefix = "PAssert$";
      if (name.startsWith(prefix)) {
        // remove the non-deterministic prefix
        name = name.substring(prefix.length() + 1);
      }
      actualThreadLocal.add(name);
    }
  }
  Assert.assertThat(actualThreadLocal, Matchers.hasItems(expectedThreadLocal));
}
Example 9: testStreamingOnSuccessMatcherWhenPipelineFails
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Tests that when a streaming pipeline terminates in the FAILED state, the {@link
 * TestPipelineOptions#setOnSuccessMatcher(SerializableMatcher) on success matcher} is not
 * invoked.
 */
@Test
public void testStreamingOnSuccessMatcherWhenPipelineFails() throws Exception {
  options.setStreaming(true);
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);
  final DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.FAILED);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");
  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  options.as(TestPipelineOptions.class).setOnSuccessMatcher(new TestFailureMatcher());
  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenReturn(State.FAILED);
  expectedException.expect(RuntimeException.class);
  runner.run(p, mockRunner);
  // If the onSuccessMatcher were invoked, it would have crashed here with AssertionError
}
Example 10: testAssertionFailure
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
@Test
public void testAssertionFailure() throws Exception {
  ApexPipelineOptions options = PipelineOptionsFactory.create()
      .as(ApexPipelineOptions.class);
  options.setRunner(TestApexRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<Integer> pcollection = pipeline
      .apply(Create.of(1, 2, 3, 4));
  PAssert.that(pcollection).containsInAnyOrder(2, 1, 4, 3, 7);
  Throwable exc = runExpectingAssertionFailure(pipeline);
  Pattern expectedPattern = Pattern.compile(
      "Expected: iterable over \\[((<4>|<7>|<3>|<2>|<1>)(, )?){5}\\] in any order");
  // A loose pattern, but should get the job done.
  assertTrue(
      "Expected error message from PAssert with substring matching "
          + expectedPattern
          + " but the message was \""
          + exc.getMessage()
          + "\"",
      expectedPattern.matcher(exc.getMessage()).find());
}
Example 11: filterAlreadyProcessedDocuments
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
/**
 * Removes exact duplicates and already processed documents (looked up in BigQuery and passed in
 * as a side input) from the content to be indexed, folding the removed items into the
 * not-to-index collection.
 *
 * @param contentToIndexNotSkipped content that is a candidate for indexing
 * @param contentNotToIndexSkipped content that has already been marked as skipped
 * @param pipeline the pipeline to attach the side input read to
 * @param options used to decide whether the already-processed-documents query should run
 * @return the content split into items to index and items not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
    PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
    Pipeline pipeline, IndexerPipelineOptions options) {
  PCollection<KV<String, Long>> alreadyProcessedDocs = null;
  if (!options.getWriteTruncate()) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
    alreadyProcessedDocs = pipeline
        .apply("Get already processed Documents", BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetDocumentHashFn()));
  } else {
    Map<String, Long> map = new HashMap<String, Long>();
    alreadyProcessedDocs = pipeline
        .apply("Create empty side input of Docs",
            Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())));
  }
  final PCollectionView<Map<String, Long>> alreadyProcessedDocsSideInput =
      alreadyProcessedDocs.apply(View.<String, Long>asMap());
  PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
      .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
      .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
      .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
          .withSideInputs(alreadyProcessedDocsSideInput)
          .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
              TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection
  PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
  PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
  // Merge the sets of items that are dupes or skipped
  PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
  ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
  return content;
}
Example 12: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  // Read the DwC-A using our custom reader
  PCollection<ExtendedRecord> rawRecords = p.apply(
      "Read from Darwin Core Archive", DwCAIO.Read.withPaths("demo/dwca.zip", "demo/target/tmp"));
  // Convert the ExtendedRecord into an UntypedOccurrence record
  DoFn<ExtendedRecord, UntypedOccurrence> fn = BeamFunctions.beamify(FunctionFactory.untypedOccurrenceBuilder());
  // TODO: Explore the generics as to why the coder registry does not find it and we need to set the coder explicitly
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert the objects into untyped DwC style records", ParDo.of(fn))
      .setCoder(AvroCoder.of(UntypedOccurrence.class));
  // Write the result as an Avro file
  verbatimRecords.apply(
      "Save the records as Avro", AvroIO.write(UntypedOccurrence.class).to("demo/output/data"));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 13: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrenceLowerCase.class, TypedOccurrence.class, ExtendedRecord.class);
  // Read Avro files
  PCollection<UntypedOccurrenceLowerCase> verbatimRecords = p.apply(
      "Read Avro files", AvroIO.read(UntypedOccurrenceLowerCase.class).from(SOURCE_PATH));
  // Convert the objects (interpretation)
  PCollection<TypedOccurrence> interpreted = verbatimRecords.apply(
      "Interpret occurrence records", ParDo.of(BeamFunctions.beamify(FunctionFactory.interpretOccurrenceLowerCase())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Do the nub lookup
  PCollection<TypedOccurrence> matched = interpreted.apply(
      "Align to backbone using species/match", ParDo.of(
          BeamFunctions.beamify(FunctionFactory.gbifSpeciesMatch())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Write the file to SOLR
  final SolrIO.ConnectionConfiguration conn = SolrIO.ConnectionConfiguration
      .create(SOLR_HOST);
  PCollection<SolrInputDocument> inputDocs = matched.apply(
      "Convert to SOLR", ParDo.of(new SolrDocBuilder()));
  inputDocs.apply(SolrIO.write().to("beam-demo1").withConnectionConfiguration(conn));
  // instruct the writer to use a provided document ID
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 14: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  Configuration conf = new Configuration(); // assume defaults on the classpath
  Pipeline p = newPipeline(args, conf);
  Coders.registerAvroCoders(p, UntypedOccurrenceLowerCase.class, TypedOccurrence.class, ExtendedRecord.class);
  // Read Avro files
  PCollection<UntypedOccurrenceLowerCase> verbatimRecords = p.apply(
      "Read Avro files", AvroIO.read(UntypedOccurrenceLowerCase.class).from(SOURCE_PATH));
  // Convert the objects (interpretation)
  PCollection<TypedOccurrence> interpreted = verbatimRecords.apply(
      "Interpret occurrence records", ParDo.of(BeamFunctions.beamify(FunctionFactory.interpretOccurrenceLowerCase())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Do the nub lookup
  PCollection<TypedOccurrence> matched = interpreted.apply(
      "Align to backbone using species/match", ParDo.of(
          BeamFunctions.beamify(FunctionFactory.gbifSpeciesMatch())))
      .setCoder(AvroCoder.of(TypedOccurrence.class));
  // Convert to JSON
  PCollection<String> json = matched.apply(
      "Convert to JSON", ParDo.of(BeamFunctions.asJson(TypedOccurrence.class)));
  // Write the file to ES
  ElasticsearchIO.ConnectionConfiguration conn = ElasticsearchIO.ConnectionConfiguration
      .create(ES_HOSTS, ES_INDEX, ES_TYPE);
  // Index in ES
  json.apply(ElasticsearchIO.write().withConnectionConfiguration(conn).withMaxBatchSize(BATCH_SIZE));
  // instruct the writer to use a provided document ID
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 15: main
import org.apache.beam.sdk.Pipeline; // import the package/class the method depends on
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  // Read the DwC-A using our custom reader
  PCollection<ExtendedRecord> rawRecords = p.apply(
      "Read from Darwin Core Archive", DwCAIO.Read.withPaths("/tmp/dwca-s-bryophytes-v4.1.zip", "demo/target/tmp"));
  // Convert the ExtendedRecord into an UntypedOccurrence record
  PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
      "Convert the objects into untyped DwC style records",
      ParDo.of(BeamFunctions.beamify(FunctionFactory.untypedOccurrenceBuilder())))
      .setCoder(AvroCoder.of(UntypedOccurrence.class));
  // Write the file to SOLR
  final SolrIO.ConnectionConfiguration conn = SolrIO.ConnectionConfiguration
      .create(SOLR_HOSTS);
  PCollection<SolrInputDocument> inputDocs = verbatimRecords.apply(
      "Convert to SOLR", ParDo.of(new SolrDocBuilder()));
  inputDocs.apply(SolrIO.write().to("beam-demo1").withConnectionConfiguration(conn));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}