This article collects typical usage examples of the Java method org.apache.beam.sdk.Pipeline.create. If you have been wondering how Pipeline.create is used in practice, or are looking for concrete examples of calling it, the curated code samples below may help. You can also explore further usage examples of the enclosing class, org.apache.beam.sdk.Pipeline.
Fifteen code examples of the Pipeline.create method are shown below, sorted by popularity by default.
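Before the examples, here is a minimal, self-contained sketch of the pattern they all share: parse command-line arguments into PipelineOptions, call Pipeline.create(options) to build an empty pipeline bound to those options, apply transforms, and run. The class name and the transforms below are illustrative placeholders, not code taken from any of the examples that follow.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

public class PipelineCreateSketch { // hypothetical class name for illustration
  public static void main(String[] args) {
    // Parse and validate command-line arguments into PipelineOptions.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    // Pipeline.create(options) returns an empty pipeline configured with those options.
    Pipeline pipeline = Pipeline.create(options);
    // Apply some transforms; these are placeholders standing in for real sources and sinks.
    pipeline
        .apply("CreateInput", Create.of(Arrays.asList("hello", "beam")))
        .apply("ToUpperCase", MapElements.into(TypeDescriptors.strings())
            .via((String s) -> s.toUpperCase()));
    // Run the pipeline and block until it finishes.
    pipeline.run().waitUntilFinish();
  }
}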
Example 1: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/** Run a batch pipeline to calculate hourly team scores. */
public static void main(String[] args) throws Exception {
  Options options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("ReadLogs", TextIO.read().from(options.getInput()))
      .apply("SetTimestamps", WithTimestamps.of(new SetTimestampFn()))
      .apply("FixedWindows", Window.<String>into(FixedWindows.of(ONE_HOUR)))
      .apply("TeamScores", new CalculateTeamScores(options.getOutputPrefix()));
  pipeline.run();
}
Example 2: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * Runs the DatastoreToGcs dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("IngestEntities",
          DatastoreIO.v1().read()
              .withProjectId(options.getDatastoreProjectId())
              .withLiteralGqlQuery(options.getGqlQuery())
              .withNamespace(options.getNamespace()))
      .apply("EntityToJson", ParDo.of(EntityToJson.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .build()))
      .apply("JsonToGcs", TextIO.write().to(options.getSavePath())
          .withSuffix(".json"));
  pipeline.run();
}
Example 3: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * Runs the GcsToDatastore dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("IngestJson", TextIO.read()
          .from(options.getJsonPathPrefix()))
      .apply("GcsToEntity", ParDo.of(JsonToEntity.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .build()))
      .apply(DatastoreIO.v1().write()
          .withProjectId(options.getDatastoreProjectId()));
  pipeline.run();
}
Example 4: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
public static void main(String[] args) throws Exception {
  Options options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply(KafkaIO.<String, String>read()
          .withBootstrapServers(options.getKafkaBootstrapServer())
          .withTopic(options.getTopic())
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(StringDeserializer.class)
          .withTimestampFn(new SetTimestampFn()))
      .apply("Values", ParDo.of(new ValuesFn()))
      .apply("FixedWindows", Window.<String>into(FixedWindows.of(FIVE_MINUTES))
          .triggering(AfterWatermark.pastEndOfWindow()
              .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane()
                  .plusDelayOf(TWO_MINUTES))
              .withLateFirings(AfterPane.elementCountAtLeast(1)))
          .withAllowedLateness(TEN_MINUTES)
          .accumulatingFiredPanes())
      .apply("TeamScore", new CalculateTeamScores(options.getOutputPrefix()));
  pipeline.run();
}
Example 5: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
public static void main(String[] args) throws Exception {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory
      .fromArgs(args)
      .withValidation()
      .as(TemplateOptions.class);
  options.setAutoscalingAlgorithm(THROUGHPUT_BASED);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(BigQueryIO.read().from(options.getBigQueryTableName()))
      .apply(ParDo.of(new DoFn<TableRow, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          String commaSep = c.element().values()
              .stream()
              .map(cell -> cell.toString().trim())
              .collect(Collectors.joining("\",\""));
          c.output(commaSep);
        }
      }))
      .apply(TextIO.write().to(options.getOutputFile())
          .withoutSharding()
          .withWritableByteChannelFactory(GZIP));
  pipeline.run();
}
Example 6: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(DirectRunner.class); // forced for this demo
  Pipeline p = Pipeline.create(options);
  // register Avro coders for serializing our messages
  Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
  PCollection<UntypedOccurrence> verbatimRecords = p.apply(
      "Read Avro", AvroIO.read(UntypedOccurrence.class).from("demo/output/data*"));
  verbatimRecords.apply("Write file per Genus",
      AvroIO.write(UntypedOccurrence.class)
          .to("demo/output-split/data*") // prefix is required but overwritten below
          .to(new GenusDynamicAvroDestinations(
              FileSystems.matchNewResource("demo/output-split/data*", true))));
  LOG.info("Starting the pipeline");
  PipelineResult result = p.run();
  result.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 7: testWithInvalidContext
import org.apache.beam.sdk.Pipeline; // package/class required by this method
private void testWithInvalidContext(JavaSparkContext jsc) {
  SparkContextOptions options = getSparkContextOptions(jsc);
  Pipeline p = Pipeline.create(options);
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  inputWords.apply(new WordCount.CountWords())
      .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  try {
    p.run().waitUntilFinish();
    fail("Should throw an exception when the provided Spark context is null or stopped");
  } catch (RuntimeException e) {
    assert(e.getMessage().contains(PROVIDED_CONTEXT_EXCEPTION));
  }
}
Example 8: testWaitUntilFinishTimeout
import org.apache.beam.sdk.Pipeline; // package/class required by this method
@Test
public void testWaitUntilFinishTimeout() throws Exception {
  DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class);
  options.setBlockOnRun(false);
  options.setRunner(DirectRunner.class);
  Pipeline p = Pipeline.create(options);
  p
      .apply(Create.of(1L))
      .apply(ParDo.of(
          new DoFn<Long, Long>() {
            @ProcessElement
            public void hang(ProcessContext context) throws InterruptedException {
              // Hangs "forever"
              Thread.sleep(Long.MAX_VALUE);
            }
          }));
  PipelineResult result = p.run();
  // The pipeline should never complete.
  assertThat(result.getState(), is(State.RUNNING));
  // Must time out, otherwise this test will never complete.
  result.waitUntilFinish(Duration.millis(1L));
  assertThat(result.getState(), is(State.RUNNING));
}
Example 9: testE2EV1Write
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * An end-to-end test for {@link DatastoreV1.Write}.
 *
 * <p>Write some test entities to Cloud Datastore.
 * Read and count all the entities. Verify that the count matches the
 * number of entities written.
 */
@Test
public void testE2EV1Write() throws Exception {
  Pipeline p = Pipeline.create(options);
  // Write to Datastore
  p.apply(GenerateSequence.from(0).to(numEntities))
      .apply(ParDo.of(new CreateEntityFn(options.getKind(), options.getNamespace(), ancestor, 0)))
      .apply(DatastoreIO.v1().write().withProjectId(project));
  p.run();
  // Count number of entities written to Datastore.
  long numEntitiesWritten = countEntities(options, project, ancestor);
  assertEquals(numEntitiesWritten, numEntities);
}
Example 10: execute
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * Executes the given SQL.
 */
public void execute(String sqlString) throws Exception {
  BeamSqlParser parser = new BeamSqlParser(sqlString);
  SqlNode sqlNode = parser.impl().parseSqlStmtEof();
  if (sqlNode instanceof SqlCreateTable) {
    handleCreateTable((SqlCreateTable) sqlNode, metaStore);
  } else {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(new String[] {}).withValidation()
        .as(PipelineOptions.class);
    options.setJobName("BeamPlanCreator");
    Pipeline pipeline = Pipeline.create(options);
    compilePipeline(sqlString, pipeline, env);
    pipeline.run();
  }
}
Example 11: testTemplateRunnerLoggedErrorForFile
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * Tests that the {@link DataflowRunner} with {@code --templateLocation} throws the appropriate
 * exception when an output file is not writable.
 */
@Test
public void testTemplateRunnerLoggedErrorForFile() throws Exception {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setJobName("TestJobName");
  options.setRunner(DataflowRunner.class);
  options.setTemplateLocation("//bad/path");
  options.setProject("test-project");
  options.setTempLocation(tmpFolder.getRoot().getPath());
  options.setGcpCredential(new TestCredential());
  options.setPathValidatorClass(NoopPathValidator.class);
  Pipeline p = Pipeline.create(options);
  thrown.expectMessage("Cannot create output file at");
  thrown.expect(RuntimeException.class);
  p.run();
}
Example 12: createIndexerPipeline
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options
 * @return
 * @throws Exception
 */
public static Pipeline createIndexerPipeline(FileIndexerPipelineOptions options) throws Exception {
  IndexerPipelineUtils.validateIndexerPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);
  // PHASE: Read raw content from sources
  PCollection<InputContent> readContent = pipeline
      .apply("Read entire CSV file", org.apache.beam.sdk.io.Read.from(new RecordFileSource<String>(
          ValueProvider.StaticValueProvider.of(options.getInputFile()),
          StringUtf8Coder.of(), RecordFileSource.DEFAULT_RECORD_SEPARATOR)))
      .apply("Parse CSV file into InputContent objects", ParDo.of(new ParseCSVFile()));
  // Define the accumulators of all filters
  PCollection<InputContent> contentToIndex = readContent;
  // PHASE: Index documents (extract opinions and entities/tags).
  // Return successfully indexed docs, and create a Bigtable write transform to store errors
  // in Dead Letter table.
  PCollection<ContentIndexSummary> indexes = indexDocuments(options, contentToIndex);
  if (options.getRatioEnrichWithCNLP() > 0)
    indexes = enrichWithCNLP(indexes, options.getRatioEnrichWithCNLP());
  // PHASE: Write to BigQuery
  // For the indexes that are unique ("filteredIndexes"), create records in webresource, document, and sentiment.
  // Then, merge resulting webresources with webresourceRowsUnindexed and webresourceDeduped
  indexes
      .apply(ParDo.of(new CreateCSVLineFromIndexSummaryFn()))
      .apply(TextIO.write()
          .to(options.getOutputFile()));
  return pipeline;
}
Example 13: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
/**
 * Runs the DatastoreToBigQuery dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  NestedValueProvider<String, String> bqJsonSchema = NestedValueProvider
      .of(options.getBqJsonSchema(), new ValueProviderHelpers.GcsLoad());
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("IngestEntities",
          DatastoreIO.v1().read()
              .withProjectId(options.getDatastoreProjectId())
              .withLiteralGqlQuery(options.getGqlQuery())
              .withNamespace(options.getNamespace()))
      .apply("EntityToTableRow", ParDo.of(EntityToTableRow.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .setStrictCast(options.getStrictCast())
          .setTableSchemaJson(bqJsonSchema)
          .build()))
      .apply("TableRowToBigQuery", BigQueryIO.writeTableRows()
          .to(options.getBqTableSpec())
          .withJsonSchema(bqJsonSchema)
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  pipeline.run();
}
Example 14: main
import org.apache.beam.sdk.Pipeline; // package/class required by this method
public static void main(String[] args) {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(TemplateOptions.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("READ", TextIO.read().from(options.getInputFile()).withCompressionType(GZIP))
      .apply("TRANSFORM", ParDo.of(new WikiParDo()))
      .apply("WRITE", BigQueryIO.writeTableRows()
          .to(String.format("%s:devfest_melbourne_2017.wiki_demo", options.getProject()))
          .withCreateDisposition(CREATE_IF_NEEDED)
          .withWriteDisposition(WRITE_APPEND)
          .withSchema(getTableSchema()));
  pipeline.run();
}
Example 15: run
import org.apache.beam.sdk.Pipeline; // package/class required by this method
protected static void run(Options options) {
  String projectId = options.getProject();
  String inputSubscription = options.getInputPubsubSubscription();
  String datasetId = options.getOutputBigQueryDataset();
  String tablePrefix = options.getOutputBigQueryTable();
  // Input
  String subscriptionName = "projects/" + projectId + "/subscriptions/" + inputSubscription;
  PubsubIO.Read<String> pubsubReader = PubsubIO.<String>read().subscription(subscriptionName)
      .withCoder(StringUtf8Coder.of());
  // Output
  TableSchema schema = PubsubMessage2TableRowFn.getOutputTableSchema();
  TableNameByWindowFn tableReferenceFunction =
      new TableNameByWindowFn(projectId, datasetId, tablePrefix);
  BigQueryIO.Write.Bound bqWriter = BigQueryIO.Write
      .withSchema(schema)
      .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
      .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
      .to(tableReferenceFunction);
  // Build and run pipeline
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply(options.getInputPubsubSubscription(), pubsubReader)
      .apply(new LogTransformer())
      .apply(options.getOutputBigQueryTable(), bqWriter);
  pipeline.run();
}