This page collects typical usage examples of the Java class org.apache.beam.sdk.transforms.ParDo. If you are unsure what ParDo does or how to use it, the selected examples below should help.
ParDo belongs to the org.apache.beam.sdk.transforms package. The 15 code examples that follow show the class in use, ordered by popularity.
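All of the examples share one idea: ParDo.of(...) wraps a DoFn<InputT, OutputT> whose @ProcessElement method runs once per input element. As a minimal, self-contained sketch before the real examples (the UpperCaseFn name and the lines collection are illustrative, not taken from the examples below):

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

// Upper-cases every element of a PCollection<String>.
static class UpperCaseFn extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        c.output(c.element().toUpperCase());
    }
}

// Usage: lines is any existing PCollection<String>.
PCollection<String> upper = lines.apply("UpperCase", ParDo.of(new UpperCaseFn()));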
Example 1: filterAlreadyProcessedUrls
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Filters out input content whose URL has already been processed.
 *
 * @param readContent the content read from the input sources
 * @param pipeline the pipeline used to read the list of already processed URLs
 * @param options the indexer pipeline options, used to build the BigQuery query
 * @return the content that still needs to be processed
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
        PCollection<InputContent> readContent, Pipeline pipeline,
        IndexerPipelineOptions options) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
    PCollection<KV<String, Long>> alreadyProcessedUrls = pipeline
        .apply("Get processed URLs", BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetUrlFn()));
    final PCollectionView<Map<String, Long>> alreadyProcessedUrlsSideInput =
        alreadyProcessedUrls.apply(View.<String, Long>asMap());
    return readContent
        .apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
            .withSideInputs(alreadyProcessedUrlsSideInput));
}
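Neither GetUrlFn nor FilterProcessedUrls is included in this excerpt. To illustrate the side-input pattern used above, here is a sketch of what FilterProcessedUrls might look like, assuming InputContent exposes a (hypothetical) getUrl() accessor: the DoFn receives the PCollectionView in its constructor and reads the materialized Map via ProcessContext.sideInput().

// Hypothetical sketch; the real FilterProcessedUrls is not shown in this excerpt.
static class FilterProcessedUrls extends DoFn<InputContent, InputContent> {
    private final PCollectionView<Map<String, Long>> processedUrlsView;

    FilterProcessedUrls(PCollectionView<Map<String, Long>> processedUrlsView) {
        this.processedUrlsView = processedUrlsView;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        Map<String, Long> processed = c.sideInput(processedUrlsView);
        // getUrl() is an assumed accessor on InputContent.
        if (!processed.containsKey(c.element().getUrl())) {
            c.output(c.element());
        }
    }
}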
Example 2: testUnwritableRemoveContainerPipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testUnwritableRemoveContainerPipeline() throws Exception {
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());
    final File root = new File(getClass().getResource("/dataDirectory2").toURI());
    assumeTrue(root.setReadOnly());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(CONTAINER_KV))
        .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(pCollection).empty();
    pipeline.run();
    root.setWritable(true);
}
Example 3: testUnwritableAddContainerPipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testUnwritableAddContainerPipeline() throws Exception {
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());
    final File root = new File(getClass().getResource("/dataDirectory2").toURI());
    assumeTrue(root.setReadOnly());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(CONTAINER_KV))
        .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), true)));

    PAssert.that(pCollection).empty();
    pipeline.run();
    root.setWritable(true);
}
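Examples 2 and 3 (like the tests in Examples 10 through 14) reference a pipeline field that is not shown. A typical fixture for NeedsRunner tests is a JUnit TestPipeline rule; a plausible declaration for the surrounding test class (an assumption, since the fixture is not part of this excerpt):

// Assumed test fixture: a shared TestPipeline rule used by the test methods.
// Requires org.junit.Rule and org.apache.beam.sdk.testing.TestPipeline.
@Rule
public final transient TestPipeline pipeline = TestPipeline.create();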
Example 4: testDefaultCoder
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testDefaultCoder() throws Exception {
    p.enableAbandonedNodeEnforcement(true);

    // Use MyRecord as input and output types without explicitly specifying
    // a coder (this uses the default coders, which may not be
    // SerializableCoder).
    PCollection<String> output =
        p.apply(Create.of("Hello", "World"))
            .apply(ParDo.of(new StringToRecord()))
            .apply(ParDo.of(new RecordToString()));

    PAssert.that(output).containsInAnyOrder("Hello", "World");
    p.run();
}
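StringToRecord, RecordToString, and MyRecord are not part of this excerpt. One plausible shape for them, assuming MyRecord is a simple Serializable value type so the pipeline's CoderRegistry can infer a coder for it without any explicit setCoder call:

import java.io.Serializable;

// Assumed helper types; the coder for MyRecord is inferred from the
// CoderRegistry rather than specified explicitly.
static class MyRecord implements Serializable {
    final String value;
    MyRecord(String value) { this.value = value; }
}

static class StringToRecord extends DoFn<String, MyRecord> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        c.output(new MyRecord(c.element()));
    }
}

static class RecordToString extends DoFn<MyRecord, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        c.output(c.element().value);
    }
}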
Example 5: enrichWithCNLP
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Enriches a sampled fraction of the index summaries with CNLP entities.
 *
 * @param indexes the content index summaries to enrich
 * @param ratio the fraction of elements to route to the enrichment branch
 * @return the merged collection of enriched and untouched index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
        PCollection<ContentIndexSummary> indexes, Float ratio) {
    PCollectionTuple splitAB = indexes
        .apply(ParDo.of(new SplitAB(ratio))
            .withOutputTags(PipelineTags.BranchA,
                TupleTagList.of(PipelineTags.BranchB)));

    PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
    PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

    PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
        ParDo.of(new EnrichWithCNLPEntities()));

    // Merge the enriched branch back with the untouched branch.
    PCollectionList<ContentIndexSummary> contentIndexSummariesList =
        PCollectionList.of(branchACol).and(enrichedBCol);
    return contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
}
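SplitAB is the interesting transform here: a single ParDo that declares two output tags. Its implementation is not shown; below is a sketch of a ratio-based splitter. Only the tag wiring is implied by the example above; the random routing logic is an assumption.

import java.util.concurrent.ThreadLocalRandom;

// Hypothetical sketch of a two-way splitter. Untagged output goes to the main
// tag (BranchA, the first argument of withOutputTags above); tagged output
// goes to the additional tag (BranchB).
static class SplitAB extends DoFn<ContentIndexSummary, ContentIndexSummary> {
    private final Float ratio;

    SplitAB(Float ratio) {
        this.ratio = ratio;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        if (ThreadLocalRandom.current().nextFloat() < ratio) {
            c.output(PipelineTags.BranchB, c.element()); // route to enrichment branch
        } else {
            c.output(c.element()); // main output, i.e. BranchA
        }
    }
}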
Example 6: enrichWithCNLP
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Enriches a sampled fraction of the filtered index summaries with CNLP entities.
 *
 * @param filteredIndexes the filtered content index summaries to enrich
 * @param ratio the fraction of elements to route to the enrichment branch
 * @return the merged collection of enriched and untouched index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
        PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
    PCollectionTuple splitAB = filteredIndexes
        .apply(ParDo.of(new SplitAB(ratio))
            .withOutputTags(PipelineTags.BranchA,
                TupleTagList.of(PipelineTags.BranchB)));

    PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
    PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

    PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
        ParDo.of(new EnrichWithCNLPEntities()));

    // Merge the enriched branch back with the untouched branch.
    PCollectionList<ContentIndexSummary> contentIndexSummariesList =
        PCollectionList.of(branchACol).and(enrichedBCol);
    return contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
}
Example 7: filterSoftDuplicates
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Splits the document indexes into unique documents and soft duplicates.
 *
 * @param indexes the document index summaries
 * @return a POJO containing two PCollections: unique documents, and duplicates
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
        PCollection<ContentIndexSummary> indexes) {
    PCollectionTuple dedupeOrNot = indexes
        .apply("Extract Text grouping key",
            ParDo.of(new GetContentIndexSummaryKeyFn()))
        .apply("Group by Text grouping key",
            GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
        .apply("Eliminate Text dupes",
            ParDo.of(new EliminateTextDupes())
                .withOutputTags(PipelineTags.indexedContentNotToDedupeTag,
                    TupleTagList.of(PipelineTags.indexedContentToDedupeTag)));

    PCollection<TableRow> dedupedWebresources =
        dedupeOrNot.get(PipelineTags.indexedContentToDedupeTag)
            .apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));

    return new ContentDuplicateOrNot(
        dedupeOrNot.get(PipelineTags.indexedContentNotToDedupeTag),
        dedupedWebresources);
}
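The shape of GetContentIndexSummaryKeyFn is implied by the GroupByKey that follows it: it must emit KV<ContentSoftDeduplicationKey, ContentIndexSummary> pairs. A sketch of that shape follows; the ContentSoftDeduplicationKey.of factory and the key derivation are assumptions, not taken from the source.

// Hypothetical sketch; only the input/output types are implied by the example.
static class GetContentIndexSummaryKeyFn
        extends DoFn<ContentIndexSummary, KV<ContentSoftDeduplicationKey, ContentIndexSummary>> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        ContentIndexSummary summary = c.element();
        // ContentSoftDeduplicationKey.of(...) is an assumed factory deriving a
        // soft-deduplication key (e.g. normalized text) from the summary.
        c.output(KV.of(ContentSoftDeduplicationKey.of(summary), summary));
    }
}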
Example 8: main
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Runs the DatastoreToGcs Dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
    Options options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(Options.class);
    options.setRunner(DataflowRunner.class);

    Pipeline pipeline = Pipeline.create(options);

    pipeline
        .apply("IngestEntities",
            DatastoreIO.v1().read()
                .withProjectId(options.getDatastoreProjectId())
                .withLiteralGqlQuery(options.getGqlQuery())
                .withNamespace(options.getNamespace()))
        .apply("EntityToJson", ParDo.of(EntityToJson.newBuilder()
            .setJsTransformPath(options.getJsTransformPath())
            .setJsTransformFunctionName(options.getJsTransformFunctionName())
            .build()))
        .apply("JsonToGcs", TextIO.write().to(options.getSavePath())
            .withSuffix(".json"));

    pipeline.run();
}
Example 9: main
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

/**
 * Runs the GcsToDatastore Dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
    Options options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(Options.class);
    options.setRunner(DataflowRunner.class);

    Pipeline pipeline = Pipeline.create(options);

    pipeline
        .apply("IngestJson", TextIO.read()
            .from(options.getJsonPathPrefix()))
        .apply("GcsToEntity", ParDo.of(JsonToEntity.newBuilder()
            .setJsTransformPath(options.getJsTransformPath())
            .setJsTransformFunctionName(options.getJsTransformFunctionName())
            .build()))
        .apply(DatastoreIO.v1().write()
            .withProjectId(options.getDatastoreProjectId()));

    pipeline.run();
}
Example 10: testCachePipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testCachePipeline() throws Exception {
    final KV<String, String> kv = KV.of("trellis:repository/resource", null);
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/root").toURI().toString());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(kv))
        .apply(ParDo.of(new CacheWriter(dataConfiguration)));

    PAssert.that(pCollection).containsInAnyOrder(asList(kv));
    pipeline.run();
}
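Taken together with Examples 11 and 12, the assertions pin down CacheWriter's contract: it re-emits the input KV when the cache write succeeds, and emits nothing for an unknown resource or a missing "repository" configuration entry. A sketch along those lines follows; the file layout and error handling are assumptions, only the emit-on-success behavior is implied by the tests.

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.net.URI;

// Hypothetical sketch; the real CacheWriter is not part of this excerpt.
static class CacheWriter extends DoFn<KV<String, String>, KV<String, String>> {
    private final Map<String, String> dataConfiguration;

    CacheWriter(Map<String, String> dataConfiguration) {
        this.dataConfiguration = dataConfiguration;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        final String base = dataConfiguration.get("repository");
        if (base == null) {
            return; // no "repository" configured (Example 12): emit nothing
        }
        try {
            final File dir = new File(URI.create(base));
            final String name = c.element().getKey().replace("trellis:repository/", "");
            final File resource = new File(dir, name);
            if (!resource.isDirectory()) {
                return; // unknown resource (Example 11): emit nothing
            }
            // Assumed layout: the rendered cache lives alongside the resource data.
            try (Writer writer = new FileWriter(new File(resource, "cache"))) {
                writer.write(c.element().getValue() == null ? "" : c.element().getValue());
            }
            c.output(c.element()); // cache written successfully (Example 10)
        } catch (IOException | IllegalArgumentException ex) {
            // I/O failure: emit nothing
        }
    }
}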
Example 11: testUnableToCachePipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testUnableToCachePipeline() throws Exception {
    final KV<String, String> kv = KV.of("trellis:repository/some-other-resource", null);
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/root").toURI().toString());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(kv))
        .apply(ParDo.of(new CacheWriter(dataConfiguration)));

    PAssert.that(pCollection).empty();
    pipeline.run();
}
Example 12: testInvalidDirectoryPipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testInvalidDirectoryPipeline() throws Exception {
    final KV<String, String> kv = KV.of("trellis:repository/resource", null);
    // "foo" is not a recognized configuration key, so nothing can be cached.
    final Map<String, String> dataConfiguration = singletonMap("foo",
            getClass().getResource("/root").toURI().toString());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(kv))
        .apply(ParDo.of(new CacheWriter(dataConfiguration)));

    PAssert.that(pCollection).empty();
    pipeline.run();
}
Example 13: testInvalidDataPipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated RDF data, so processing fails downstream.
    final String dataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> kv = KV.of("trellis:repository/resource", dataset);
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory").toURI().toString());

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(kv))
        .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(pCollection).empty();
    pipeline.run();
}
Example 14: testInvalidDataPipeline
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated RDF data, so processing fails downstream.
    final String dataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> kv = KV.of("trellis:repository/resource", dataset);
    final Map<String, String> dataConfiguration = singletonMap("repository", "http://localhost/");

    final PCollection<KV<String, String>> pCollection = pipeline
        .apply("Create", Create.of(kv))
        .apply(ParDo.of(new EventProcessor(dataConfiguration)));

    PAssert.that(pCollection).empty();
    pipeline.run();
}
Example 15: main
import org.apache.beam.sdk.transforms.ParDo; // import the required package/class

public static void main(String[] args) throws Exception {
    Options options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        .apply(KafkaIO.<String, String>read()
            .withBootstrapServers(options.getKafkaBootstrapServer())
            .withTopic(options.getTopic())
            .withKeyDeserializer(StringDeserializer.class)
            .withValueDeserializer(StringDeserializer.class)
            .withTimestampFn(new SetTimestampFn()))
        .apply("Values", ParDo.of(new ValuesFn()))
        .apply("FixedWindows", Window.<String>into(FixedWindows.of(FIVE_MINUTES))
            .triggering(AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane()
                    .plusDelayOf(TWO_MINUTES))
                .withLateFirings(AfterPane.elementCountAtLeast(1)))
            .withAllowedLateness(TEN_MINUTES)
            .accumulatingFiredPanes())
        .apply("TeamScore", new CalculateTeamScores(options.getOutputPrefix()));

    pipeline.run();
}
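ValuesFn and SetTimestampFn are not shown in this excerpt. Plausible shapes for both follow; the record format (a trailing epoch-millis field in a comma-separated value) is an assumption.

import org.apache.beam.sdk.io.kafka.KafkaRecord;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.joda.time.Instant;

// KafkaIO.read() without .withoutMetadata() yields KafkaRecord elements;
// this DoFn extracts just the record value.
static class ValuesFn extends DoFn<KafkaRecord<String, String>, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        c.output(c.element().getKV().getValue());
    }
}

// withTimestampFn expects a SerializableFunction<KV<K, V>, Instant>; here the
// event time is assumed to be carried as the last field of the record value.
static class SetTimestampFn implements SerializableFunction<KV<String, String>, Instant> {
    @Override
    public Instant apply(KV<String, String> record) {
        final String[] fields = record.getValue().split(",");
        return new Instant(Long.parseLong(fields[fields.length - 1].trim()));
    }
}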