本文整理汇总了Java中org.apache.beam.sdk.values.PCollection类的典型用法代码示例。如果您正苦于以下问题:Java PCollection类的具体用法?Java PCollection怎么用?Java PCollection使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
PCollection类属于org.apache.beam.sdk.values包,在下文中一共展示了PCollection类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: filterAlreadyProcessedUrls
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Applies {@code FilterProcessedUrls} to the given content, handing it a map-valued side
 * input built from the result of a BigQuery query.
 *
 * @param readContent the content to be filtered
 * @param pipeline the pipeline on which the BigQuery read is applied
 * @param options options passed to
 *     {@code IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery} to build the query
 * @return the output of {@code FilterProcessedUrls} applied to {@code readContent}
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
        PCollection<InputContent> readContent, Pipeline pipeline,
        IndexerPipelineOptions options) {
    // Read the query result and map each row to a (String, Long) pair via GetUrlFn.
    // NOTE(review): the key is presumably the URL; confirm against GetUrlFn.
    final String processedUrlsQuery =
        IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
    PCollection<KV<String, Long>> processedUrls = pipeline
        .apply("Get processed URLs", BigQueryIO.read().fromQuery(processedUrlsQuery))
        .apply(ParDo.of(new GetUrlFn()));

    // Materialize the pairs as a Map view so the filter can read them as a side input.
    final PCollectionView<Map<String, Long>> processedUrlsView =
        processedUrls.apply(View.<String, Long>asMap());

    return readContent.apply(
        ParDo.of(new FilterProcessedUrls(processedUrlsView))
            .withSideInputs(processedUrlsView));
}
示例2: testUnwritableRemoveContainerPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Runs a {@code BeamProcessor} (constructed with {@code false} as its last argument) over
 * {@code CONTAINER_KV} while the {@code /dataDirectory2} resource directory is read-only,
 * and expects the processor to emit nothing.
 *
 * <p>The test is skipped when the directory cannot be made read-only.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnwritableRemoveContainerPipeline() throws Exception {
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());
    final File root = new File(getClass().getResource("/dataDirectory2").toURI());

    // Skip (rather than fail) when the file system refuses to make the directory read-only.
    assumeTrue(root.setReadOnly());
    try {
        final PCollection<KV<String, String>> pCollection = pipeline
            .apply("Create", Create.of(CONTAINER_KV))
            .apply(ParDo.of(new BeamProcessor(dataConfiguration,
                    LDP.PreferContainment.getIRIString(), false)));

        PAssert.that(pCollection).empty();

        pipeline.run();
    } finally {
        // Restore write access even when the pipeline or the assertion fails, so the
        // read-only directory does not leak into other tests that use it.
        root.setWritable(true);
    }
}
示例3: testUnwritableAddContainerPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Runs a {@code BeamProcessor} (constructed with {@code true} as its last argument) over
 * {@code CONTAINER_KV} while the {@code /dataDirectory2} resource directory is read-only,
 * and expects the processor to emit nothing.
 *
 * <p>The test is skipped when the directory cannot be made read-only.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnwritableAddContainerPipeline() throws Exception {
    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());
    final File root = new File(getClass().getResource("/dataDirectory2").toURI());

    // Skip (rather than fail) when the file system refuses to make the directory read-only.
    assumeTrue(root.setReadOnly());
    try {
        final PCollection<KV<String, String>> pCollection = pipeline
            .apply("Create", Create.of(CONTAINER_KV))
            .apply(ParDo.of(new BeamProcessor(dataConfiguration,
                    LDP.PreferContainment.getIRIString(), true)));

        PAssert.that(pCollection).empty();

        pipeline.run();
    } finally {
        // Restore write access even when the pipeline or the assertion fails, so the
        // read-only directory does not leak into other tests that use it.
        root.setWritable(true);
    }
}
示例4: buildBeamPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Builds the upstream pipeline from this node's input and applies a
 * {@code BeamSqlProjectFn} to it under this node's stage name.
 *
 * @param inputPCollections the input collections forwarded to the upstream node
 * @param sqlEnv the SQL environment forwarded to the upstream node
 * @return the output of the {@code BeamSqlProjectFn} stage, with its coder set to the
 *     record coder of this node's row type
 * @throws Exception if building the upstream pipeline fails
 */
@Override
public PCollection<BeamRecord> buildBeamPipeline(PCollectionTuple inputPCollections,
        BeamSqlEnv sqlEnv) throws Exception {
    RelNode inputRel = getInput();
    String stageName = BeamSqlRelUtils.getStageName(this);

    // Build everything upstream of this node first.
    PCollection<BeamRecord> upstream = BeamSqlRelUtils.getBeamRelInput(inputRel)
        .buildBeamPipeline(inputPCollections, sqlEnv);

    BeamSqlExpressionExecutor executor = new BeamSqlFnExecutor(this);
    BeamSqlProjectFn projectFn = new BeamSqlProjectFn(
        getRelTypeName(), executor, CalciteUtils.toBeamRowType(rowType));

    PCollection<BeamRecord> projected = upstream.apply(stageName, ParDo.of(projectFn));
    // The coder is set explicitly from this node's row type.
    projected.setCoder(CalciteUtils.toBeamRowType(getRowType()).getRecordCoder());
    return projected;
}
示例5: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Splits the input into two tagged branches with {@code SplitAB}, applies
 * {@code EnrichWithCNLPEntities} to branch B only, and flattens both branches back into a
 * single collection.
 *
 * @param indexes the collection to split and partially enrich
 * @param ratio value handed to {@code SplitAB}; presumably controls how elements are
 *     divided between the two branches (TODO confirm against {@code SplitAB})
 * @return branch A unchanged, flattened together with the enriched branch B
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
        PCollection<ContentIndexSummary> indexes, Float ratio) {
    // Branch A is the main output of the split; branch B is the additional output.
    PCollectionTuple branches = indexes.apply(
        ParDo.of(new SplitAB(ratio))
            .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

    PCollection<ContentIndexSummary> branchA = branches.get(PipelineTags.BranchA);
    PCollection<ContentIndexSummary> branchB = branches.get(PipelineTags.BranchB);

    // Only branch B goes through the enrichment step.
    PCollection<ContentIndexSummary> enrichedBranchB =
        branchB.apply(ParDo.of(new EnrichWithCNLPEntities()));

    // Merge the untouched branch A with the enriched branch B.
    return PCollectionList.of(branchA).and(enrichedBranchB)
        .apply(Flatten.<ContentIndexSummary>pCollections());
}
示例6: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Splits the filtered input into two tagged branches with {@code SplitAB}, applies
 * {@code EnrichWithCNLPEntities} to branch B only, and flattens both branches back into a
 * single collection.
 *
 * @param filteredIndexes the collection to split and partially enrich
 * @param ratio value handed to {@code SplitAB}; presumably controls how elements are
 *     divided between the two branches (TODO confirm against {@code SplitAB})
 * @return branch A unchanged, flattened together with the enriched branch B
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
        PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
    // Branch A is the main output of the split; branch B is the additional output.
    PCollectionTuple branches = filteredIndexes.apply(
        ParDo.of(new SplitAB(ratio))
            .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

    PCollection<ContentIndexSummary> branchA = branches.get(PipelineTags.BranchA);
    PCollection<ContentIndexSummary> branchB = branches.get(PipelineTags.BranchB);

    // Only branch B goes through the enrichment step.
    PCollection<ContentIndexSummary> enrichedBranchB =
        branchB.apply(ParDo.of(new EnrichWithCNLPEntities()));

    // Merge the untouched branch A with the enriched branch B.
    return PCollectionList.of(branchA).and(enrichedBranchB)
        .apply(Flatten.<ContentIndexSummary>pCollections());
}
示例7: filterSoftDuplicates
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Keys the input with {@code GetContentIndexSummaryKeyFn}, groups by that key, and runs
 * {@code EliminateTextDupes} to partition the groups into two tagged outputs.
 *
 * @param indexes the document index summaries to partition
 * @return a POJO holding two collections: the {@code indexedContentNotToDedupeTag} output
 *     as-is, and the {@code indexedContentToDedupeTag} output converted to table rows
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
        PCollection<ContentIndexSummary> indexes) {
    // Main output: content not to dedupe; additional output: content to dedupe.
    PCollectionTuple partitioned = indexes
        .apply("Extract Text grouping key",
            ParDo.of(new GetContentIndexSummaryKeyFn()))
        .apply("Group by Text grouping key",
            GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
        .apply("Eliminate Text dupes",
            ParDo.of(new EliminateTextDupes())
                .withOutputTags(PipelineTags.indexedContentNotToDedupeTag,
                    TupleTagList.of(PipelineTags.indexedContentToDedupeTag)));

    // The to-dedupe branch is turned into table rows.
    PCollection<TableRow> duplicateRows = partitioned
        .get(PipelineTags.indexedContentToDedupeTag)
        .apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));

    return new ContentDuplicateOrNot(
        partitioned.get(PipelineTags.indexedContentNotToDedupeTag),
        duplicateRows);
}
示例8: testCachePipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Sends a single key/value pair (with a null value) through {@code CacheWriter} configured
 * with the {@code /root} resource directory and expects the same pair to come out.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testCachePipeline() throws Exception {
    final String repositoryUri = getClass().getResource("/root").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryUri);
    final KV<String, String> element = KV.of("trellis:repository/resource", null);

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> written =
        created.apply(ParDo.of(new CacheWriter(config)));

    // The writer is expected to pass the element through unchanged.
    PAssert.that(written).containsInAnyOrder(asList(element));

    pipeline.run();
}
示例9: testUnableToCachePipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Sends a pair keyed {@code trellis:repository/some-other-resource} through
 * {@code CacheWriter} configured with the {@code /root} resource directory and expects no
 * output.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnableToCachePipeline() throws Exception {
    final String repositoryUri = getClass().getResource("/root").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryUri);
    final KV<String, String> element = KV.of("trellis:repository/some-other-resource", null);

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> written =
        created.apply(ParDo.of(new CacheWriter(config)));

    // Nothing should be emitted for this key.
    PAssert.that(written).empty();

    pipeline.run();
}
示例10: testInvalidDirectoryPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Configures {@code CacheWriter} with a map whose only key is {@code "foo"} (not
 * {@code "repository"}) and expects no output for the input pair.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDirectoryPipeline() throws Exception {
    final String directoryUri = getClass().getResource("/root").toURI().toString();
    // Stored under "foo", so the configuration contains no "repository" entry.
    final Map<String, String> config = singletonMap("foo", directoryUri);
    final KV<String, String> element = KV.of("trellis:repository/resource", null);

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> written =
        created.apply(ParDo.of(new CacheWriter(config)));

    PAssert.that(written).empty();

    pipeline.run();
}
示例11: testInvalidDataPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds {@code BeamProcessor} a dataset string whose last IRI is cut off (it has no
 * closing {@code >}) and expects no output.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated: the final IRI is never terminated.
    final String truncatedDataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final String repositoryUri = getClass().getResource("/dataDirectory").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryUri);

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> processed = created.apply(
        ParDo.of(new BeamProcessor(config, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(processed).empty();

    pipeline.run();
}
示例12: testInvalidDataPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds {@code EventProcessor} a dataset string whose last IRI is cut off (it has no
 * closing {@code >}) and expects no output.
 *
 * @throws Exception if the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated: the final IRI is never terminated.
    final String truncatedDataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final Map<String, String> config = singletonMap("repository", "http://localhost/");

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> processed =
        created.apply(ParDo.of(new EventProcessor(config)));

    PAssert.that(processed).empty();

    pipeline.run();
}
示例13: testInvalidDataPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds {@code BeamProcessor} (configured with a plain URI string) a dataset string whose
 * last IRI is cut off (it has no closing {@code >}) and expects no output.
 *
 * @throws Exception if the resource URL cannot be converted to a URI or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated: the final IRI is never terminated.
    final String truncatedDataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final String directoryUri = getClass().getResource("/dataDirectory").toURI().toString();

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> processed = created.apply(
        ParDo.of(new BeamProcessor(directoryUri, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(processed).empty();

    pipeline.run();
}
示例14: testInvalidDataPipeline
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds {@code EventProcessor} (configured with a plain URL string) a dataset string whose
 * last IRI is cut off (it has no closing {@code >}) and expects no output.
 *
 * @throws Exception if the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    // Deliberately truncated: the final IRI is never terminated.
    final String truncatedDataset = "<trellis:repository/resource> " +
        "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> " +
        "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final String baseUrl = "http://localhost/";

    final PCollection<KV<String, String>> created =
        pipeline.apply("Create", Create.of(element));
    final PCollection<KV<String, String>> processed =
        created.apply(ParDo.of(new EventProcessor(baseUrl)));

    PAssert.that(processed).empty();

    pipeline.run();
}
示例15: testTransformer
import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Reads the {@code /dummy-log} resource into a string, feeds three copies of it through
 * {@code LogAggregator.LogTransformer}, and checks the resulting rows with
 * {@code HasColumnsCheckerFn}.
 *
 * @throws Exception if closing the resource stream fails
 */
@Test
public void testTransformer() throws Exception {
    // Read the fixture and convert it to a JSON string. try-with-resources guarantees the
    // stream is closed even if readStream throws; closing an already-closed stream is a
    // no-op, so this is safe whether or not readStream closes it itself.
    final String json;
    try (InputStream inputStream = this.getClass().getResourceAsStream("/dummy-log")) {
        json = readStream(inputStream);
    }

    LogAggregator.LogTransformer transformer = new LogAggregator.LogTransformer();

    // The same record three times over.
    List<String> inputs = Arrays.asList(json, json, json);
    PCollection<String> inputCollection = pipeline.apply(Create.of(inputs));

    PCollection<TableRow> outputCollection = inputCollection.apply(transformer);

    PAssert.that(outputCollection).satisfies(new HasColumnsCheckerFn());

    pipeline.run().waitUntilFinish();
}