当前位置: 首页>>代码示例>>Java>>正文


Java PCollection类代码示例

本文整理汇总了Java中org.apache.beam.sdk.values.PCollection的典型用法代码示例。如果您正苦于以下问题：Java PCollection类的具体用法？Java PCollection怎么用？Java PCollection使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


PCollection类属于org.apache.beam.sdk.values包，在下文中一共展示了PCollection类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: filterAlreadyProcessedUrls

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Passes the input content through {@code FilterProcessedUrls}, giving that DoFn a
 * map-shaped side input built from a BigQuery query over processed URLs.
 *
 * <p>The query text comes from
 * {@code IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options)}; its rows are
 * converted to {@code KV<String, Long>} pairs by {@code GetUrlFn} and then materialized
 * as a {@code Map} view. NOTE(review): presumably the key is the URL and the DoFn drops
 * matching items -- confirm against {@code GetUrlFn} and {@code FilterProcessedUrls}.
 *
 * @param readContent the content items to be filtered
 * @param pipeline the pipeline on which the BigQuery read is applied
 * @param options pipeline options used to build the BigQuery query
 * @return the output of the {@code FilterProcessedUrls} step
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
		PCollection<InputContent> readContent, Pipeline pipeline, 
		IndexerPipelineOptions options) {
	final String processedUrlsQuery =
		IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);

	// Read the processed-URL rows and turn each one into a key/value pair.
	final PCollection<KV<String,Long>> processedUrls = pipeline
		.apply("Get processed URLs", BigQueryIO.read().fromQuery(processedUrlsQuery))
		.apply(ParDo.of(new GetUrlFn()));

	// Expose the pairs as a map so the filtering DoFn can consult it as a side input.
	final PCollectionView<Map<String,Long>> processedUrlsView =
		processedUrls.apply(View.<String,Long>asMap());

	return readContent.apply(
		ParDo.of(new FilterProcessedUrls(processedUrlsView))
			.withSideInputs(processedUrlsView));
}
 
开发者ID:GoogleCloudPlatform,项目名称:dataflow-opinion-analysis,代码行数:24,代码来源:IndexerPipeline.java

示例2: testUnwritableRemoveContainerPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Runs {@code CONTAINER_KV} through a {@code BeamProcessor} (constructed with the flag
 * {@code false}) while the {@code /dataDirectory2} resource directory is read-only, and
 * expects the processor to emit nothing.
 *
 * <p>The test is skipped (via {@code assumeTrue}) when the directory cannot be made
 * read-only. Write permission is restored in a {@code finally} block so that a failing
 * pipeline run does not leave the directory read-only for subsequent tests.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnwritableRemoveContainerPipeline() throws Exception {

    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());

    final File root = new File(getClass().getResource("/dataDirectory2").toURI());

    assumeTrue(root.setReadOnly());

    try {
        final PCollection<KV<String, String>> pCollection = pipeline
            .apply("Create", Create.of(CONTAINER_KV))
            .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), false)));

        PAssert.that(pCollection).empty();

        pipeline.run();
    } finally {
        // Always undo setReadOnly(), even when the pipeline run or assertion throws.
        root.setWritable(true);
    }
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:21,代码来源:BeamProcessorTest.java

示例3: testUnwritableAddContainerPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Runs {@code CONTAINER_KV} through a {@code BeamProcessor} (constructed with the flag
 * {@code true}) while the {@code /dataDirectory2} resource directory is read-only, and
 * expects the processor to emit nothing.
 *
 * <p>The test is skipped (via {@code assumeTrue}) when the directory cannot be made
 * read-only. Write permission is restored in a {@code finally} block so that a failing
 * pipeline run does not leave the directory read-only for subsequent tests.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnwritableAddContainerPipeline() throws Exception {

    final Map<String, String> dataConfiguration = singletonMap("repository",
            getClass().getResource("/dataDirectory2").toURI().toString());

    final File root = new File(getClass().getResource("/dataDirectory2").toURI());

    assumeTrue(root.setReadOnly());

    try {
        final PCollection<KV<String, String>> pCollection = pipeline
            .apply("Create", Create.of(CONTAINER_KV))
            .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), true)));

        PAssert.that(pCollection).empty();

        pipeline.run();
    } finally {
        // Always undo setReadOnly(), even when the pipeline run or assertion throws.
        root.setWritable(true);
    }
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:21,代码来源:BeamProcessorTest.java

示例4: buildBeamPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Builds the Beam stage for this relational node: obtains the upstream collection from
 * the input node, applies a {@code BeamSqlProjectFn} to it under this node's stage name,
 * and sets the record coder derived from this node's row type on the result.
 *
 * @param inputPCollections the input collections handed down to the upstream node
 * @param sqlEnv the SQL environment handed down to the upstream node
 * @return the collection produced by the projection step
 * @throws Exception if building the upstream pipeline fails
 */
@Override
public PCollection<BeamRecord> buildBeamPipeline(
    PCollectionTuple inputPCollections, BeamSqlEnv sqlEnv) throws Exception {
  final RelNode relInput = getInput();
  final String stepName = BeamSqlRelUtils.getStageName(this);

  final PCollection<BeamRecord> upstreamRecords = BeamSqlRelUtils
      .getBeamRelInput(relInput)
      .buildBeamPipeline(inputPCollections, sqlEnv);

  final BeamSqlExpressionExecutor expressionExecutor = new BeamSqlFnExecutor(this);
  final BeamSqlProjectFn projectFn = new BeamSqlProjectFn(
      getRelTypeName(), expressionExecutor, CalciteUtils.toBeamRowType(rowType));

  final PCollection<BeamRecord> projected =
      upstreamRecords.apply(stepName, ParDo.of(projectFn));
  // The output coder is set explicitly from this node's row type.
  projected.setCoder(CalciteUtils.toBeamRowType(getRowType()).getRecordCoder());
  return projected;
}
 
开发者ID:apache,项目名称:beam,代码行数:19,代码来源:BeamProjectRel.java

示例5: enrichWithCNLP

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Splits the input into two tagged branches with {@code SplitAB}, applies
 * {@code EnrichWithCNLPEntities} to branch B only, and flattens branch A together with
 * the enriched branch B into a single collection.
 *
 * @param indexes the content index summaries to process
 * @param ratio value passed to {@code SplitAB}; NOTE(review): presumably the share of
 *     elements routed to one of the branches -- confirm against {@code SplitAB}
 * @return the flattened union of branch A and the enriched branch B
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {

	final PCollectionTuple branches = indexes.apply(
		ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

	final PCollection<ContentIndexSummary> passThrough = branches.get(PipelineTags.BranchA);
	final PCollection<ContentIndexSummary> toEnrich = branches.get(PipelineTags.BranchB);

	// Only branch B goes through the enrichment step.
	final PCollection<ContentIndexSummary> enriched =
		toEnrich.apply(ParDo.of(new EnrichWithCNLPEntities()));

	// Recombine the untouched branch with the enriched one.
	return PCollectionList.of(passThrough)
		.and(enriched)
		.apply(Flatten.<ContentIndexSummary>pCollections());
}
 
开发者ID:GoogleCloudPlatform,项目名称:dataflow-opinion-analysis,代码行数:28,代码来源:FileIndexerPipeline.java

示例6: enrichWithCNLP

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Splits the input into two tagged branches with {@code SplitAB}, applies
 * {@code EnrichWithCNLPEntities} to branch B only, and flattens branch A together with
 * the enriched branch B into a single collection.
 *
 * @param filteredIndexes the content index summaries to process
 * @param ratio value passed to {@code SplitAB}; NOTE(review): presumably the share of
 *     elements routed to one of the branches -- confirm against {@code SplitAB}
 * @return the flattened union of branch A and the enriched branch B
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {

	final PCollectionTuple branches = filteredIndexes.apply(
		ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

	final PCollection<ContentIndexSummary> passThrough = branches.get(PipelineTags.BranchA);
	final PCollection<ContentIndexSummary> toEnrich = branches.get(PipelineTags.BranchB);

	// Only branch B goes through the enrichment step.
	final PCollection<ContentIndexSummary> enriched =
		toEnrich.apply(ParDo.of(new EnrichWithCNLPEntities()));

	// Recombine the untouched branch with the enriched one.
	return PCollectionList.of(passThrough)
		.and(enriched)
		.apply(Flatten.<ContentIndexSummary>pCollections());
}
 
开发者ID:GoogleCloudPlatform,项目名称:dataflow-opinion-analysis,代码行数:28,代码来源:IndexerPipeline.java

示例7: filterSoftDuplicates

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Keys the index summaries with {@code GetContentIndexSummaryKeyFn}, groups them by that
 * key, and runs {@code EliminateTextDupes} to split the groups into two tagged outputs.
 *
 * <p>The {@code indexedContentToDedupeTag} output is additionally converted to
 * {@code TableRow}s by {@code CreateWebresourceTableRowFromDupeIndexSummaryFn}.
 *
 * @param indexes the document index summaries to de-duplicate
 * @return a holder built from the {@code indexedContentNotToDedupeTag} output and the
 *     table rows derived from the {@code indexedContentToDedupeTag} output
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
		PCollection<ContentIndexSummary> indexes) {

	final PCollectionTuple split = indexes
		.apply("Extract Text grouping key",
			ParDo.of(new GetContentIndexSummaryKeyFn()))
		.apply("Group by Text grouping key",
			GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
		.apply("Eliminate Text dupes",
			ParDo.of(new EliminateTextDupes())
				.withOutputTags(
					PipelineTags.indexedContentNotToDedupeTag,
					TupleTagList.of(PipelineTags.indexedContentToDedupeTag)));

	// Side output: turn the items tagged for de-duplication into table rows.
	final PCollection<TableRow> duplicateRows = split
		.get(PipelineTags.indexedContentToDedupeTag)
		.apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));

	return new ContentDuplicateOrNot(
		split.get(PipelineTags.indexedContentNotToDedupeTag),
		duplicateRows);
}
 
开发者ID:GoogleCloudPlatform,项目名称:dataflow-opinion-analysis,代码行数:28,代码来源:IndexerPipeline.java

示例8: testCachePipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a single key/value pair (with a {@code null} value) through a {@code CacheWriter}
 * configured with the {@code /root} resource directory and expects the same pair to be
 * emitted.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testCachePipeline() throws Exception {
    final String repositoryLocation = getClass().getResource("/root").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryLocation);

    final KV<String, String> element = KV.of("trellis:repository/resource", null);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(new CacheWriter(config)));

    PAssert.that(output).containsInAnyOrder(asList(element));
    pipeline.run();
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:18,代码来源:CacheWriterTest.java

示例9: testUnableToCachePipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair keyed {@code trellis:repository/some-other-resource} through a
 * {@code CacheWriter} configured with the {@code /root} resource directory and expects
 * no output.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testUnableToCachePipeline() throws Exception {
    final String repositoryLocation = getClass().getResource("/root").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryLocation);

    final KV<String, String> element = KV.of("trellis:repository/some-other-resource", null);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(new CacheWriter(config)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:18,代码来源:CacheWriterTest.java

示例10: testInvalidDirectoryPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair through a {@code CacheWriter} whose configuration stores the directory
 * under the key {@code "foo"} rather than {@code "repository"}, and expects no output.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDirectoryPipeline() throws Exception {
    final String directoryLocation = getClass().getResource("/root").toURI().toString();
    // Deliberately not keyed by "repository".
    final Map<String, String> config = singletonMap("foo", directoryLocation);

    final KV<String, String> element = KV.of("trellis:repository/resource", null);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(new CacheWriter(config)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:18,代码来源:CacheWriterTest.java

示例11: testInvalidDataPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair whose value is a truncated statement (the final IRI is cut off and never
 * closed) through a {@code BeamProcessor} configured with the {@code /dataDirectory}
 * resource directory, and expects no output.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    final String repositoryLocation =
        getClass().getResource("/dataDirectory").toURI().toString();
    final Map<String, String> config = singletonMap("repository", repositoryLocation);

    // The last term is intentionally incomplete.
    final String truncatedDataset = "<trellis:repository/resource> "
        + "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> "
        + "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(
            new BeamProcessor(config, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:21,代码来源:BeamProcessorTest.java

示例12: testInvalidDataPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair whose value is a truncated statement (the final IRI is cut off and never
 * closed) through an {@code EventProcessor} configured with the repository
 * {@code http://localhost/}, and expects no output.
 *
 * @throws Exception if the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    final Map<String, String> config = singletonMap("repository", "http://localhost/");

    // The last term is intentionally incomplete.
    final String truncatedDataset = "<trellis:repository/resource> "
        + "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> "
        + "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(new EventProcessor(config)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp-archive,项目名称:trellis-rosid-file-streaming,代码行数:20,代码来源:EventProcessorTest.java

示例13: testInvalidDataPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair whose value is a truncated statement (the final IRI is cut off and never
 * closed) through a {@code BeamProcessor} configured with the {@code /dataDirectory}
 * resource location, and expects no output.
 *
 * @throws Exception if the resource URI cannot be resolved or the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    final String dataLocation = getClass().getResource("/dataDirectory").toURI().toString();

    // The last term is intentionally incomplete.
    final String truncatedDataset = "<trellis:repository/resource> "
        + "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> "
        + "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(
            new BeamProcessor(dataLocation, LDP.PreferContainment.getIRIString(), false)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp,项目名称:trellis-rosid,代码行数:20,代码来源:BeamProcessorTest.java

示例14: testInvalidDataPipeline

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Feeds a pair whose value is a truncated statement (the final IRI is cut off and never
 * closed) through an {@code EventProcessor} constructed with {@code http://localhost/},
 * and expects no output.
 *
 * @throws Exception if the pipeline fails
 */
@Test
@Category(NeedsRunner.class)
public void testInvalidDataPipeline() throws Exception {
    final String baseLocation = "http://localhost/";

    // The last term is intentionally incomplete.
    final String truncatedDataset = "<trellis:repository/resource> "
        + "<http://purl.org/dc/terms/subject> <trellis:repository/resource/member> "
        + "<http://www.w3.org/ns/ldp#PreferConta";
    final KV<String, String> element = KV.of("trellis:repository/resource", truncatedDataset);

    final PCollection<KV<String, String>> output = pipeline
        .apply("Create", Create.of(element))
        .apply(ParDo.of(new EventProcessor(baseLocation)));

    PAssert.that(output).empty();
    pipeline.run();
}
 
开发者ID:trellis-ldp,项目名称:trellis-rosid,代码行数:20,代码来源:EventProcessorTest.java

示例15: testTransformer

import org.apache.beam.sdk.values.PCollection; //导入依赖的package包/类
/**
 * Reads the {@code /dummy-log} classpath resource into a string, feeds three copies of it
 * through a {@code LogAggregator.LogTransformer}, and checks the resulting rows with
 * {@code HasColumnsCheckerFn}.
 *
 * <p>The resource stream is opened in a try-with-resources block so it is closed once its
 * contents have been read.
 *
 * @throws Exception if closing the resource stream fails
 */
@Test
public void testTransformer() throws Exception {
  // Read the fixture file into a string, closing the stream afterwards.
  final String json;
  try (InputStream inputStream = this.getClass().getResourceAsStream("/dummy-log")) {
    json = readStream(inputStream);
  }

  LogAggregator.LogTransformer transformer = new LogAggregator.LogTransformer();

  List<String> inputs = Arrays.asList(json, json, json);
  PCollection<String> inputCollection = pipeline.apply(Create.of(inputs));
  PCollection<TableRow> outputCollection = inputCollection.apply(transformer);
  PAssert.that(outputCollection).satisfies(new HasColumnsCheckerFn());
  pipeline.run().waitUntilFinish();
}
 
开发者ID:yu-iskw,项目名称:google-log-aggregation-example,代码行数:17,代码来源:LogAggregatorTest.java


注:本文中的org.apache.beam.sdk.values.PCollection类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。