This article collects typical usage examples of the Java method org.apache.beam.sdk.values.PCollection.apply. If you are wondering what PCollection.apply does, how to use it, or what it looks like in real code, the curated examples below should help. You can also read more about the enclosing class, org.apache.beam.sdk.values.PCollection.
The following presents 15 code examples of the PCollection.apply method, ordered by popularity by default.
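Before the examples, here is a minimal, self-contained sketch of the basic PCollection.apply pattern: a pipeline creates a PCollection, applies a ParDo, and runs. The class name, step names, and element values are illustrative assumptions, not taken from the examples that follow.
import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class ApplyExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // Transforms are attached to a Pipeline or PCollection via apply().
    PCollection<String> words = p.apply("CreateWords",
        Create.of(Arrays.asList("beam", "pcollection", "apply")));
    // apply() returns a new PCollection, so transforms can be chained.
    words.apply("ToUpperCase",
        ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            c.output(c.element().toUpperCase());
          }
        }));
    p.run().waitUntilFinish();
  }
}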
Example 1: filterAlreadyProcessedUrls
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Filters out input items whose URLs have already been processed, using a map of
 * processed URLs read from BigQuery as a side input.
 *
 * @param readContent the content read from the source
 * @param pipeline the pipeline used to read the already-processed URLs from BigQuery
 * @param options pipeline options used to build the "processed URLs" query
 * @return the content that still needs to be processed
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
PCollection<InputContent> readContent, Pipeline pipeline,
IndexerPipelineOptions options) {
PCollection<InputContent> contentToProcess;
String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
PCollection<KV<String,Long>> alreadyProcessedUrls = pipeline
.apply("Get processed URLs",BigQueryIO.read().fromQuery(query))
.apply(ParDo.of(new GetUrlFn()));
final PCollectionView<Map<String,Long>> alreadyProcessedUrlsSideInput =
alreadyProcessedUrls.apply(View.<String,Long>asMap());
contentToProcess = readContent
.apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
.withSideInputs(alreadyProcessedUrlsSideInput));
return contentToProcess;
}
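For context, a minimal sketch of the side-input pattern used above: a PCollection of key-value pairs is materialized as a Map with View.asMap and read inside a DoFn via c.sideInput. The class name, element types, and filtering condition are illustrative assumptions, not the author's code.
import java.util.Map;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

class SideInputSketch {
  // Drops candidate URLs that already appear in the map of processed URLs.
  static PCollection<String> filterSeen(PCollection<String> urls,
                                        PCollection<KV<String, Long>> processedUrls) {
    final PCollectionView<Map<String, Long>> processedView =
        processedUrls.apply(View.<String, Long>asMap());
    return urls.apply("FilterSeenUrls",
        ParDo.of(new DoFn<String, String>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                Map<String, Long> processed = c.sideInput(processedView);
                if (!processed.containsKey(c.element())) {
                  c.output(c.element());
                }
              }
            })
            .withSideInputs(processedView));
  }
}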
Example 2: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Splits the index summaries into two branches according to the given ratio, enriches
 * one branch with Cloud Natural Language (CNLP) entities, and flattens the branches
 * back together.
 *
 * @param indexes the content index summaries to enrich
 * @param ratio the ratio used by SplitAB to divide elements between the two branches
 * @return the merged collection of index summaries, partially enriched with CNLP entities
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
PCollection<ContentIndexSummary> indexes, Float ratio) {
PCollectionTuple splitAB = indexes
.apply(ParDo.of(new SplitAB(ratio))
.withOutputTags(PipelineTags.BranchA,
TupleTagList.of(PipelineTags.BranchB)));
PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
ParDo.of(new EnrichWithCNLPEntities()));
// Merge the two branches back into a single collection of ContentIndexSummary records
PCollectionList<ContentIndexSummary> contentIndexSummariesList =
PCollectionList.of(branchACol).and(enrichedBCol);
PCollection<ContentIndexSummary> allIndexSummaries =
contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
indexes = allIndexSummaries;
return indexes;
}
Example 3: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Splits the filtered index summaries into two branches according to the given ratio,
 * enriches one branch with Cloud Natural Language (CNLP) entities, and flattens the
 * branches back together.
 *
 * @param filteredIndexes the filtered content index summaries to enrich
 * @param ratio the ratio used by SplitAB to divide elements between the two branches
 * @return the merged collection of index summaries, partially enriched with CNLP entities
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
PCollectionTuple splitAB = filteredIndexes
.apply(ParDo.of(new SplitAB(ratio))
.withOutputTags(PipelineTags.BranchA,
TupleTagList.of(PipelineTags.BranchB)));
PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
ParDo.of(new EnrichWithCNLPEntities()));
// Merge the two branches back into a single collection of ContentIndexSummary records
PCollectionList<ContentIndexSummary> contentIndexSummariesList =
PCollectionList.of(branchACol).and(enrichedBCol);
PCollection<ContentIndexSummary> allIndexSummaries =
contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
filteredIndexes = allIndexSummaries;
return filteredIndexes;
}
Example 4: testTransformer
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Test
public void testTransformer() {
// Read file
InputStream inputStream =
this.getClass().getResourceAsStream("/dummy-log");
// Convert to JSON string
String json = readStream(inputStream);
LogAggregator.LogTransformer transformer = new LogAggregator.LogTransformer();
List<String> inputs = Arrays.asList(json, json, json);
PCollection<String> inputCollection = pipeline.apply(Create.of(inputs));
PCollection<TableRow> outputCollection = inputCollection.apply(transformer);
PAssert.that(outputCollection).satisfies(new HasColumnsCheckerFn());
pipeline.run().waitUntilFinish();
}
Example 5: main
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.create();
options.setRunner(DirectRunner.class); // forced for this demo
Pipeline p = Pipeline.create(options);
// register Avro coders for serializing our messages
Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
PCollection<UntypedOccurrence> verbatimRecords = p.apply(
"Read Avro", AvroIO.read(UntypedOccurrence.class).from("demo/output/data*"));
verbatimRecords.apply("Write file per Genus",
AvroIO.write(UntypedOccurrence.class)
.to("demo/output-split/data*") // prefix, is required but overwritten
.to(new GenusDynamicAvroDestinations(
FileSystems.matchNewResource("demo/output-split/data*", true))));
LOG.info("Starting the pipeline");
PipelineResult result = p.run();
result.waitUntilFinish();
LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 6: main
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public static void main(String[] args) {
Configuration conf = new Configuration(); // assume defaults on CP
conf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
conf.setStrings("mapreduce.input.fileinputformat.inputdir", "hdfs://ha-nn/tmp/dwca-lep5.zip");
conf.setClass("key.class", Text.class, Object.class);
conf.setClass("value.class", ExtendedRecord.class, Object.class);
Pipeline p = newPipeline(args, conf);
Coders.registerAvroCoders(p, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);
PCollection<KV<Text, ExtendedRecord>> rawRecords =
p.apply("Read DwC-A", HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(conf));
PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
"Convert to Avro", ParDo.of(fromExtendedRecordKVP()));
verbatimRecords.apply(
"Write Avro files", AvroIO.write(UntypedOccurrence.class).to("hdfs://ha-nn/tmp/dwca-lep5.avro"));
LOG.info("Starting the pipeline");
PipelineResult result = p.run();
result.waitUntilFinish();
LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 7: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollection<KV<K, V>> expand(PCollection<V> in) {
PCollection<KV<K, V>> result =
in.apply("AddKeys", MapElements.via(new SimpleFunction<V, KV<K, V>>() {
@Override
public KV<K, V> apply(V element) {
return KV.of(fn.apply(element), element);
}
}));
try {
Coder<K> keyCoder;
CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
if (keyClass == null) {
keyCoder = coderRegistry.getOutputCoder(fn, in.getCoder());
} else {
keyCoder = coderRegistry.getCoder(TypeDescriptor.of(keyClass));
}
// TODO: Remove when we can set the coder inference context.
result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
} catch (CannotProvideCoderException exc) {
// let lazy coder inference have a try
}
return result;
}
Example 8: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollectionList<T> expand(PCollection<T> in) {
final TupleTagList outputTags = partitionDoFn.getOutputTags();
PCollectionTuple outputs = in.apply(
ParDo
.of(partitionDoFn)
.withOutputTags(new TupleTag<Void>(){}, outputTags));
PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
Coder<T> coder = in.getCoder();
for (TupleTag<?> outputTag : outputTags.getAll()) {
// All the tuple tags are actually TupleTag<T>
// And all the collections are actually PCollection<T>
@SuppressWarnings("unchecked")
TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
}
return pcs;
}
Example 9: applyTyped
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
private PCollection<Event> applyTyped(PCollection<Event> events) {
final Coder<Event> coder = events.getCoder();
return events
// Force round trip through coder.
.apply(name + ".Serialize",
ParDo.of(new DoFn<Event, Event>() {
private final Counter bytesMetric =
Metrics.counter(name, "bytes");
@ProcessElement
public void processElement(ProcessContext c) throws CoderException, IOException {
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
coder.encode(c.element(), outStream, Coder.Context.OUTER);
byte[] byteArray = outStream.toByteArray();
bytesMetric.inc((long) byteArray.length);
ByteArrayInputStream inStream = new ByteArrayInputStream(byteArray);
Event event = coder.decode(inStream, Coder.Context.OUTER);
c.output(event);
}
}));
}
Example 10: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
PCollection<CompletionCandidate> candidates = input
// First count how often each token appears.
.apply(Count.<String>perElement())
// Map the KV outputs of Count into our own CompletionCandidate class.
.apply("CreateCompletionCandidates", ParDo.of(
new DoFn<KV<String, Long>, CompletionCandidate>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
}
}));
// Compute the top via either a flat or recursive algorithm.
if (recursive) {
return candidates
.apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
.apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
} else {
return candidates
.apply(new ComputeTopFlat(candidatesPerPrefix, 1));
}
}
Example 11: createInput
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Converts the given list into a PCollection, attaching the supplied timestamps when
 * provided, and passes it through an identity ParDo.
 */
private PCollection<KV<Integer, String>> createInput(String name,
Pipeline p, List<KV<Integer, String>> list, List<Long> timestamps) {
PCollection<KV<Integer, String>> input;
if (timestamps.isEmpty()) {
input = p.apply("Create" + name, Create.of(list)
.withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of())));
} else {
input = p.apply("Create" + name, Create.timestamped(list, timestamps)
.withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of())));
}
return input.apply(
"Identity" + name,
ParDo.of(
new DoFn<KV<Integer, String>, KV<Integer, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(c.element());
}
}));
}
Example 12: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public PCollection<String> expand(PCollection<String> in) {
return in.apply(
ParDo.of(
new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String[] items = pattern.split(c.element());
for (String item : items) {
if (outputEmpty || !item.isEmpty()) {
c.output(item);
}
}
}
}));
}
Example 13: applyForSingleton
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
static <T, FinalT, ViewT, W extends BoundedWindow> PCollection<?>
applyForSingleton(
DataflowRunner runner,
PCollection<T> input,
DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
IsmRecord<WindowedValue<FinalT>>> doFn,
Coder<FinalT> defaultValueCoder,
PCollectionView<ViewT> view) {
@SuppressWarnings("unchecked")
Coder<W> windowCoder = (Coder<W>)
input.getWindowingStrategy().getWindowFn().windowCoder();
IsmRecordCoder<WindowedValue<FinalT>> ismCoder =
coderForSingleton(windowCoder, defaultValueCoder);
PCollection<IsmRecord<WindowedValue<FinalT>>> reifiedPerWindowAndSorted = input
.apply(new GroupByWindowHashAsKeyAndWindowAsSortKey<T, W>(ismCoder))
.apply(ParDo.of(doFn));
reifiedPerWindowAndSorted.setCoder(ismCoder);
runner.addPCollectionRequiringIndexedFormat(reifiedPerWindowAndSorted);
reifiedPerWindowAndSorted.apply(
CreateDataflowView.<IsmRecord<WindowedValue<FinalT>>, ViewT>forBatch(view));
return reifiedPerWindowAndSorted;
}
Example 14: testEndToEndPipeline
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Test
public void testEndToEndPipeline() throws Exception {
ImmutableList<TableRow> inputRows =
ImmutableList.of(
new TableRow(),
new TableRow().set("requestPath", "a/path"),
new TableRow().set("requestPath", "b/path"),
new TableRow().set("requestPath", "b/path"),
new TableRow().set("anotherValue", "b/path"));
PCollection<TableRow> input = p.apply(Create.of(inputRows));
PCollection<String> output = input.apply(new CountRequestPaths());
ImmutableList<String> outputStrings = new ImmutableList.Builder<String>()
.add("a/path: 1")
.add("b/path: 2")
.add("null: 2")
.build();
PAssert.that(output).containsInAnyOrder(outputStrings);
p.run();
}
Example 15: indexDocuments
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Indexes the given content, emitting successfully indexed documents on the main output
 * and, if a Bigtable admin DB is configured, writing failures to a dead-letter table.
 *
 * @param options pipeline options, including the optional Bigtable indexer admin DB
 * @param contentToIndex the input content to index
 * @return the index summaries of the successfully indexed documents
 */
private static PCollection<ContentIndexSummary> indexDocuments(
IndexerPipelineOptions options,
PCollection<InputContent> contentToIndex) {
PCollectionTuple alldocuments = contentToIndex
.apply(ParDo.of(new IndexDocument())
.withOutputTags(PipelineTags.successfullyIndexed, // main output
TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output
PCollection<ContentIndexSummary> indexes = alldocuments
.get(PipelineTags.successfullyIndexed)
.setCoder(AvroCoder.of(ContentIndexSummary.class));
// if the Bigtable admin DB is set, write into dead letter table
if (options.getBigtableIndexerAdminDB() != null) {
PCollection<InputContent> unprocessedDocuments = alldocuments
.get(PipelineTags.unsuccessfullyIndexed);
BigtableOptions.Builder optionsBuilder =
new BigtableOptions.Builder()
.setProjectId(options.getProject())
.setInstanceId(options.getBigtableIndexerAdminDB());
BigtableOptions bigtableOptions = optionsBuilder.build();
unprocessedDocuments
.apply(ParDo.of(new CreateDeadLetterEntries()))
.apply("Write to Dead Letter table in Bigtable", BigtableIO.write()
.withBigtableOptions(bigtableOptions)
.withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE));
}
return indexes;
}