This article collects typical usage examples of the Java method org.apache.beam.sdk.values.PCollection.apply. If you are wondering what PCollection.apply does, how to use it, or what it looks like in real code, the curated examples below should help. You can also read more about the enclosing class, org.apache.beam.sdk.values.PCollection.
The following presents 15 code examples of the PCollection.apply method, ordered by popularity by default.
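Before the examples, here is a minimal, self-contained sketch of the basic PCollection.apply pattern: a pipeline creates a PCollection, applies a ParDo, and runs. The class name, step names, and element values are illustrative assumptions, not taken from the examples that follow.
import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class ApplyExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    // Transforms are attached to a Pipeline or PCollection via apply().
    PCollection<String> words = p.apply("CreateWords",
        Create.of(Arrays.asList("beam", "pcollection", "apply")));
    // apply() returns a new PCollection, so transforms can be chained.
    words.apply("ToUpperCase",
        ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            c.output(c.element().toUpperCase());
          }
        }));
    p.run().waitUntilFinish();
  }
}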
Example 1: filterAlreadyProcessedUrls
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Filters out input items whose URLs have already been processed, using a map of
 * processed URLs read from BigQuery as a side input.
 *
 * @param readContent the content read from the source
 * @param pipeline the pipeline used to read the already-processed URLs from BigQuery
 * @param options pipeline options used to build the "processed URLs" query
 * @return the content that still needs to be processed
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
PCollection<InputContent> readContent, Pipeline pipeline,
IndexerPipelineOptions options) {
PCollection<InputContent> contentToProcess;
String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
PCollection<KV<String,Long>> alreadyProcessedUrls = pipeline
.apply("Get processed URLs",BigQueryIO.read().fromQuery(query))
.apply(ParDo.of(new GetUrlFn()));
final PCollectionView<Map<String,Long>> alreadyProcessedUrlsSideInput =
alreadyProcessedUrls.apply(View.<String,Long>asMap());
contentToProcess = readContent
.apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
.withSideInputs(alreadyProcessedUrlsSideInput));
return contentToProcess;
}
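For context, a minimal sketch of the side-input pattern used above: a PCollection of key-value pairs is materialized as a Map with View.asMap and read inside a DoFn via c.sideInput. The class name, element types, and filtering condition are illustrative assumptions, not the author's code.
import java.util.Map;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

class SideInputSketch {
  // Drops candidate URLs that already appear in the map of processed URLs.
  static PCollection<String> filterSeen(PCollection<String> urls,
                                        PCollection<KV<String, Long>> processedUrls) {
    final PCollectionView<Map<String, Long>> processedView =
        processedUrls.apply(View.<String, Long>asMap());
    return urls.apply("FilterSeenUrls",
        ParDo.of(new DoFn<String, String>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                Map<String, Long> processed = c.sideInput(processedView);
                if (!processed.containsKey(c.element())) {
                  c.output(c.element());
                }
              }
            })
            .withSideInputs(processedView));
  }
}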
Example 2: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Splits the index summaries into two branches according to the given ratio, enriches
 * one branch with Cloud Natural Language (CNLP) entities, and flattens the branches
 * back together.
 *
 * @param indexes the content index summaries to enrich
 * @param ratio the ratio used by SplitAB to divide elements between the two branches
 * @return the merged collection of index summaries, partially enriched with CNLP entities
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
PCollection<ContentIndexSummary> indexes, Float ratio) {
PCollectionTuple splitAB = indexes
.apply(ParDo.of(new SplitAB(ratio))
.withOutputTags(PipelineTags.BranchA,
TupleTagList.of(PipelineTags.BranchB)));
PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
ParDo.of(new EnrichWithCNLPEntities()));
// Merge the two branches back into a single collection of ContentIndexSummary records
PCollectionList<ContentIndexSummary> contentIndexSummariesList =
PCollectionList.of(branchACol).and(enrichedBCol);
PCollection<ContentIndexSummary> allIndexSummaries =
contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
indexes = allIndexSummaries;
return indexes;
}
Example 3: enrichWithCNLP
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Splits the filtered index summaries into two branches according to the given ratio,
 * enriches one branch with Cloud Natural Language (CNLP) entities, and flattens the
 * branches back together.
 *
 * @param filteredIndexes the filtered content index summaries to enrich
 * @param ratio the ratio used by SplitAB to divide elements between the two branches
 * @return the merged collection of index summaries, partially enriched with CNLP entities
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
PCollectionTuple splitAB = filteredIndexes
.apply(ParDo.of(new SplitAB(ratio))
.withOutputTags(PipelineTags.BranchA,
TupleTagList.of(PipelineTags.BranchB)));
PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
ParDo.of(new EnrichWithCNLPEntities()));
// Merge the two branches back into a single collection of ContentIndexSummary records
PCollectionList<ContentIndexSummary> contentIndexSummariesList =
PCollectionList.of(branchACol).and(enrichedBCol);
PCollection<ContentIndexSummary> allIndexSummaries =
contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());
filteredIndexes = allIndexSummaries;
return filteredIndexes;
}
Example 4: testTransformer
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Test
public void testTransformer() {
// Read file
InputStream inputStream =
this.getClass().getResourceAsStream("/dummy-log");
// Convert to JSON string
String json = readStream(inputStream);
LogAggregator.LogTransformer transformer = new LogAggregator.LogTransformer();
List<String> inputs = Arrays.asList(json, json, json);
PCollection<String> inputCollection = pipeline.apply(Create.of(inputs));
PCollection<TableRow> outputCollection = inputCollection.apply(transformer);
PAssert.that(outputCollection).satisfies(new HasColumnsCheckerFn());
pipeline.run().waitUntilFinish();
}
Example 5: main
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.create();
options.setRunner(DirectRunner.class); // forced for this demo
Pipeline p = Pipeline.create(options);
// register Avro coders for serializing our messages
Coders.registerAvroCoders(p, ExtendedRecord.class, UntypedOccurrence.class);
PCollection<UntypedOccurrence> verbatimRecords = p.apply(
"Read Avro", AvroIO.read(UntypedOccurrence.class).from("demo/output/data*"));
verbatimRecords.apply("Write file per Genus",
AvroIO.write(UntypedOccurrence.class)
.to("demo/output-split/data*") // prefix, is required but overwritten
.to(new GenusDynamicAvroDestinations(
FileSystems.matchNewResource("demo/output-split/data*", true))));
LOG.info("Starting the pipeline");
PipelineResult result = p.run();
result.waitUntilFinish();
LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 6: main
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public static void main(String[] args) {
Configuration conf = new Configuration(); // assume defaults on CP
conf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
conf.setStrings("mapreduce.input.fileinputformat.inputdir", "hdfs://ha-nn/tmp/dwca-lep5.zip");
conf.setClass("key.class", Text.class, Object.class);
conf.setClass("value.class", ExtendedRecord.class, Object.class);
Pipeline p = newPipeline(args, conf);
Coders.registerAvroCoders(p, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);
PCollection<KV<Text, ExtendedRecord>> rawRecords =
p.apply("Read DwC-A", HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(conf));
PCollection<UntypedOccurrence> verbatimRecords = rawRecords.apply(
"Convert to Avro", ParDo.of(fromExtendedRecordKVP()));
verbatimRecords.apply(
"Write Avro files", AvroIO.write(UntypedOccurrence.class).to("hdfs://ha-nn/tmp/dwca-lep5.avro"));
LOG.info("Starting the pipeline");
PipelineResult result = p.run();
result.waitUntilFinish();
LOG.info("Pipeline finished with state: {} ", result.getState());
}
Example 7: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollection<KV<K, V>> expand(PCollection<V> in) {
PCollection<KV<K, V>> result =
in.apply("AddKeys", MapElements.via(new SimpleFunction<V, KV<K, V>>() {
@Override
public KV<K, V> apply(V element) {
return KV.of(fn.apply(element), element);
}
}));
try {
Coder<K> keyCoder;
CoderRegistry coderRegistry = in.getPipeline().getCoderRegistry();
if (keyClass == null) {
keyCoder = coderRegistry.getOutputCoder(fn, in.getCoder());
} else {
keyCoder = coderRegistry.getCoder(TypeDescriptor.of(keyClass));
}
// TODO: Remove when we can set the coder inference context.
result.setCoder(KvCoder.of(keyCoder, in.getCoder()));
} catch (CannotProvideCoderException exc) {
// let lazy coder inference have a try
}
return result;
}
Example 8: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollectionList<T> expand(PCollection<T> in) {
final TupleTagList outputTags = partitionDoFn.getOutputTags();
PCollectionTuple outputs = in.apply(
ParDo
.of(partitionDoFn)
.withOutputTags(new TupleTag<Void>(){}, outputTags));
PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
Coder<T> coder = in.getCoder();
for (TupleTag<?> outputTag : outputTags.getAll()) {
// All the tuple tags are actually TupleTag<T>
// And all the collections are actually PCollection<T>
@SuppressWarnings("unchecked")
TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
}
return pcs;
}
Example 9: applyTyped
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
private PCollection<Event> applyTyped(PCollection<Event> events) {
final Coder<Event> coder = events.getCoder();
return events
// Force round trip through coder.
.apply(name + ".Serialize",
ParDo.of(new DoFn<Event, Event>() {
private final Counter bytesMetric =
Metrics.counter(name, "bytes");
@ProcessElement
public void processElement(ProcessContext c) throws CoderException, IOException {
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
coder.encode(c.element(), outStream, Coder.Context.OUTER);
byte[] byteArray = outStream.toByteArray();
bytesMetric.inc((long) byteArray.length);
ByteArrayInputStream inStream = new ByteArrayInputStream(byteArray);
Event event = coder.decode(inStream, Coder.Context.OUTER);
c.output(event);
}
}));
}
Example 10: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Override
public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
PCollection<CompletionCandidate> candidates = input
// First count how often each token appears.
.apply(Count.<String>perElement())
// Map the KV outputs of Count into our own CompletionCandidate class.
.apply("CreateCompletionCandidates", ParDo.of(
new DoFn<KV<String, Long>, CompletionCandidate>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
}
}));
// Compute the top via either a flat or recursive algorithm.
if (recursive) {
return candidates
.apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
.apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
} else {
return candidates
.apply(new ComputeTopFlat(candidatesPerPrefix, 1));
}
}
Example 11: createInput
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Converts the given list into a PCollection, attaching the supplied timestamps when
 * provided, and passes it through an identity ParDo.
 */
private PCollection<KV<Integer, String>> createInput(String name,
Pipeline p, List<KV<Integer, String>> list, List<Long> timestamps) {
PCollection<KV<Integer, String>> input;
if (timestamps.isEmpty()) {
input = p.apply("Create" + name, Create.of(list)
.withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of())));
} else {
input = p.apply("Create" + name, Create.timestamped(list, timestamps)
.withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of())));
}
return input.apply(
"Identity" + name,
ParDo.of(
new DoFn<KV<Integer, String>, KV<Integer, String>>() {
@ProcessElement
public void processElement(ProcessContext c) {
c.output(c.element());
}
}));
}
Example 12: expand
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
public PCollection<String> expand(PCollection<String> in) {
return in.apply(
ParDo.of(
new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
String[] items = pattern.split(c.element());
for (String item : items) {
if (outputEmpty || !item.isEmpty()) {
c.output(item);
}
}
}
}));
}
Example 13: applyForSingleton
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
static <T, FinalT, ViewT, W extends BoundedWindow> PCollection<?>
applyForSingleton(
DataflowRunner runner,
PCollection<T> input,
DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
IsmRecord<WindowedValue<FinalT>>> doFn,
Coder<FinalT> defaultValueCoder,
PCollectionView<ViewT> view) {
@SuppressWarnings("unchecked")
Coder<W> windowCoder = (Coder<W>)
input.getWindowingStrategy().getWindowFn().windowCoder();
IsmRecordCoder<WindowedValue<FinalT>> ismCoder =
coderForSingleton(windowCoder, defaultValueCoder);
PCollection<IsmRecord<WindowedValue<FinalT>>> reifiedPerWindowAndSorted = input
.apply(new GroupByWindowHashAsKeyAndWindowAsSortKey<T, W>(ismCoder))
.apply(ParDo.of(doFn));
reifiedPerWindowAndSorted.setCoder(ismCoder);
runner.addPCollectionRequiringIndexedFormat(reifiedPerWindowAndSorted);
reifiedPerWindowAndSorted.apply(
CreateDataflowView.<IsmRecord<WindowedValue<FinalT>>, ViewT>forBatch(view));
return reifiedPerWindowAndSorted;
}
Example 14: testEndToEndPipeline
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
@Test
public void testEndToEndPipeline() throws Exception {
ImmutableList<TableRow> inputRows =
ImmutableList.of(
new TableRow(),
new TableRow().set("requestPath", "a/path"),
new TableRow().set("requestPath", "b/path"),
new TableRow().set("requestPath", "b/path"),
new TableRow().set("anotherValue", "b/path"));
PCollection<TableRow> input = p.apply(Create.of(inputRows));
PCollection<String> output = input.apply(new CountRequestPaths());
ImmutableList<String> outputStrings = new ImmutableList.Builder<String>()
.add("a/path: 1")
.add("b/path: 2")
.add("null: 2")
.build();
PAssert.that(output).containsInAnyOrder(outputStrings);
p.run();
}
Example 15: indexDocuments
import org.apache.beam.sdk.values.PCollection; // import the package/class this method depends on
/**
 * Indexes the given content, emitting successfully indexed documents on the main output
 * and, if a Bigtable admin DB is configured, writing failures to a dead-letter table.
 *
 * @param options pipeline options, including the optional Bigtable indexer admin DB
 * @param contentToIndex the input content to index
 * @return the index summaries of the successfully indexed documents
 */
private static PCollection<ContentIndexSummary> indexDocuments(
IndexerPipelineOptions options,
PCollection<InputContent> contentToIndex) {
PCollectionTuple alldocuments = contentToIndex
.apply(ParDo.of(new IndexDocument())
.withOutputTags(PipelineTags.successfullyIndexed, // main output
TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output
PCollection<ContentIndexSummary> indexes = alldocuments
.get(PipelineTags.successfullyIndexed)
.setCoder(AvroCoder.of(ContentIndexSummary.class));
// if the Bigtable admin DB is set, write into dead letter table
if (options.getBigtableIndexerAdminDB() != null) {
PCollection<InputContent> unprocessedDocuments = alldocuments
.get(PipelineTags.unsuccessfullyIndexed);
BigtableOptions.Builder optionsBuilder =
new BigtableOptions.Builder()
.setProjectId(options.getProject())
.setInstanceId(options.getBigtableIndexerAdminDB());
BigtableOptions bigtableOptions = optionsBuilder.build();
unprocessedDocuments
.apply(ParDo.of(new CreateDeadLetterEntries()))
.apply("Write to Dead Letter table in Bigtable", BigtableIO.write()
.withBigtableOptions(bigtableOptions)
.withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE));
}
return indexes;
}