This article collects typical usage examples of the Java class org.apache.beam.sdk.io.TextIO. If you are wondering how the TextIO class is used in practice, the curated examples below may help.
The TextIO class belongs to the org.apache.beam.sdk.io package. The sections below present 15 code examples of the TextIO class, sorted by popularity by default.
Example 1: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(Options.class);
  options.setRunner(FlinkRunner.class);

  Pipeline p = Pipeline.create(options);

  KafkaIO.Read<byte[], String> kafkaIOReader = KafkaIO.read()
      .withBootstrapServers("192.168.99.100:32771")
      .withTopics(Arrays.asList("beam".split(",")))
      .updateConsumerProperties(ImmutableMap.of("auto.offset.reset", (Object) "earliest"))
      .withValueCoder(StringUtf8Coder.of());

  p.apply(kafkaIOReader.withoutMetadata())
      .apply(Values.<String>create())
      .apply(Window.<String>into(
          FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      // Note: TextIO.Write.to(...) is the pre-2.0 Beam style; in Beam 2.x this is TextIO.write().to(...).
      .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

  p.run();
}
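The Options interface used above is not part of the excerpt. A minimal sketch of what it might look like, assuming only the two accessors the pipeline actually calls (the descriptions and default value below are illustrative assumptions, not taken from the original project):

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;

public interface Options extends PipelineOptions {
  @Description("Fixed-window size in minutes")   // illustrative description (assumption)
  @Default.Integer(1)                            // assumed default
  Integer getWindowSize();
  void setWindowSize(Integer value);

  @Description("Output path prefix for TextIO")  // illustrative description (assumption)
  String getOutput();
  void setOutput(String value);
}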
Example 2: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/** Run a batch pipeline to calculate hourly team scores. */
public static void main(String[] args) throws Exception {
  Options options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("ReadLogs", TextIO.read().from(options.getInput()))
      .apply("SetTimestamps", WithTimestamps.of(new SetTimestampFn()))
      .apply("FixedWindows", Window.<String>into(FixedWindows.of(ONE_HOUR)))
      .apply("TeamScores", new CalculateTeamScores(options.getOutputPrefix()));

  pipeline.run();
}
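WithTimestamps.of(...) expects a SerializableFunction that maps each element to its event-time Instant. The SetTimestampFn referenced above is not shown; a minimal sketch under the assumption that each log line starts with an epoch-millisecond timestamp field (the parsing logic is purely illustrative):

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.joda.time.Instant;

static class SetTimestampFn implements SerializableFunction<String, Instant> {
  @Override
  public Instant apply(String logLine) {
    // Assumption: the first comma-separated field is an epoch-millisecond timestamp.
    long millis = Long.parseLong(logLine.split(",")[0].trim());
    return new Instant(millis);
  }
}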
Example 3: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/**
 * Runs the DatastoreToGcs dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  options.setRunner(DataflowRunner.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("IngestEntities",
          DatastoreIO.v1().read()
              .withProjectId(options.getDatastoreProjectId())
              .withLiteralGqlQuery(options.getGqlQuery())
              .withNamespace(options.getNamespace()))
      .apply("EntityToJson", ParDo.of(EntityToJson.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .build()))
      .apply("JsonToGcs", TextIO.write().to(options.getSavePath())
          .withSuffix(".json"));

  pipeline.run();
}
Example 4: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/**
 * Runs the GcsToDatastore dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  options.setRunner(DataflowRunner.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("IngestJson", TextIO.read()
          .from(options.getJsonPathPrefix()))
      .apply("GcsToEntity", ParDo.of(JsonToEntity.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .build()))
      .apply(DatastoreIO.v1().write()
          .withProjectId(options.getDatastoreProjectId()));

  pipeline.run();
}
Example 5: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
public static void main(String[] args) throws Exception {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory
      .fromArgs(args)
      .withValidation()
      .as(TemplateOptions.class);
  options.setAutoscalingAlgorithm(THROUGHPUT_BASED);

  Pipeline pipeline = Pipeline.create(options);

  pipeline.apply(BigQueryIO.read().from(options.getBigQueryTableName()))
      .apply(ParDo.of(new DoFn<TableRow, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          String commaSep = c.element().values()
              .stream()
              .map(cell -> cell.toString().trim())
              .collect(Collectors.joining("\",\""));
          c.output(commaSep);
        }
      }))
      .apply(TextIO.write().to(options.getOutputFile())
          .withoutSharding()
          .withWritableByteChannelFactory(GZIP)
      );

  pipeline.run();
}
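The bare identifiers THROUGHPUT_BASED and GZIP are presumably brought in through static imports that the excerpt omits. A sketch of imports that would let them resolve against an early Beam 2.x SDK plus the Dataflow runner (the exact package locations are an assumption and have moved between releases, so verify against the version actually in use):

// Assumed static imports; adjust to the Beam/Dataflow version in use.
import static org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.THROUGHPUT_BASED;
import static org.apache.beam.sdk.io.FileBasedSink.CompressionType.GZIP;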
Example 6: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/**
 * Run a batch pipeline.
 */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums",
          new WriteToText<KV<String, Integer>>(
              options.getOutput(),
              configureOutput(),
              false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
Example 7: expand
import org.apache.beam.sdk.io.TextIO; // import the required package/class
@Override
public PDone expand(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
  return wordToUriAndTfIdf
      .apply("Format", ParDo.of(new DoFn<KV<String, KV<URI, Double>>, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          c.output(String.format("%s,\t%s,\t%f",
              c.element().getKey(),
              c.element().getValue().getKey(),
              c.element().getValue().getValue()));
        }
      }))
      .apply(TextIO.write()
          .to(output)
          .withSuffix(".csv"));
}
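This expand method only makes sense inside a composite PTransform that holds the output prefix it writes to. A minimal sketch of the enclosing class it might live in (the class name and constructor are assumptions; only the field named output is implied by the snippet):

static class WriteTfIdfToCsv
    extends PTransform<PCollection<KV<String, KV<URI, Double>>>, PDone> {
  private final String output;  // file prefix handed to TextIO.write().to(...)

  WriteTfIdfToCsv(String output) {
    this.output = output;
  }

  // The expand(...) method shown above goes here.
}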
Example 8: main
import org.apache.beam.sdk.io.TextIO; // import the required package/class
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);

  Pipeline p = Pipeline.create(options);

  double samplingThreshold = 0.1;

  p.apply(TextIO.read().from(options.getInput()))
      .apply(MapElements.via(new ParseTableRowJson()))
      .apply(new ComputeTopSessions(samplingThreshold))
      .apply("Write", TextIO.write().withoutSharding().to(options.getOutput()));

  p.run().waitUntilFinish();
}
Example 9: testReadPipeline
import org.apache.beam.sdk.io.TextIO; // import the required package/class
@Test
@Ignore("TestPipeline needs a way to take in HadoopFileSystemOptions")
public void testReadPipeline() throws Exception {
  create("testFileA", "testDataA".getBytes());
  create("testFileB", "testDataB".getBytes());
  create("testFileC", "testDataC".getBytes());

  HadoopFileSystemOptions options = TestPipeline.testingPipelineOptions()
      .as(HadoopFileSystemOptions.class);
  options.setHdfsConfiguration(ImmutableList.of(fileSystem.fileSystem.getConf()));
  FileSystems.setDefaultPipelineOptions(options);

  PCollection<String> pc = p.apply(
      TextIO.read().from(testPath("testFile*").toString()));
  PAssert.that(pc).containsInAnyOrder("testDataA", "testDataB", "testDataC");
  p.run();
}
Example 10: testText
import org.apache.beam.sdk.io.TextIO; // import the required package/class
@Test
public void testText() throws Exception {
  PCollection<String> inputWords = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
  PCollection<String> output = inputWords.apply(new WordCount.CountWords())
      .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  output.apply(
      TextIO.write().to(outputDir.getAbsolutePath()).withNumShards(3).withSuffix(".txt"));
  p.run().waitUntilFinish();

  int count = 0;
  Set<String> expected = Sets.newHashSet("hi: 5", "there: 1", "sue: 2", "bob: 2");
  for (File f : tmpDir.getRoot().listFiles(new FileFilter() {
    @Override public boolean accept(File pathname) {
      return pathname.getName().matches("out-.*\\.txt");
    }
  })) {
    count++;
    for (String line : Files.readLines(f, Charsets.UTF_8)) {
      assertTrue(line + " not found", expected.remove(line));
    }
  }
  assertEquals(3, count);
  assertTrue(expected.isEmpty());
}
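The WORDS fixture this test reads from is not included in the excerpt. A sketch of a fixture whose word counts match the asserted expectations (hi: 5, there: 1, sue: 2, bob: 2); the actual constant in the original test may differ:

// Assumed fixture; any list of lines with the same word frequencies would satisfy the assertions.
static final List<String> WORDS =
    Arrays.asList("hi there", "hi", "hi sue bob", "hi sue", "", "bob hi");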
Example 11: runProgram
import org.apache.beam.sdk.io.TextIO; // import the required package/class
private static void runProgram(String resultPath) throws Exception {
  Pipeline p = FlinkTestPipeline.createForBatch();

  PCollection<String> result = p
      .apply(GenerateSequence.from(0).to(10))
      .apply(ParDo.of(new DoFn<Long, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          c.output(c.element().toString());
        }
      }));

  result.apply(TextIO.write().to(new URI(resultPath).getPath() + "/part"));

  p.run();
}
Example 12: runProgram
import org.apache.beam.sdk.io.TextIO; // import the required package/class
private static void runProgram(String resultPath) {
  Pipeline p = FlinkTestPipeline.createForStreaming();

  p
      .apply(GenerateSequence.from(0).to(10))
      .apply(ParDo.of(new DoFn<Long, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          c.output(c.element().toString());
        }
      }))
      .apply(TextIO.write().to(resultPath));

  p.run();
}
Example 13: createPredefinedStep
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/**
 * Returns a Step for a {@link DoFn} by creating and translating a pipeline.
 */
private static Step createPredefinedStep() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
  Pipeline pipeline = Pipeline.create(options);
  String stepName = "DoFn1";

  pipeline.apply("ReadMyFile", TextIO.read().from("gs://bucket/in"))
      .apply(stepName, ParDo.of(new NoOpFn()))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/out"));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  Job job =
      translator
          .translate(
              pipeline,
              runner,
              Collections.<DataflowPackage>emptyList())
          .getJob();
  assertEquals(8, job.getSteps().size());

  Step step = job.getSteps().get(1);
  assertEquals(stepName, getString(step.getProperties(), PropertyNames.USER_NAME));
  assertAllStepOutputsHaveUniqueIds(job);
  return step;
}
Example 14: testUnconsumedReads
import org.apache.beam.sdk.io.TextIO; // import the required package/class
/**
 * Tests that all reads are consumed by at least one {@link PTransform}.
 */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  PCollection<String> unconsumed = p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);

  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(new PipelineVisitor.Defaults() {
    @Override
    public void visitPrimitiveTransform(Node node) {
      unconsumedSeenAsInput.set(true);
    }
  });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
Example 15: expand
import org.apache.beam.sdk.io.TextIO; // import the required package/class
@Override
public PDone expand(PCollection<String> input) {
  // Verify that the input has a compatible window type.
  checkArgument(
      input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder());

  // filenamePrefix may contain a directory and a filename component. Pull out only the filename
  // component from that path for the PerWindowFiles.
  String prefix = "";
  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  if (!resource.isDirectory()) {
    prefix = verifyNotNull(
        resource.getFilename(),
        "A non-directory resource should have a non-null filename: %s",
        resource);
  }

  return input.apply(
      TextIO.write()
          .to(resource.getCurrentDirectory())
          .withFilenamePolicy(new PerWindowFiles(prefix))
          .withWindowedWrites()
          .withNumShards(3));
}