This article collects typical usage examples of the Java class com.google.cloud.dataflow.sdk.io.TextIO. If you are wondering what TextIO is for, how to use it, or what it looks like in practice, the curated code examples below may help.
The TextIO class belongs to the com.google.cloud.dataflow.sdk.io package. Fifteen code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
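Before diving into the examples, here is a minimal sketch of how TextIO is typically used in a Dataflow SDK 1.x pipeline: read lines of text with TextIO.Read and write them back out with TextIO.Write. This is an orientation sketch only; the CopyTextFile class name and the bucket paths are hypothetical placeholders, not taken from the examples below.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;

public class CopyTextFile {
  public static void main(String[] args) {
    // Standard option parsing, as used in the examples below.
    PipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(PipelineOptions.class);
    Pipeline p = Pipeline.create(options);
    // Read every line from the input file(s) and write them back out unchanged.
    // The GCS paths are placeholders; local paths also work with the direct runner.
    p.apply(TextIO.Read.named("ReadLines").from("gs://my-bucket/input/*.txt"))
     .apply(TextIO.Write.named("WriteLines").to("gs://my-bucket/output/lines"));
    p.run();
  }
}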
Example 1: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

/**
 * Run a batch pipeline.
 */
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
  tableRef.setProjectId(options.as(GcpOptions.class).getProject());
  tableRef.setTableId(options.getOutputTableName());
  // Read events from a CSV file and parse them.
  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      // Write the results to BigQuery.
      .apply(ParDo.named("FormatUserScoreSums").of(new FormatUserScoreSumsFn()))
      .apply(
          BigQueryIO.Write.to(tableRef)
              .withSchema(FormatUserScoreSumsFn.getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  pipeline.run();
}
Example 2: apply

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

@Override
public PCollection<GameEvent> apply(PBegin begin) {
  if (options.getInput() != null && !options.getInput().isEmpty()) {
    return begin
        .getPipeline()
        .apply(TextIO.Read.from(options.getInput()))
        .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
        .apply(
            "AddEventTimestamps",
            WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())));
  } else {
    return begin
        .getPipeline()
        .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
        .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));
  }
}
Example 3: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

/**
 * Run a batch pipeline.
 */
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
  tableRef.setProjectId(options.as(GcpOptions.class).getProject());
  tableRef.setTableId(options.getOutputTableName());
  // Read events from a CSV file, parse them and write (import) them to BigQuery.
  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
      .apply(ParDo.named("FormatGameEvent").of(new FormatGameEventFn()))
      .apply(
          BigQueryIO.Write.to(tableRef)
              .withSchema(FormatGameEventFn.getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  pipeline.run();
}
Example 4: runInjectorPipeline

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

/**
 * Runs the batch injector for the streaming pipeline.
 *
 * <p>The injector pipeline will read from the given text file, and inject data
 * into the Google Cloud Pub/Sub topic.
 */
public void runInjectorPipeline(String inputFile, String topic) {
  DataflowPipelineOptions copiedOptions = options.cloneAs(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setWorkerHarnessContainerImage(
      DataflowPipelineRunner.BATCH_WORKER_HARNESS_CONTAINER_IMAGE);
  copiedOptions.setNumWorkers(
      options.as(ExamplePubsubTopicOptions.class).getInjectorNumWorkers());
  copiedOptions.setJobName(options.getJobName() + "-injector");
  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline.apply(TextIO.Read.from(inputFile))
      .apply(IntraBundleParallelization
          .of(PubsubFileInjector.publish(topic))
          .withMaxParallelism(20));
  DataflowPipelineJob injectorJob = (DataflowPipelineJob) injectorPipeline.run();
  jobsToCancel.add(injectorJob);
}
Example 5: doGet

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

@Override
public void doGet(HttpServletRequest req, HttpServletResponse resp)
    throws IOException {
  PrintWriter out = resp.getWriter();
  Preconditions.checkNotNull(Constants.ORG_ID);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);
  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }
  new ExportedServiceAccountKeyRemover(options, Constants.ORG_ID)
      .attachSink(TextIO.Write.named("Write output messages").to(Constants.OUTPUT_PREFIX))
      .run();
  out.println("Test passed! The output was written to GCS");
}
Example 6: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

/**
 * Main function for the runner.
 * @param args The args this program was called with.
 * @throws IOException Thrown if there's an error reading from one of the APIs.
 */
public static void main(String[] args) throws IOException {
  Preconditions.checkNotNull(Constants.ORG_NAME);
  Preconditions.checkNotNull(Constants.POLICY_BUCKET);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);
  GCSFilesSource source = null;
  try {
    source = new GCSFilesSource(Constants.POLICY_BUCKET, Constants.ORG_NAME);
  } catch (GeneralSecurityException e) {
    throw new IOException("SecurityException: Cannot create GCSFileSource");
  }
  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }
  new OnDemandLiveStateChecker(options, source)
      .attachSink(TextIO.Write.named("Write messages to GCS").to(Constants.OUTPUT_PREFIX))
      .run();
}
Example 7: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

public static void main(String[] args) {
  String[] arguments = {
      String.format("--output=%s/output.txt", System.getProperty("java.io.tmpdir"))
  };
  Options options = PipelineOptionsFactory.fromArgs(arguments)
      .withValidation().as(Options.class);
  options.setRunner(FlinkLocalPipelineRunner.class);
  Pipeline p = Pipeline.create(options);
  p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
      .apply(new CountWords())
      .apply(TextIO.Write.named("WriteCounts")
          .to(options.getOutput())
          .withNumShards(options.getNumShards()));
  p.run();
}
Example 8: translateNode

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

@Override
public void translateNode(TransformTreeNode node, Bound<String> transform, TranslationContext translation) {
  String path = transform.getFilepattern();
  String name = transform.getName();
  Coder<?> coder = transform.getDefaultOutputCoder(transform.getOutput());
  if (coder != null && coder != TextIO.DEFAULT_TEXT_CODER) {
    throw new UnsupportedOperationException("Currently only supports UTF-8 inputs.");
  }
  DataSource<String> source = translation.getExecutionEnvironment().readTextFile(path);
  if (name != null) {
    source = source.name(name);
  }
  translation.registerDataSet(source, node);
}
Example 9: writeText

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

private static <T> TransformEvaluator<TextIO.Write.Bound<T>> writeText() {
  return new TransformEvaluator<TextIO.Write.Bound<T>>() {
    @Override
    public void evaluate(TextIO.Write.Bound<T> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaPairRDD<T, Void> last =
          ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform))
              .map(WindowingHelpers.<T>unwindowFunction())
              .mapToPair(new PairFunction<T, T, Void>() {
                @Override
                public Tuple2<T, Void> call(T t) throws Exception {
                  return new Tuple2<>(t, null);
                }
              });
      ShardTemplateInformation shardTemplateInfo =
          new ShardTemplateInformation(transform.getNumShards(),
              transform.getShardTemplate(), transform.getFilenamePrefix(),
              transform.getFilenameSuffix());
      writeHadoopFile(last, new Configuration(), shardTemplateInfo, Text.class,
          NullWritable.class, TemplatedTextOutputFormat.class);
    }
  };
}
Example 10: apply

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

@Override
public PCollection<KV<String, String>> apply(PInput input) {
  Pipeline pipeline = input.getPipeline();
  // Create one TextIO.Read transform for each data file
  // and add its output to a PCollectionList.
  PCollectionList<KV<String, String>> filesToLines = PCollectionList.empty(pipeline);
  for (final String fileLocation : files) {
    PTransform<PInput, PCollection<String>> inputSource
        = TextIO.Read.from(fileLocation)
            .named("TextIO.Read(" + fileLocation + ")");
    PCollection<KV<String, String>> oneFileToLines = pipeline
        .apply(inputSource)
        .apply(WithKeys.<String, String>of(fileLocation));
    filesToLines = filesToLines.and(oneFileToLines);
  }
  return filesToLines.apply(Flatten.<KV<String, String>>pCollections())
      .setCoder(getDefaultOutputCoder());
}
Developer: GoogleCloudPlatform | Project: dataflow-precipitation-pipeline | Lines of code: 24 | Source file: ReadDataWithFileName.java
Example 11: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options. It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);
  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);
  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));
  // Run the pipeline.
  p.run();
}
Example 12: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

/** Run a batch pipeline. */
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
  tableRef.setProjectId(options.as(GcpOptions.class).getProject());
  tableRef.setTableId(options.getOutputTableName());
  // Read events from a CSV file and parse them.
  pipeline
      .apply(TextIO.Read.from(options.getInput()))
      .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
      .apply(
          "AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
      .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
      // Write the results to BigQuery.
      .apply(ParDo.named("FormatTeamScoreSums").of(new FormatTeamScoreSumsFn()))
      .apply(
          BigQueryIO.Write.to(tableRef)
              .withSchema(FormatTeamScoreSumsFn.getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  pipeline.run();
}
Example 13: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

public static void main(String[] args) {
  StorageToDatastoreOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(StorageToDatastoreOptions.class);
  Pipeline p = Pipeline.create(options);
  p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
      .apply(new CSVToDatastore())
      .apply(DatastoreIO.v1().write().withProjectId(options.getProject()));
  p.run();
}
Example 14: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);
  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
      .apply(new CountWords())
      .apply(ParDo.of(new FormatAsTextFn()))
      .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));
  p.run();
}
Example 15: main

import com.google.cloud.dataflow.sdk.io.TextIO; // import the package/class this example depends on

public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);
  PCollection<KV<String, Long>> filteredWords =
      p.apply(TextIO.Read.named("ReadLines").from(options.getInputFile()))
          .apply(new WordCount.CountWords())
          .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));
  /**
   * Concept #4: DataflowAssert is a set of convenient PTransforms in the style of
   * Hamcrest's collection matchers that can be used when writing Pipeline level tests
   * to validate the contents of PCollections. DataflowAssert is best used in unit tests
   * with small data sets but is demonstrated here as a teaching tool.
   *
   * <p>Below we verify that the set of filtered words matches our expected counts. Note
   * that DataflowAssert does not provide any output and that successful completion of the
   * Pipeline implies that the expectations were met. Learn more at
   * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
   * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
   */
  List<KV<String, Long>> expectedResults = Arrays.asList(
      KV.of("Flourish", 3L),
      KV.of("stomach", 1L));
  DataflowAssert.that(filteredWords).containsInAnyOrder(expectedResults);
  p.run();
}
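The comment in Example 15 describes DataflowAssert as a tool for pipeline-level tests over small data sets. As a complement, here is a minimal, self-contained sketch of that idea, assuming the Dataflow SDK 1.x testing and transform utilities shown (TestPipeline, DataflowAssert, Create, Count); the class name and the in-memory input data are hypothetical, chosen to mirror the expected counts above.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.testing.DataflowAssert;
import com.google.cloud.dataflow.sdk.testing.TestPipeline;
import com.google.cloud.dataflow.sdk.transforms.Count;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class WordCountAssertExample {
  public static void main(String[] args) {
    // TestPipeline creates a pipeline suitable for local, in-process execution.
    Pipeline p = TestPipeline.create();
    // Hypothetical in-memory input instead of TextIO.Read, to keep the test hermetic.
    PCollection<KV<String, Long>> counts =
        p.apply(Create.of("Flourish", "Flourish", "Flourish", "stomach"))
            .apply(Count.<String>perElement());
    // The assertion is checked while the pipeline runs; p.run() fails if it does not hold.
    DataflowAssert.that(counts).containsInAnyOrder(
        KV.of("Flourish", 3L),
        KV.of("stomach", 1L));
    p.run();
  }
}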