This article collects typical usage examples of the Java class org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO. If you are wondering what BigQueryIO is for, how to use it, or what working code looks like, the curated class examples below may help.
The BigQueryIO class belongs to the org.apache.beam.sdk.io.gcp.bigquery package. A total of 15 code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
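Before looking at the individual examples, here is a minimal end-to-end sketch of the read/write pattern most of them follow: read TableRows from one BigQuery table and write them to another with an explicit schema and create/write dispositions. The Options interface, field name, and table specs are placeholders invented for illustration; they are not taken from any of the examples that follow.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;

public class BigQueryIOSketch {

  // Hypothetical options interface, similar to the ones the examples below cast to
  // via PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class).
  public interface Options extends PipelineOptions {
    @Validation.Required
    String getInput();   // e.g. "project:dataset.source_table"
    void setInput(String value);

    @Validation.Required
    String getOutput();  // e.g. "project:dataset.target_table"
    void setOutput(String value);
  }

  public static void main(String[] args) {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Schema for the output table (the field name is a placeholder).
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("some_field").setType("STRING"));
    TableSchema schema = new TableSchema().setFields(fields);

    // Read TableRows from the input table and write them unchanged to the output table.
    p.apply("ReadFromBigQuery", BigQueryIO.readTableRows().from(options.getInput()))
        .apply("WriteToBigQuery", BigQueryIO.writeTableRows()
            .to(options.getOutput())
            .withSchema(schema)
            .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
            .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
}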
Example 1: filterAlreadyProcessedUrls
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * Filters out URLs that have already been processed, using a BigQuery lookup as a side input.
 *
 * @param readContent the input content to filter
 * @param pipeline the pipeline to attach the BigQuery read to
 * @param options pipeline options used to build the processed-URLs query
 * @return the content that still needs to be processed
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
    PCollection<InputContent> readContent, Pipeline pipeline,
    IndexerPipelineOptions options) {
  PCollection<InputContent> contentToProcess;
  String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);
  PCollection<KV<String, Long>> alreadyProcessedUrls = pipeline
      .apply("Get processed URLs", BigQueryIO.read().fromQuery(query))
      .apply(ParDo.of(new GetUrlFn()));
  final PCollectionView<Map<String, Long>> alreadyProcessedUrlsSideInput =
      alreadyProcessedUrls.apply(View.<String, Long>asMap());
  contentToProcess = readContent
      .apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput))
          .withSideInputs(alreadyProcessedUrlsSideInput));
  return contentToProcess;
}
Example 2: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  // Build the table schema for the output table.
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("tornado_count").setType("INTEGER"));
  TableSchema schema = new TableSchema().setFields(fields);
  p.apply(BigQueryIO.readTableRows().from(options.getInput()))
      .apply(new CountTornadoes())
      .apply(BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
  p.run().waitUntilFinish();
}
Example 3: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args) throws Exception {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory
      .fromArgs(args)
      .withValidation()
      .as(TemplateOptions.class);
  options.setAutoscalingAlgorithm(THROUGHPUT_BASED);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(BigQueryIO.read().from(options.getBigQueryTableName()))
      .apply(ParDo.of(new DoFn<TableRow, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
          String commaSep = c.element().values()
              .stream()
              .map(cell -> cell.toString().trim())
              .collect(Collectors.joining("\",\""));
          c.output(commaSep);
        }
      }))
      .apply(TextIO.write().to(options.getOutputFile())
          .withoutSharding()
          .withWritableByteChannelFactory(GZIP)
      );
  pipeline.run();
}
Example 4: expand
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example 5: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args)
    throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  TableSchema schema = buildWeatherSchemaProjection();
  p.apply(BigQueryIO.readTableRows().from(options.getInput()))
      .apply(ParDo.of(new ProjectionFn()))
      .apply(new BelowGlobalMean(options.getMonthFilter()))
      .apply(BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
  p.run().waitUntilFinish();
}
Example 6: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args)
    throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  // Build the table schema for the output table.
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName("word").setType("STRING"));
  fields.add(new TableFieldSchema().setName("all_plays").setType("STRING"));
  TableSchema schema = new TableSchema().setFields(fields);
  p.apply(BigQueryIO.readTableRows().from(options.getInput()))
      .apply(new PlaysForWord())
      .apply(BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
  p.run().waitUntilFinish();
}
Example 7: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args)
    throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);
  // Build the table schema for the output table.
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("max_mean_temp").setType("FLOAT"));
  TableSchema schema = new TableSchema().setFields(fields);
  p.apply(BigQueryIO.readTableRows().from(options.getInput()))
      .apply(new MaxMeanTemp())
      .apply(BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
  p.run().waitUntilFinish();
}
Example 8: sinkResultsToBigQuery
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * Send {@code formattedResults} to BigQuery.
 */
private void sinkResultsToBigQuery(
    PCollection<String> formattedResults, long now,
    String version) {
  String tableSpec = tableSpec(now, version);
  TableSchema tableSchema =
      new TableSchema().setFields(ImmutableList.of(
          new TableFieldSchema().setName("result").setType("STRING"),
          new TableFieldSchema().setName("records").setMode("REPEATED").setType("RECORD")
              .setFields(ImmutableList.of(
                  new TableFieldSchema().setName("index").setType("INTEGER"),
                  new TableFieldSchema().setName("value").setType("STRING")))));
  NexmarkUtils.console("Writing results to BigQuery table %s", tableSpec);
  BigQueryIO.Write io =
      BigQueryIO.write().to(tableSpec)
          .withSchema(tableSchema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
  formattedResults
      .apply(queryName + ".StringToTableRow", ParDo.of(new StringToTableRow()))
      .apply(queryName + ".WriteBigQueryResults", io);
}
Example 9: deploy
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/** Deploys the invoicing pipeline as a template on GCS, for a given projectID and GCS bucket. */
public void deploy() {
  // We can't store options as a member variable due to serialization concerns.
  InvoicingPipelineOptions options = PipelineOptionsFactory.as(InvoicingPipelineOptions.class);
  options.setProject(projectId);
  options.setRunner(DataflowRunner.class);
  options.setStagingLocation(beamBucket + "/staging");
  options.setTemplateLocation(beamBucket + "/templates/invoicing");
  Pipeline p = Pipeline.create(options);
  PCollection<BillingEvent> billingEvents =
      p.apply(
          "Read BillingEvents from Bigquery",
          BigQueryIO.read(BillingEvent::parseFromRecord)
              .fromQuery(InvoicingUtils.makeQueryProvider(options.getYearMonth(), projectId))
              .withCoder(SerializableCoder.of(BillingEvent.class))
              .usingStandardSql()
              .withoutValidation()
              .withTemplateCompatibility());
  applyTerminalTransforms(billingEvents, options.getYearMonth());
  p.run();
}
Example 10: setWriteOperation
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
private BigQueryIO.Write setWriteOperation(BigQueryIO.Write bigQueryIOPTransform) {
  if (properties.tableOperation.getValue() == BigQueryOutputProperties.TableOperation.NONE
      || properties.tableOperation.getValue() == BigQueryOutputProperties.TableOperation.CREATE_IF_NOT_EXISTS) {
    switch (properties.writeOperation.getValue()) {
    case APPEND:
      bigQueryIOPTransform =
          bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND);
      break;
    case WRITE_TO_EMPTY:
      bigQueryIOPTransform =
          bigQueryIOPTransform.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_EMPTY);
      break;
    default:
      throw new RuntimeException("To be implemented: " + properties.writeOperation.getValue());
    }
  } else {
    if (properties.writeOperation.getValue() != null) {
      LOG.info("Write operation " + properties.writeOperation.getValue()
          + " will be ignored when Table operation is " + properties.tableOperation.getValue());
    }
  }
  return bigQueryIOPTransform;
}
Example 11: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * <p>Creates a dataflow pipeline that creates the following chain:</p>
 * <ol>
 *   <li> Gets the records into the Pipeline
 *   <li> Creates Puts from each of the records
 *   <li> Performs a Bigtable Put on the records
 * </ol>
 *
 * @param args Arguments to use to configure the Dataflow Pipeline. The first three are required
 *   when running via managed resource in Google Cloud Platform. Those options should be omitted
 *   for LOCAL runs. The last four arguments are to configure the Bigtable connection.
 *       --runner=BlockingDataflowPipelineRunner
 *       --project=[dataflow project] \\
 *       --stagingLocation=gs://[your google storage bucket] \\
 *       --bigtableProject=[bigtable project] \\
 *       --bigtableInstanceId=[bigtable instance id] \\
 *       --bigtableTableId=[bigtable tableName]
 */
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options. It's not required.
  BigQueryBigtableTransferOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryBigtableTransferOptions.class);
  // CloudBigtableTableConfiguration contains the project, instance and table to connect to.
  CloudBigtableTableConfiguration config =
      new CloudBigtableTableConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();
  Pipeline p = Pipeline.create(options);
  p
      .apply("ReadSourceTable", BigQueryIO.read().fromQuery(options.getBqQuery())
          .usingStandardSql())
      .apply(ParDo.of(MUTATION_TRANSFORM))
      .apply(CloudBigtableIO.writeToTable(config));
  p.run().waitUntilFinish();
}
Example 12: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * Sets up and starts the streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
  TrafficRoutesOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(TrafficRoutesOptions.class);
  options.setBigQuerySchema(FormatStatsFn.getSchema());
  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();
  Pipeline pipeline = Pipeline.create(options);
  TableReference tableRef = new TableReference();
  tableRef.setProjectId(options.getProject());
  tableRef.setDatasetId(options.getBigQueryDataset());
  tableRef.setTableId(options.getBigQueryTable());
  pipeline
      .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
      // row... => <station route, station speed> ...
      .apply(ParDo.of(new ExtractStationSpeedFn()))
      // Map the incoming data stream into sliding windows.
      .apply(Window.<KV<String, StationSpeed>>into(SlidingWindows.of(
          Duration.standardMinutes(options.getWindowDuration()))
          .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
      .apply(new TrackSpeed())
      .apply(BigQueryIO.writeTableRows().to(tableRef)
          .withSchema(FormatStatsFn.getSchema()));
  // Run the pipeline.
  PipelineResult result = pipeline.run();
  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
Example 13: filterAlreadyProcessedDocuments
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * Splits the input into content that should still be indexed and content that should not,
 * using a BigQuery lookup of already processed document hashes as a side input.
 *
 * @param contentToIndexNotSkipped candidate content that has not been skipped upstream
 * @param contentNotToIndexSkipped content that was already skipped upstream
 * @param pipeline the pipeline to attach the BigQuery read to
 * @param options pipeline options used to build the processed-documents query
 * @return the content to index and the content not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
    PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
    Pipeline pipeline, IndexerPipelineOptions options) {
  PCollection<KV<String, Long>> alreadyProcessedDocs = null;
  if (!options.getWriteTruncate()) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
    alreadyProcessedDocs = pipeline
        .apply("Get already processed Documents", BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetDocumentHashFn()));
  } else {
    Map<String, Long> map = new HashMap<String, Long>();
    alreadyProcessedDocs = pipeline
        .apply("Create empty side input of Docs",
            Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())));
  }
  final PCollectionView<Map<String, Long>> alreadyProcessedDocsSideInput =
      alreadyProcessedDocs.apply(View.<String, Long>asMap());
  PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
      .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
      .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
      .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
          .withSideInputs(alreadyProcessedDocsSideInput)
          .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
              TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection
  PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
  PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
  // Merge the sets of items that are dupes or skipped
  PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
  ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
  return content;
}
Example 14: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
/**
 * Runs the DatastoreToBigQuery dataflow pipeline.
 */
public static void main(String[] args) throws IOException, ScriptException {
  Options options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(Options.class);
  NestedValueProvider<String, String> bqJsonSchema = NestedValueProvider
      .of(options.getBqJsonSchema(), new ValueProviderHelpers.GcsLoad());
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("IngestEntities",
          DatastoreIO.v1().read()
              .withProjectId(options.getDatastoreProjectId())
              .withLiteralGqlQuery(options.getGqlQuery())
              .withNamespace(options.getNamespace()))
      .apply("EntityToTableRow", ParDo.of(EntityToTableRow.newBuilder()
          .setJsTransformPath(options.getJsTransformPath())
          .setJsTransformFunctionName(options.getJsTransformFunctionName())
          .setStrictCast(options.getStrictCast())
          .setTableSchemaJson(bqJsonSchema)
          .build()))
      .apply("TableRowToBigQuery", BigQueryIO.writeTableRows()
          .to(options.getBqTableSpec())
          .withJsonSchema(bqJsonSchema)
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  pipeline.run();
}
Example 15: main
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; // import the required package/class
public static void main(String[] args) {
  PipelineOptionsFactory.register(TemplateOptions.class);
  TemplateOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(TemplateOptions.class);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("READ", TextIO.read().from(options.getInputFile()).withCompressionType(GZIP))
      .apply("TRANSFORM", ParDo.of(new WikiParDo()))
      .apply("WRITE", BigQueryIO.writeTableRows()
          .to(String.format("%s:devfest_melbourne_2017.wiki_demo", options.getProject()))
          .withCreateDisposition(CREATE_IF_NEEDED)
          .withWriteDisposition(WRITE_APPEND)
          .withSchema(getTableSchema()));
  pipeline.run();
}