

Java Pipeline.write Method Code Examples

This article collects typical usage examples of the Pipeline.write method from the Java class org.apache.crunch.Pipeline. If you are looking for answers to questions such as what Pipeline.write does, how to call it, or what real code using it looks like, the curated examples below should help. You can also explore further usage examples of the enclosing class, org.apache.crunch.Pipeline.


The following section presents six code examples of Pipeline.write, sorted by popularity by default.
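
Before the project examples, here is a minimal, self-contained sketch of the Pipeline.write call itself. The file paths and the class name PipelineWriteSketch are placeholders rather than part of any example below; the sketch assumes a standard Crunch MRPipeline reading and writing plain text files.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.From;
import org.apache.crunch.io.To;

public class PipelineWriteSketch {
  public static void main(String[] args) {
    // A MapReduce-backed pipeline; the class argument is used to locate the job jar.
    Pipeline pipeline = new MRPipeline(PipelineWriteSketch.class);

    // Read a text source into a PCollection (placeholder input path).
    PCollection<String> lines = pipeline.read(From.textFile("/tmp/crunch-input"));

    // write(collection, target[, writeMode]) only registers the output.
    // The optional Target.WriteMode (DEFAULT, OVERWRITE, APPEND, CHECKPOINT)
    // controls what happens when the target already exists, as in the APPEND
    // examples below (placeholder output path).
    pipeline.write(lines, To.textFile("/tmp/crunch-output"), Target.WriteMode.OVERWRITE);

    // Nothing executes until run() or done() is called.
    pipeline.done();
  }
}

The examples that follow use the same pattern, but read from and write to Kite datasets via CrunchDatasets.asSource and CrunchDatasets.asTarget.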

Example 1: testGeneric

import org.apache.crunch.Pipeline; // import the package/class this method depends on
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera, Project: cdk, Lines: 20, Source: TestCrunchDatasets.java

Example 2: testGenericParquet

import org.apache.crunch.Pipeline; // import the package/class this method depends on
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera, Project: cdk, Lines: 20, Source: TestCrunchDatasets.java

Example 3: testPartitionedSourceAndTarget

import org.apache.crunch.Pipeline; // import the package/class this method depends on
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> inputPart0 = inputDataset.getPartition(key, false);
  Dataset<Record> outputPart0 = outputDataset.getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Developer: cloudera, Project: cdk, Lines: 26, Source: TestCrunchDatasets.java

Example 4: run

import org.apache.crunch.Pipeline; // import the package/class this method depends on
public void run() {

    // TODO: Switch to parameterized views.
    View<ExampleEvent> view = Datasets.load(ScheduledReportApp.EXAMPLE_DS_URI,
        ExampleEvent.class);

    RefinableView<GenericRecord> target = Datasets.load(ScheduledReportApp.REPORT_DS_URI,
        GenericRecord.class);

    // Get the view into which this report will be written.
    DateTime dateTime = getNominalTime().toDateTime(DateTimeZone.UTC);

    View<GenericRecord> output = target
        .with("year", dateTime.getYear())
        .with("month", dateTime.getMonthOfYear())
        .with("day", dateTime.getDayOfMonth())
        .with("hour", dateTime.getHourOfDay())
        .with("minute", dateTime.getMinuteOfHour());

    Pipeline pipeline = getPipeline();

    PCollection<ExampleEvent> events = pipeline.read(CrunchDatasets.asSource(view));

    PTable<Long, ExampleEvent> eventsByUser = events.by(new GetEventId(), Avros.longs());

    // Count of events by user ID.
    PTable<Long, Long> userEventCounts = eventsByUser.keys().count();

    PCollection<GenericData.Record> report = userEventCounts.parallelDo(
        new ToUserReport(),
        Avros.generics(SCHEMA));

    pipeline.write(report, CrunchDatasets.asTarget(output));

    pipeline.run();
  }
 
Developer: rbrush, Project: kite-apps, Lines: 37, Source: ScheduledReportJob.java

Example 5: run

import org.apache.crunch.Pipeline; // import the package/class this method depends on
@Override
public int run(String[] args) throws Exception {
  JCommander jc = new JCommander(this);
  try {
    jc.parse(args);
  } catch (ParameterException e) {
    jc.usage();
    return 1;
  }

  if (paths == null || paths.size() != 2) {
    jc.usage();
    return 1;
  }

  String inputPathString = paths.get(0);
  String outputPathString = paths.get(1);

  Configuration conf = getConf();
  Path inputPath = new Path(inputPathString);
  Path outputPath = new Path(outputPathString);
  outputPath = outputPath.getFileSystem(conf).makeQualified(outputPath);

  Pipeline pipeline = new MRPipeline(getClass(), conf);

  VariantsLoader variantsLoader;
  if (dataModel.equals("GA4GH")) {
    variantsLoader = new GA4GHVariantsLoader();
  } else if (dataModel.equals("ADAM")) {
    variantsLoader = new ADAMVariantsLoader();
  } else {
    jc.usage();
    return 1;
  }

  Set<String> sampleSet = samples == null ? null :
      Sets.newLinkedHashSet(Splitter.on(',').split(samples));

  PTable<String, SpecificRecord> partitionKeyedRecords =
      variantsLoader.loadPartitionedVariants(inputFormat, inputPath, conf, pipeline,
          variantsOnly, flatten, sampleGroup, sampleSet, redistribute, segmentSize,
          numReducers);

  if (FileUtils.sampleGroupExists(outputPath, conf, sampleGroup)) {
    if (overwrite) {
      FileUtils.deleteSampleGroup(outputPath, conf, sampleGroup);
    } else {
      LOG.error("Sample group already exists: " + sampleGroup);
      return 1;
    }
  }

  pipeline.write(partitionKeyedRecords, new AvroParquetPathPerKeyTarget(outputPath),
      Target.WriteMode.APPEND);

  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
 
Developer: cloudera, Project: quince, Lines: 59, Source: LoadVariantsTool.java

Example 6: run

import org.apache.crunch.Pipeline; // import the package/class this method depends on
public void run(@DataIn(name="example_events", type=ExampleEvent.class) View<ExampleEvent> input,
                @DataOut(name="odd_users", type=ExampleEvent.class) View<ExampleEvent> output) {

  Pipeline pipeline = getPipeline();

  PCollection<ExampleEvent> events = pipeline.read(CrunchDatasets.asSource(input));

  PCollection<ExampleEvent> oddUsers = events.filter(new KeepOddUsers());

  pipeline.write(oddUsers, CrunchDatasets.asTarget(output));

  pipeline.run();
}
 
Developer: rbrush, Project: kite-apps, Lines: 14, Source: TriggeredJob.java


Note: The org.apache.crunch.Pipeline.write examples in this article were compiled from open-source code hosting and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their authors, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish this material without permission.