

Java Pipeline Class Code Examples

This article compiles typical usage examples of the Java class org.apache.crunch.Pipeline. If you are unsure what the Pipeline class does, how to use it, or what real-world usage looks like, the curated examples below should help.


The Pipeline class belongs to the org.apache.crunch package. Fifteen code examples are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Java code examples.
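
Before the individual examples, a minimal, self-contained sketch of the typical Pipeline lifecycle (read, transform, write, run) may help; the input and output paths here are placeholders:

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;

public class UppercaseExample {
  public static void main(String[] args) {
    // Create a MapReduce-backed pipeline; the class argument locates the job jar.
    Pipeline pipeline = new MRPipeline(UppercaseExample.class, new Configuration());

    // Read lines of text (placeholder path).
    PCollection<String> lines = pipeline.readTextFile("/tmp/input");

    // Transform each line with a DoFn.
    PCollection<String> upper = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        emitter.emit(line.toUpperCase());
      }
    }, Writables.strings());

    // Write the result and execute the pipeline.
    pipeline.writeTextFile(upper, "/tmp/output");
    PipelineResult result = pipeline.done();
    System.exit(result.succeeded() ? 0 : 1);
  }
}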

Example 1: testGeneric

import org.apache.crunch.Pipeline; // import the required package/class
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera | Project: cdk | Lines: 20 | Source: TestCrunchDatasets.java
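
The writeTestUsers and checkTestUsers helpers live elsewhere in TestCrunchDatasets and are not shown on this page. A hypothetical sketch of the writer side, assuming CDK's DatasetWriter API and a USER_SCHEMA with a username field:

// Hypothetical reconstruction -- the real helper is defined in TestCrunchDatasets;
// the newWriter()/open()/close() calls are assumed from CDK's DatasetWriter API.
private void writeTestUsers(Dataset<GenericData.Record> dataset, int count, int start) {
  DatasetWriter<GenericData.Record> writer = dataset.newWriter();
  try {
    writer.open();
    for (int i = start; i < start + count; i++) {
      writer.write(new GenericRecordBuilder(USER_SCHEMA)
          .set("username", "test-" + i)
          .build());
    }
  } finally {
    writer.close();
  }
}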

Example 2: testGenericParquet

import org.apache.crunch.Pipeline; // import the required package/class
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Developer: cloudera | Project: cdk | Lines: 20 | Source: TestCrunchDatasets.java

Example 3: loadKeyedRecords

import org.apache.crunch.Pipeline; // import the required package/class
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
  loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
      Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
      Set<String> samples)
      throws IOException {
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> adamRecords
      = readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);
  // The data are now loaded into ADAM variant objects; convert to keyed SpecificRecords
  ADAMToKeyedSpecificRecordFn converter =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> specificPType = Avros.specifics(converter.getSpecificRecordType());
  return adamRecords.parallelDo("Convert to keyed SpecificRecords",
      converter, Avros.tableOf(KEY_PTYPE, specificPType));
}
 
Developer: cloudera | Project: quince | Lines: 17 | Source: ADAMVariantsLoader.java

Example 4: readVariants

import org.apache.crunch.Pipeline; // import the required package/class
private static PCollection<Pair<Variant, Collection<Genotype>>>
    readVariants(String inputFormat, Path inputPath, Configuration conf,
    Pipeline pipeline, String sampleGroup) throws IOException {
  PCollection<Pair<Variant, Collection<Genotype>>> adamRecords;
  if (inputFormat.equals("VCF")) {
    TableSource<LongWritable, VariantContextWritable> vcfSource =
        From.formattedFile(
            inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    PType<Pair<Variant, Collection<Genotype>>> adamPType =
        Avros.pairs(Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
            Avros.collections(Avros.specifics(Genotype.class)));
    adamRecords =
        vcfRecords.parallelDo("VCF to ADAM Variant", new VCFToADAMVariantFn(), adamPType);
  } else if (inputFormat.equals("AVRO")) {
    throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
  } else if (inputFormat.equals("PARQUET")) {
    throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
  } else {
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }
  return adamRecords;
}
 
Developer: cloudera | Project: quince | Lines: 24 | Source: ADAMVariantsLoader.java

Example 5: testPartitionedSourceAndTarget

import org.apache.crunch.Pipeline; // import the required package/class
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> inputPart0 = inputDataset.getPartition(key, false);
  Dataset<Record> outputPart0 = outputDataset.getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Developer: cloudera | Project: cdk | Lines: 26 | Source: TestCrunchDatasets.java

Example 6: startMapReduce

import org.apache.crunch.Pipeline; // import the required package/class
@Override
public void startMapReduce(String taskName) {
    String outputDirectory = propertyConfig.getProperty("sqoop.task." + taskName + ".toJobConfig.outputDirectory");
    String hadoopAddress = propertyConfig.getProperty("sqoop.task." + taskName + ".tolink.linkConfig.uri");
    // The example stops here: the pipeline is created but never reads, writes, or runs,
    // and outputDirectory, hadoopAddress, and the source-target class are left unused.
    Pipeline pipeline = new MRPipeline(CrunchMapReduce.class, new Configuration());
    Class<AvroParquetFileSourceTarget> avroParquetFileSourceTargetClass = AvroParquetFileSourceTarget.class;
}
 
Developer: babymm | Project: mmsns | Lines: 8 | Source: CrunchMapReduce.java
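
The method above wires up configuration but never uses the pipeline or the source-target class it loads. A hedged sketch of how the body might continue; the Avro schema, record type, and write mode are assumptions:

// Hypothetical continuation -- schema is an Avro Schema you would supply;
// AvroParquetFileSourceTarget is the class the example already references.
PCollection<GenericData.Record> records = pipeline.read(
    new AvroParquetFileSourceTarget<GenericData.Record>(
        new Path(hadoopAddress), Avros.generics(schema)));
pipeline.write(records, new AvroParquetFileTarget(new Path(outputDirectory)),
    Target.WriteMode.OVERWRITE);
PipelineResult result = pipeline.done();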

Example 7: run

import org.apache.crunch.Pipeline; // import the required package/class
public int run(String[] args) throws Exception {

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); // <1> create the MapReduce pipeline
    
    PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  // <2> read both inputs as lines of text
    PCollection<String> barLines = pipeline.readTextFile(barInputPath);

    PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(  // <3> parse foo lines into a keyed table
        new FooIndicatorFn(),
        Avros.tableOf(Avros.longs(),
        Avros.pairs(Avros.longs(), Avros.ints())));

    fooTable = fooTable.filter(new FooFilter(fooValMax));  // <4> drop foo records whose value exceeds fooValMax

    PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
        Avros.tableOf(Avros.longs(), Avros.ints()));

    // <5> reduce-side join strategy with the requested number of reducers
    DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =
        new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy // <6> inner join foo and bar on the long key
        .join(fooTable, barTable, JoinType.INNER_JOIN);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax));

    filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); // <7> write the joined, filtered result as text

    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
 
Developer: amitchmca | Project: hadooparchitecturebook | Lines: 41 | Source: JoinFilterExampleCrunch.java
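
FooIndicatorFn, BarIndicatorFn, and the two filter functions are defined elsewhere in the book's source tree. A hypothetical sketch of FooIndicatorFn, assuming pipe-delimited foo lines of the form key|joinValue|fooValue:

// Hypothetical reconstruction -- the real DoFn ships with the book's example code;
// the field layout of the input lines is an assumption.
public static class FooIndicatorFn extends DoFn<String, Pair<Long, Pair<Long, Integer>>> {
  @Override
  public void process(String line, Emitter<Pair<Long, Pair<Long, Integer>>> emitter) {
    String[] fields = line.split("\\|");
    long joinKey = Long.parseLong(fields[0]);
    long joinValue = Long.parseLong(fields[1]);
    int fooValue = Integer.parseInt(fields[2]);
    emitter.emit(Pair.of(joinKey, Pair.of(joinValue, fooValue)));
  }
}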

Example 8: loadKeyedRecords

import org.apache.crunch.Pipeline; // import the required package/class
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
    loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
        Set<String> samples)
    throws IOException {
  PCollection<Variant> variants = readVariants(inputFormat, inputPath,
      conf, pipeline, sampleGroup);

  GA4GHToKeyedSpecificRecordFn converter =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> specificPType = Avros.specifics(converter
      .getSpecificRecordType());
  return variants.parallelDo("Convert to keyed SpecificRecords",
      converter, Avros.tableOf(KEY_PTYPE, specificPType));
}
 
Developer: cloudera | Project: quince | Lines: 18 | Source: GA4GHVariantsLoader.java

Example 9: readVariants

import org.apache.crunch.Pipeline; // import the required package/class
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  PCollection<Variant> variants;
  if (inputFormat.equals("VCF")) {
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> vcfSource =
        From.formattedFile(
            inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    variants = vcfRecords.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  } else if (inputFormat.equals("AVRO")) {
    variants = pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  } else if (inputFormat.equals("PARQUET")) {
    @SuppressWarnings("unchecked")
    Source<Variant> source =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    variants = pipeline.read(source);
  } else {
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }
  return variants;
}
 
Developer: cloudera | Project: quince | Lines: 25 | Source: GA4GHVariantsLoader.java

Example 10: loadPartitionedVariants

import org.apache.crunch.Pipeline; // import the required package/class
/**
 * Load and partition variants.
 * key = (contig, pos, sample_group); value = Variant/Call Avro object
 * @param inputFormat the format of the input data (VCF, AVRO, or PARQUET)
 * @param inputPath the input data path
 * @param conf the Hadoop configuration
 * @param pipeline the Crunch pipeline
 * @param variantsOnly whether to ignore samples and only load variants
 * @param flatten whether to flatten the data types
 * @param sampleGroup an identifier for the group of samples being loaded
 * @param samples the samples to include
 * @param redistribute whether to repartition the data by locus/sample group
 * @param segmentSize the number of base pairs in each segment partition
 * @param numReducers the number of reducers to use
 * @return the keyed variant or call records
 * @throws IOException if an I/O error is encountered during loading
 */
public PTable<String, SpecificRecord> loadPartitionedVariants(
    String inputFormat, Path inputPath, Configuration conf,
    Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
    Set<String> samples, boolean redistribute, long segmentSize, int numReducers)
    throws IOException {
  PTable<Tuple3<String, Long, String>, SpecificRecord> locusSampleKeyedRecords =
      loadKeyedRecords(inputFormat, inputPath, conf, pipeline, variantsOnly, flatten,
          sampleGroup, samples);

  // execute a DISTRIBUTE BY operation if requested
  PTable<Tuple3<String, Long, String>, SpecificRecord> sortedRecords;
  if (redistribute) {
    // partitionKey(chr, chrSeg, sampleGroup), Pair(secondaryKey/pos, originalDatum)
    PTableType<Tuple3<String, Long, String>,
        Pair<Long,
            Pair<Tuple3<String, Long, String>, SpecificRecord>>> reKeyedPType =
        Avros.tableOf(Avros.triples(Avros.strings(), Avros.longs(), Avros.strings()),
            Avros.pairs(Avros.longs(),
                Avros.pairs(locusSampleKeyedRecords.getKeyType(),
                    locusSampleKeyedRecords.getValueType())));
    PTable<Tuple3<String, Long, String>,
        Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> reKeyed =
        locusSampleKeyedRecords.parallelDo("Re-keying for redistribution",
            new ReKeyDistributeByFn(segmentSize), reKeyedPType);
    // repartition and sort by pos
    sortedRecords = SecondarySort.sortAndApply(
        reKeyed, new UnKeyForDistributeByFn(),
        locusSampleKeyedRecords.getPTableType(), numReducers);
  } else {
    // input data assumed to be already globally sorted
    sortedRecords = locusSampleKeyedRecords;
  }

  // generate the partition keys
  return sortedRecords.mapKeys("Generate partition keys",
      new LocusSampleToPartitionFn(segmentSize, sampleGroup), Avros.strings());
}
 
Developer: cloudera | Project: quince | Lines: 55 | Source: VariantsLoader.java
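
ReKeyDistributeByFn and UnKeyForDistributeByFn are defined alongside this class in the quince project. A hypothetical sketch of the re-keying step, which buckets each record into a locus segment (pos / segmentSize) and keeps the position as the secondary-sort key:

// Hypothetical reconstruction -- the real function lives in quince; only the
// pos / segmentSize bucketing is implied by the surrounding code and javadoc.
public static class ReKeyDistributeByFn extends
    MapFn<Pair<Tuple3<String, Long, String>, SpecificRecord>,
        Pair<Tuple3<String, Long, String>,
            Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>>> {
  private final long segmentSize;

  public ReKeyDistributeByFn(long segmentSize) {
    this.segmentSize = segmentSize;
  }

  @Override
  public Pair<Tuple3<String, Long, String>,
      Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> map(
      Pair<Tuple3<String, Long, String>, SpecificRecord> input) {
    Tuple3<String, Long, String> key = input.first();
    long pos = key.second();
    // partition key: (chr, segment index, sample group); value keeps pos for secondary sort
    Tuple3<String, Long, String> segmentKey =
        new Tuple3<String, Long, String>(key.first(), pos / segmentSize, key.third());
    return Pair.of(segmentKey, Pair.of(pos, input));
  }
}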

Example 11: inputPairs

import org.apache.crunch.Pipeline; // import the required package/class
protected final <V extends RealVector> PCollection<Pair<Integer, V>> inputPairs(
    Pipeline p,
    String inputKey,
    PType<V> ptype) {
  PType<Pair<Integer, V>> inputType = Avros.pairs(Avros.ints(), ptype);
  return p.read(avroInput(inputKey, inputType));
}
 
Developer: apsaltis | Project: oryx | Lines: 8 | Source: KMeansJobStep.java
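
The avroInput helper is part of Oryx's job-step plumbing and is not shown on this page; presumably it builds an Avro file Source for the step's input. A hypothetical sketch, where the path wiring is an assumption:

// Hypothetical sketch -- the real helper is defined elsewhere in Oryx.
@SuppressWarnings("unchecked")
protected <T> Source<T> avroInput(String inputKey, PType<T> ptype) {
  return From.avroFile(new Path(inputKey), (AvroType<T>) ptype);
}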

Example 12: run

import org.apache.crunch.Pipeline; // import the required package/class
public void run() {

    // TODO: Switch to parameterized views.
    View<ExampleEvent> view = Datasets.load(ScheduledReportApp.EXAMPLE_DS_URI,
        ExampleEvent.class);

    RefinableView<GenericRecord> target = Datasets.load(ScheduledReportApp.REPORT_DS_URI,
        GenericRecord.class);

    // Get the view into which this report will be written.
    DateTime dateTime = getNominalTime().toDateTime(DateTimeZone.UTC);

    View<GenericRecord> output = target
        .with("year", dateTime.getYear())
        .with("month", dateTime.getMonthOfYear())
        .with("day", dateTime.getDayOfMonth())
        .with("hour", dateTime.getHourOfDay())
        .with("minute", dateTime.getMinuteOfHour());

    Pipeline pipeline = getPipeline();

    PCollection<ExampleEvent> events = pipeline.read(CrunchDatasets.asSource(view));

    PTable<Long, ExampleEvent> eventsByUser = events.by(new GetEventId(), Avros.longs());

    // Count of events by user ID.
    PTable<Long, Long> userEventCounts = eventsByUser.keys().count();

    PCollection<GenericData.Record> report = userEventCounts.parallelDo(
        new ToUserReport(),
        Avros.generics(SCHEMA));

    pipeline.write(report, CrunchDatasets.asTarget(output));

    pipeline.run();
  }
 
Developer: rbrush | Project: kite-apps | Lines: 37 | Source: ScheduledReportJob.java
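
GetEventId and ToUserReport are small helpers defined with this job in kite-apps. Hypothetical sketches, assuming ExampleEvent exposes a user ID and the report schema has user_id and event_count fields:

// Hypothetical reconstructions -- accessor and field names are assumptions.
public static class GetEventId extends MapFn<ExampleEvent, Long> {
  @Override
  public Long map(ExampleEvent event) {
    return event.getUserId();
  }
}

public static class ToUserReport extends MapFn<Pair<Long, Long>, GenericData.Record> {
  @Override
  public GenericData.Record map(Pair<Long, Long> userCount) {
    return new GenericRecordBuilder(SCHEMA)
        .set("user_id", userCount.first())
        .set("event_count", userCount.second())
        .build();
  }
}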

Example 13: run

import org.apache.crunch.Pipeline; // import the required package/class
@Override
public int run(String[] args) throws Exception {
  JCommander jc = new JCommander(this);
  try {
    jc.parse(args);
  } catch (ParameterException e) {
    jc.usage();
    return 1;
  }

  if (paths == null || paths.size() != 2) {
    jc.usage();
    return 1;
  }

  String inputPathString = paths.get(0);
  String outputPathString = paths.get(1);

  Configuration conf = getConf();
  Path inputPath = new Path(inputPathString);
  Path outputPath = new Path(outputPathString);
  outputPath = outputPath.getFileSystem(conf).makeQualified(outputPath);

  Pipeline pipeline = new MRPipeline(getClass(), conf);

  VariantsLoader variantsLoader;
  if (dataModel.equals("GA4GH")) {
    variantsLoader = new GA4GHVariantsLoader();
  } else if (dataModel.equals("ADAM")) {
    variantsLoader = new ADAMVariantsLoader();
  } else {
    jc.usage();
    return 1;
  }

  Set<String> sampleSet = samples == null ? null :
      Sets.newLinkedHashSet(Splitter.on(',').split(samples));

  PTable<String, SpecificRecord> partitionKeyedRecords =
      variantsLoader.loadPartitionedVariants(inputFormat, inputPath, conf, pipeline,
          variantsOnly, flatten, sampleGroup, sampleSet, redistribute, segmentSize,
          numReducers);

  if (FileUtils.sampleGroupExists(outputPath, conf, sampleGroup)) {
    if (overwrite) {
      FileUtils.deleteSampleGroup(outputPath, conf, sampleGroup);
    } else {
      LOG.error("Sample group already exists: " + sampleGroup);
      return 1;
    }
  }

  pipeline.write(partitionKeyedRecords, new AvroParquetPathPerKeyTarget(outputPath),
      Target.WriteMode.APPEND);

  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
 
Developer: cloudera | Project: quince | Lines: 59 | Source: LoadVariantsTool.java
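
The run() method reads a number of JCommander-bound fields declared elsewhere in LoadVariantsTool. A hypothetical sketch of those declarations; only the field names are implied by the code above, while the flag names and defaults are assumptions:

// Hypothetical reconstruction -- flag spellings and defaults are assumed.
@Parameter(description = "<input-path> <output-path>")
private List<String> paths;

@Parameter(names = "--data-model", description = "GA4GH or ADAM")
private String dataModel = "GA4GH";

@Parameter(names = "--input-format", description = "VCF, AVRO, or PARQUET")
private String inputFormat = "VCF";

@Parameter(names = "--variants-only")
private boolean variantsOnly = false;

@Parameter(names = "--flatten")
private boolean flatten = false;

@Parameter(names = "--sample-group")
private String sampleGroup;

@Parameter(names = "--samples", description = "comma-separated sample IDs")
private String samples;

@Parameter(names = "--redistribute")
private boolean redistribute = false;

@Parameter(names = "--segment-size")
private long segmentSize = 1000000L;

@Parameter(names = "--num-reducers")
private int numReducers = -1;

@Parameter(names = "--overwrite")
private boolean overwrite = false;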

Example 14: inputVectors

import org.apache.crunch.Pipeline; // import the required package/class
protected final <V extends RealVector> PCollection<V> inputVectors(Pipeline p, String inputKey, PType<V> ptype) {
  return PTables.asPTable(inputPairs(p, inputKey, ptype)).values();
}
 
Developer: apsaltis | Project: oryx | Lines: 4 | Source: KMeansJobStep.java

Example 15: run

import org.apache.crunch.Pipeline; // import the required package/class
@Override
public int run(String[] args) throws Exception {

  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from Avro files in inputFolder
  PCollection<ByteBuffer> records =
      pipeline.read(From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input
  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  }
  else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
 
Developer: spotify | Project: hdfs2cass | Lines: 43 | Source: LegacyHdfs2Cass.java


Note: The org.apache.crunch.Pipeline examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their original authors, who retain copyright over the source code; consult each project's License before using or distributing it. Do not reproduce this article without permission.