当前位置: 首页>>代码示例>>Java>>正文


Java PCollection类代码示例

本文整理汇总了 Java 中 org.apache.crunch.PCollection 的典型用法代码示例。如果您正苦于以下问题:Java PCollection 类的具体用法是什么?Java PCollection 怎么用?有哪些 Java PCollection 的使用示例?那么,这里精选的代码示例或许可以为您提供帮助。


PCollection 类属于 org.apache.crunch 包,下文一共展示了 PCollection 类的 15 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。

示例1: testGeneric

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Copies generic records from the "in" dataset to the "out" dataset through a
 * MapReduce-backed Crunch pipeline and then checks the output with
 * {@code checkTestUsers(..., 10)}.
 */
@Test
public void testGeneric() throws IOException {
  DatasetDescriptor sourceDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();
  Dataset<Record> source = repo.create("in", sourceDescriptor);

  DatasetDescriptor sinkDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();
  Dataset<Record> sink = repo.create("out", sinkDescriptor);

  // Two writes of 5 records each, so the input ends up spread over two files.
  writeTestUsers(source, 5, 0);
  writeTestUsers(source, 5, 5);

  Pipeline copyPipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = copyPipeline.read(
      CrunchDatasets.asSource(source, GenericData.Record.class));
  Target sinkTarget = CrunchDatasets.asTarget(sink);
  copyPipeline.write(records, sinkTarget, Target.WriteMode.APPEND);
  copyPipeline.run();

  checkTestUsers(sink, 10);
}
 
开发者ID:cloudera,项目名称:cdk,代码行数:20,代码来源:TestCrunchDatasets.java

示例2: testGenericParquet

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Same round trip as the generic copy test, but both datasets are declared with
 * {@code Formats.PARQUET}: records are read from "in", appended to "out", and the
 * output is checked with {@code checkTestUsers(..., 10)}.
 */
@Test
public void testGenericParquet() throws IOException {
  DatasetDescriptor sourceDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> source = repo.create("in", sourceDescriptor);

  DatasetDescriptor sinkDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> sink = repo.create("out", sinkDescriptor);

  // Two writes of 5 records each, so the input ends up spread over two files.
  writeTestUsers(source, 5, 0);
  writeTestUsers(source, 5, 5);

  Pipeline copyPipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = copyPipeline.read(
      CrunchDatasets.asSource(source, GenericData.Record.class));
  Target sinkTarget = CrunchDatasets.asTarget(sink);
  copyPipeline.write(records, sinkTarget, Target.WriteMode.APPEND);
  copyPipeline.run();

  checkTestUsers(sink, 10);
}
 
开发者ID:cloudera,项目名称:cdk,代码行数:20,代码来源:TestCrunchDatasets.java

示例3: loadKeyedRecords

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Reads variants via {@code readVariants} and converts them into a table of
 * keyed {@code SpecificRecord}s.
 *
 * @param inputFormat  format name forwarded to {@code readVariants}
 * @param inputPath    location of the input data
 * @param conf         configuration forwarded to {@code readVariants}
 * @param pipeline     Crunch pipeline the data is read into
 * @param variantsOnly forwarded to the converter function
 * @param flatten      forwarded to the converter function
 * @param sampleGroup  forwarded to the reader and to the converter function
 * @param samples      forwarded to the converter function
 * @return the converter's output as a table typed with {@code KEY_PTYPE} keys
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  // Step 1: load the input as ADAM (variant, genotypes) pairs.
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> adamPairs =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  // Step 2: key each record; the converter also reports which record class it emits.
  ADAMToKeyedSpecificRecordFn keyingFn =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> valueType = Avros.specifics(keyingFn.getSpecificRecordType());

  return adamPairs.parallelDo(
      "Convert to keyed SpecificRecords",
      keyingFn,
      Avros.tableOf(KEY_PTYPE, valueType));
}
 
开发者ID:cloudera,项目名称:quince,代码行数:17,代码来源:ADAMVariantsLoader.java

示例4: readVariants

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Loads variants as ADAM (variant, genotypes) pairs. Only the "VCF" format is
 * implemented.
 *
 * @throws UnsupportedOperationException if {@code inputFormat} is "AVRO" or "PARQUET"
 * @throws IllegalStateException if {@code inputFormat} is any other non-"VCF" value
 */
private static PCollection<Pair<Variant, Collection<Genotype>>> readVariants(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    String sampleGroup) throws IOException {
  // Reject everything that is not VCF up front.
  if (!inputFormat.equals("VCF")) {
    if (inputFormat.equals("AVRO") || inputFormat.equals("PARQUET")) {
      throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
    }
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }

  TableSource<LongWritable, VariantContextWritable> source = From.formattedFile(
      inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
  // Only the values of the (LongWritable, VariantContextWritable) table are used.
  PCollection<VariantContextWritable> vcfValues = pipeline.read(source).values();

  PType<Pair<Variant, Collection<Genotype>>> pairType = Avros.pairs(
      Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
      Avros.collections(Avros.specifics(Genotype.class)));

  return vcfValues.parallelDo("VCF to ADAM Variant", new VCFToADAMVariantFn(), pairType);
}
 
开发者ID:cloudera,项目名称:quince,代码行数:24,代码来源:ADAMVariantsLoader.java

示例5: testPartitionedSourceAndTarget

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Copies a single partition (key built from value 0) of a dataset hash-partitioned on
 * "username" into the matching partition of an output dataset, and expects five
 * records to land there.
 */
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy byUsernameHash = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor sourceDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(byUsernameHash)
      .build();
  Dataset<Record> source = repo.create("in", sourceDescriptor);

  DatasetDescriptor sinkDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(byUsernameHash)
      .build();
  Dataset<Record> sink = repo.create("out", sinkDescriptor);

  writeTestUsers(source, 10);

  // Same key on both sides; the boolean flag differs between input (false) and
  // output (true) — presumably "create if missing", verify against the Dataset API.
  PartitionKey firstKey = byUsernameHash.partitionKey(0);
  Dataset<Record> sourcePartition = source.getPartition(firstKey, false);
  Dataset<Record> sinkPartition = sink.getPartition(firstKey, true);

  Pipeline copyPipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = copyPipeline.read(
      CrunchDatasets.asSource(sourcePartition, GenericData.Record.class));
  Target sinkTarget = CrunchDatasets.asTarget(sinkPartition);
  copyPipeline.write(records, sinkTarget, Target.WriteMode.APPEND);
  copyPipeline.run();

  Assert.assertEquals(5, datasetSize(sinkPartition));
}
 
开发者ID:cloudera,项目名称:cdk,代码行数:26,代码来源:TestCrunchDatasets.java

示例6: lloydsAlgorithm

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Iterates Lloyd's algorithm over the supplied points and hands back the centers
 * produced by the last iteration.
 *
 * @param points The data points to cluster
 * @param centers The list of initial centers
 * @param numIterations The number of iterations to run, with each iteration corresponding to a
 *        MapReduce job
 * @param approx Whether to use random projection for assigning points to centers
 * @return the centers after {@code numIterations} iterations; the {@code centers} argument
 *         itself when {@code numIterations} is not positive
 */
public <V extends RealVector> List<Centers> lloydsAlgorithm(PCollection<V> points, List<Centers> centers,
    int numIterations, boolean approx) {
  PTypeFamily typeFamily = points.getTypeFamily();
  // Keys are pairs of ints; values pair a vector with a long.
  PTableType<Pair<Integer, Integer>, Pair<V, Long>> assignmentType = typeFamily.tableOf(
      typeFamily.pairs(typeFamily.ints(), typeFamily.ints()),
      typeFamily.pairs(points.getPType(), typeFamily.longs()));
  Aggregator<Pair<V, Long>> vectorSummer = new SumVectorsAggregator<V>();

  int iteration = 0;
  while (iteration < numIterations) {
    // The index is rebuilt every round from the centers of the previous round.
    KSketchIndex sketchIndex = new KSketchIndex(centers, projectionBits, projectionSamples, seed);
    LloydsMapFn<V> assignFn = new LloydsMapFn<V>(sketchIndex, approx);
    centers = new LloydsCenters<V>(
        points.parallelDo("lloyds-" + iteration, assignFn, assignmentType)
            .groupByKey()
            .combineValues(vectorSummer),
        centers.size()).getValue();
    iteration++;
  }
  return centers;
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:25,代码来源:KMeansParallel.java

示例7: createPipeline

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Builds the pipeline that reads the generation's "inbound/" text input, converts it to
 * records, summarizes them as JSON and writes the result under "summary/".
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath} rejects the
 *         summary output key
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();

  String summaryKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "summary/";
  if (!validOutputPath(summaryKey)) {
    return null;
  }

  String inputKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "inbound/";

  InboundSettings inboundSettings = InboundSettings.create(ConfigUtils.getDefaultConfig());
  MRPipeline pipeline = createBasicPipeline(StringSplitFn.class);

  PCollection<Record> parsed = toRecords(pipeline.read(textInput(inputKey)));
  PCollection<String> summaryJson = getSummarizer(inboundSettings).buildJson(parsed);
  summaryJson.write(compressedTextOutput(pipeline.getConfiguration(), summaryKey));

  return pipeline;
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:21,代码来源:SummaryStep.java

示例8: testCategorical

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Parses three comma-separated lines, summarizes them with columns 1 and 3 marked as
 * categorical, standardizes the records and compares the resulting vectors with the
 * expected values.
 */
@Test
public void testCategorical() {
  PCollection<String> lines = MemPipeline.typedCollectionOf(
      Avros.strings(),
      "1.0,a,3.0,y",
      "0.4,b,1.0,x",
      "3.2,c,29.0,z");
  PCollection<Record> records = StringSplitFn.apply(lines);

  Summary summary = new Summarizer()
      .categoricalColumns(1, 3)
      .build(records)
      .getValue();

  PCollection<RealVector> standardized =
      records.parallelDo(new StandardizeFn(summary), MLAvros.vector());

  // Each expected vector has eight entries for the four input columns.
  assertEquals(
      ImmutableList.of(
          Vectors.of(1.0, 1, 0, 0, 3.0, 0.0, 1.0, 0.0),
          Vectors.of(0.4, 0, 1, 0, 1.0, 1.0, 0.0, 0.0),
          Vectors.of(3.2, 0, 0, 1, 29.0, 0, 0, 1)),
      standardized.materialize());
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:19,代码来源:StringParsingTest.java

示例9: apply

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Tags every element of the collection with an integer drawn from
 * {@code nextInt(numFolds)} on a generator seeded with {@code seed}.
 *
 * @param pcollect the collection whose elements are to be assigned to folds
 * @return a collection of (fold, element) pairs
 */
public <T> PCollection<Pair<Integer, T>> apply(PCollection<T> pcollect) {
  PTypeFamily typeFamily = pcollect.getTypeFamily();
  PType<Pair<Integer, T>> foldedType = typeFamily.pairs(typeFamily.ints(), pcollect.getPType());

  MapFn<T, Pair<Integer, T>> foldAssigner = new MapFn<T, Pair<Integer, T>>() {
    // Transient: created in initialize() rather than carried along with the function.
    private transient RandomGenerator random;

    @Override
    public void initialize() {
      if (random != null) {
        return;
      }
      random = RandomManager.getSeededRandom(seed);
    }

    @Override
    public Pair<Integer, T> map(T element) {
      int fold = random.nextInt(numFolds);
      return Pair.of(fold, element);
    }
  };

  return pcollect.parallelDo("crossfold", foldAssigner, foldedType);
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:21,代码来源:Crossfold.java

示例10: testWRS

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Draws a weighted sample of size 2 from {@code VALUES} one hundred times, counts how
 * often each string is drawn, and compares the counts with fixed expected values.
 */
@Test
public void testWRS() throws Exception {
  Map<String, Integer> counts = Maps.newHashMap();
  RandomGenerator random = RandomManager.getRandom();

  for (int round = 0; round < 100; round++) {
    PCollection<String> drawn = ReservoirSampling.weightedSample(VALUES, 2, random);
    for (String value : drawn.materialize()) {
      // The map never stores null values, so a null lookup means "not seen yet".
      Integer seen = counts.get(value);
      counts.put(value, seen == null ? 1 : 1 + seen);
    }
  }

  Map<String, Integer> expected = ImmutableMap.of(
      "foo", 48,
      "bar", 80,
      "baz", 32,
      "biz", 40);
  assertEquals(expected, counts);
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:21,代码来源:ReservoirSamplingTest.java

示例11: byFieldNames

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Converts a collection of Avro specific records into {@code ThriftRecord}s, using the
 * given field names.
 *
 * <p>An instance of the record class is created reflectively through its no-argument
 * constructor solely to obtain the record schema.
 *
 * @param collection         records to convert
 * @param rowKeyFieldName    field name passed to the conversion function as the row key field
 * @param ttlFieldName       field name passed to the conversion function as the TTL field
 * @param timestampFieldName field name passed to the conversion function as the timestamp field
 * @return the converted collection, typed with {@code ThriftRecord.PTYPE}
 * @throws RuntimeException if the record class cannot be instantiated
 */
public static <T extends SpecificRecord> PCollection<ThriftRecord> byFieldNames(
    final PCollection<T> collection,
    final String rowKeyFieldName,
    final String ttlFieldName,
    final String timestampFieldName
) {
  final Class<T> recordClass = collection.getPType().getTypeClass();

  T prototype;
  try {
    prototype = recordClass.getConstructor().newInstance();
  } catch (Exception cause) {
    throw new RuntimeException(
        "Could not create an instance of the record to determine it's schema", cause);
  }

  ThriftByFieldNamesFn<T> converter = new ThriftByFieldNamesFn<T>(
      prototype.getSchema(), rowKeyFieldName, ttlFieldName, timestampFieldName);
  return collection.parallelDo(converter, ThriftRecord.PTYPE);
}
 
开发者ID:spotify,项目名称:hdfs2cass,代码行数:18,代码来源:Thrift.java

示例12: run

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Joins the "foo" and "bar" text inputs on their long key, filtering before and after the
 * join, and writes the result as text.
 *
 * @param args six positional arguments: foo input path, bar input path, output path,
 *        the value passed to {@code FooFilter}, the value passed to {@code JoinFilter},
 *        and the number of reducers for the join
 * @return 0 if the pipeline reports success, 1 otherwise
 */
public int run(String[] args) throws Exception {
  final String fooInputPath = args[0];
  final String barInputPath = args[1];
  final String outputPath = args[2];
  final int fooValMax = Integer.parseInt(args[3]);
  final int joinValMax = Integer.parseInt(args[4]);
  final int numberOfReducers = Integer.parseInt(args[5]);

  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>

  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  //<2>
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  // Parse the foo lines into (long -> (long, int)) entries.
  PTable<Long, Pair<Long, Integer>> parsedFoo = fooLines.parallelDo(  //<3>
      new FooIndicatorFn(),
      Avros.tableOf(
          Avros.longs(),
          Avros.pairs(Avros.longs(), Avros.ints())));

  // Filter foo before the join.
  PTable<Long, Pair<Long, Integer>> fooTable =
      parsedFoo.filter(new FooFilter(fooValMax));  //<4>

  // Parse the bar lines into (long -> int) entries.
  PTable<Long, Integer> barTable = barLines.parallelDo(
      new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =   //<5>
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable =
      joinStrategy.join(fooTable, barTable, JoinType.INNER_JOIN); //<6>

  // Second filter, applied to the joined rows.
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));

  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

  PipelineResult result = pipeline.done();
  if (result.succeeded()) {
    return 0;
  }
  return 1;
}
 
开发者ID:amitchmca,项目名称:hadooparchitecturebook,代码行数:41,代码来源:JoinFilterExampleCrunch.java

示例13: loadKeyedRecords

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Reads variants via {@code readVariants} and converts them into a table of
 * keyed {@code SpecificRecord}s.
 *
 * @param inputFormat  format name forwarded to {@code readVariants}
 * @param inputPath    location of the input data
 * @param conf         configuration forwarded to {@code readVariants}
 * @param pipeline     Crunch pipeline the data is read into
 * @param variantsOnly forwarded to the converter function
 * @param flatten      forwarded to the converter function
 * @param sampleGroup  forwarded to the reader and to the converter function
 * @param samples      forwarded to the converter function
 * @return the converter's output as a table typed with {@code KEY_PTYPE} keys
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  // Step 1: load the input as Variant records.
  PCollection<Variant> loaded =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  // Step 2: key each record; the converter also reports which record class it emits.
  GA4GHToKeyedSpecificRecordFn keyingFn =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> valueType = Avros.specifics(keyingFn.getSpecificRecordType());

  return loaded.parallelDo(
      "Convert to keyed SpecificRecords",
      keyingFn,
      Avros.tableOf(KEY_PTYPE, valueType));
}
 
开发者ID:cloudera,项目名称:quince,代码行数:18,代码来源:GA4GHVariantsLoader.java

示例14: readVariants

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Loads {@code Variant} records from a "VCF", "AVRO" or "PARQUET" input.
 *
 * <p>For VCF input, {@code VCFToGA4GHVariantFn.configureHeaders} is called with the VCF
 * files found under {@code inputPath} before the data is read and converted.
 *
 * @throws IllegalStateException if {@code inputFormat} is none of the three names above
 */
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  if (inputFormat.equals("VCF")) {
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> vcfSource = From.formattedFile(
        inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    // Only the values of the (LongWritable, VariantContextWritable) table are used.
    PCollection<VariantContextWritable> vcfValues = pipeline.read(vcfSource).values();
    return vcfValues.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  }

  if (inputFormat.equals("AVRO")) {
    return pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  }

  if (inputFormat.equals("PARQUET")) {
    @SuppressWarnings("unchecked")
    Source<Variant> parquetSource =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    return pipeline.read(parquetSource);
  }

  throw new IllegalStateException("Unrecognized input format: " + inputFormat);
}
 
开发者ID:cloudera,项目名称:quince,代码行数:25,代码来源:GA4GHVariantsLoader.java

示例15: testKeyByAvroFieldSimple

import org.apache.crunch.PCollection; //导入依赖的package包/类
/**
 * Keys a one-element in-memory collection by its "fieldA" value and expects a map with
 * the single entry "hello" -> record.
 */
@Test
public void testKeyByAvroFieldSimple() throws PlanTimeException {
  TestAvroRecord record = TestAvroRecord.newBuilder()
      .setFieldA(new Utf8("hello"))
      .setFieldB("world")
      .setFieldC(10L)
      .build();
  PCollection<TestAvroRecord> records =
      MemPipeline.typedCollectionOf(specifics(TestAvroRecord.class), record);

  PTable<String, TestAvroRecord> keyed =
      AvroCollections.keyByAvroField(records, "fieldA", strings());

  assertEquals(ImmutableMap.of("hello", record), keyed.materializeToMap());
}
 
开发者ID:spotify,项目名称:crunch-lib,代码行数:10,代码来源:AvroCollectionsTest.java


注:本文中的org.apache.crunch.PCollection类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。