Java PCollection.parallelDo方法代码示例

本文整理汇总了Java中org.apache.crunch.PCollection.parallelDo方法的典型用法代码示例。如果您正苦于以下问题：Java PCollection.parallelDo方法的具体用法？Java PCollection.parallelDo怎么用？Java PCollection.parallelDo使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.crunch.PCollection的用法示例。

在下文中一共展示了PCollection.parallelDo方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: loadKeyedRecords

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads the input into ADAM variant/genotype pairs and converts each pair into a
 * keyed {@link SpecificRecord}.
 *
 * @param inputFormat  format name of the input; forwarded to {@code readVariants}
 * @param inputPath    location of the input; forwarded to {@code readVariants}
 * @param conf         configuration; forwarded to {@code readVariants}
 * @param pipeline     pipeline the input is read through
 * @param variantsOnly option handed to the {@code ADAMToKeyedSpecificRecordFn} converter
 * @param flatten      option handed to the converter
 * @param sampleGroup  handed to both {@code readVariants} and the converter
 * @param samples      handed to the converter
 * @return a table typed with {@code KEY_PTYPE} keys and the converter's specific-record values
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  // Step 1: load the raw input as ADAM variant objects paired with their genotypes.
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> variantsWithGenotypes =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  // Step 2: build the function that turns each pair into a keyed SpecificRecord.
  ADAMToKeyedSpecificRecordFn keyingFn =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The value type of the output table is whatever record class the converter reports.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> recordPType = Avros.specifics(keyingFn.getSpecificRecordType());

  return variantsWithGenotypes.parallelDo(
      "Convert to keyed SpecificRecords",
      keyingFn,
      Avros.tableOf(KEY_PTYPE, recordPType));
}

开发者ID:cloudera，项目名称:quince，代码行数:17，代码来源:ADAMVariantsLoader.java

示例2: readVariants

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads the input at {@code inputPath} into ADAM variant/genotype pairs.
 *
 * <p>Only the {@code "VCF"} format is implemented. {@code "AVRO"} and {@code "PARQUET"}
 * are recognised but rejected, and any other value is treated as unrecognised.
 *
 * @param inputFormat format name; compared case-sensitively against the known names
 * @param inputPath   location of the input files
 * @param conf        not used by this method
 * @param pipeline    pipeline the VCF source is read through
 * @param sampleGroup not used by this method
 * @return the ADAM variant/genotype pairs produced by {@code VCFToADAMVariantFn}
 * @throws UnsupportedOperationException if the format is {@code "AVRO"} or {@code "PARQUET"}
 * @throws IllegalStateException if the format is none of the known names
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
private static PCollection<Pair<Variant, Collection<Genotype>>> readVariants(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    String sampleGroup) throws IOException {
  // Reject everything that is not VCF up front so the happy path reads straight through.
  if (!inputFormat.equals("VCF")) {
    if (inputFormat.equals("AVRO") || inputFormat.equals("PARQUET")) {
      throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
    }
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }

  TableSource<LongWritable, VariantContextWritable> source = From.formattedFile(
      inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
  // Only the values of the (LongWritable, VariantContextWritable) table are needed.
  PCollection<VariantContextWritable> contexts = pipeline.read(source).values();

  PType<Pair<Variant, Collection<Genotype>>> pairPType = Avros.pairs(
      Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
      Avros.collections(Avros.specifics(Genotype.class)));

  return contexts.parallelDo("VCF to ADAM Variant", new VCFToADAMVariantFn(), pairPType);
}

开发者ID:cloudera，项目名称:quince，代码行数:24，代码来源:ADAMVariantsLoader.java

示例3: testCategorical

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Standardizes three CSV rows with columns 1 and 3 declared categorical and checks
 * the resulting vectors. In the expected output each categorical column is
 * represented by three 0/1 entries and the numeric columns keep their values.
 */
@Test
public void testCategorical() {
  PCollection<Record> parsed = StringSplitFn.apply(
      MemPipeline.typedCollectionOf(
          Avros.strings(),
          "1.0,a,3.0,y",
          "0.4,b,1.0,x",
          "3.2,c,29.0,z"));

  // Build the summary with columns 1 and 3 marked as categorical.
  Summarizer summarizer = new Summarizer().categoricalColumns(1, 3);
  Summary summary = summarizer.build(parsed).getValue();

  PCollection<RealVector> standardized =
      parsed.parallelDo(new StandardizeFn(summary), MLAvros.vector());

  assertEquals(
      ImmutableList.of(
          Vectors.of(1.0, 1, 0, 0, 3.0, 0.0, 1.0, 0.0),
          Vectors.of(0.4, 0, 1, 0, 1.0, 1.0, 0.0, 0.0),
          Vectors.of(3.2, 0, 0, 1, 29.0, 0, 0, 1)),
      standardized.materialize());
}

开发者ID:apsaltis，项目名称:oryx，代码行数:19，代码来源:StringParsingTest.java

示例4: apply

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Pairs every element of {@code pcollect} with an integer drawn from
 * {@code nextInt(numFolds)} on a generator seeded with {@code seed}.
 *
 * @param pcollect the collection whose elements are to be tagged
 * @param <T>      element type of the collection
 * @return a collection of (drawn integer, original element) pairs, typed with the
 *         input collection's own type family
 */
public <T> PCollection<Pair<Integer, T>> apply(PCollection<T> pcollect) {
  PTypeFamily family = pcollect.getTypeFamily();
  PType<Pair<Integer, T>> taggedType = family.pairs(family.ints(), pcollect.getPType());

  MapFn<T, Pair<Integer, T>> tagWithFold = new MapFn<T, Pair<Integer, T>>() {
    // transient: excluded from serialization and created in initialize() instead.
    private transient RandomGenerator generator;

    @Override
    public void initialize() {
      if (generator != null) {
        // Already set up by an earlier initialize() call; keep the existing generator.
        return;
      }
      generator = RandomManager.getSeededRandom(seed);
    }

    @Override
    public Pair<Integer, T> map(T element) {
      int fold = generator.nextInt(numFolds);
      return Pair.of(fold, element);
    }
  };

  return pcollect.parallelDo("crossfold", tagWithFold, taggedType);
}

开发者ID:apsaltis，项目名称:oryx，代码行数:21，代码来源:Crossfold.java

示例5: byFieldNames

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Maps a collection of Avro specific records to {@link ThriftRecord}s using a
 * {@code ThriftByFieldNamesFn} configured with the record schema and the given field names.
 *
 * <p>The schema is obtained by reflectively instantiating the collection's record class,
 * so that class must expose a public no-argument constructor.
 *
 * @param collection         the records to convert
 * @param rowKeyFieldName    field name passed to {@code ThriftByFieldNamesFn} as the row key field
 * @param ttlFieldName       field name passed to {@code ThriftByFieldNamesFn} as the TTL field
 * @param timestampFieldName field name passed to {@code ThriftByFieldNamesFn} as the timestamp field
 * @param <T>                concrete record type
 * @return the converted collection, typed with {@code ThriftRecord.PTYPE}
 * @throws RuntimeException if the record class cannot be instantiated; the original
 *                          failure is preserved as the cause
 */
public static <T extends SpecificRecord> PCollection<ThriftRecord> byFieldNames(
    final PCollection<T> collection,
    final String rowKeyFieldName,
    final String ttlFieldName,
    final String timestampFieldName
) {
  final Class<T> recordType = collection.getPType().getTypeClass();
  T record;
  try {
    // A throwaway instance is needed only to ask it for its schema.
    record = recordType.getConstructor().newInstance();
  } catch (Exception e) {
    // Name the offending class so the failure can be diagnosed from the message alone.
    throw new RuntimeException(
        "Could not create an instance of " + recordType.getName()
            + " to determine its schema", e);
  }

  ThriftByFieldNamesFn<T> doFn = new ThriftByFieldNamesFn<T>(
      record.getSchema(), rowKeyFieldName, ttlFieldName, timestampFieldName);
  return collection.parallelDo(doFn, ThriftRecord.PTYPE);
}

开发者ID:spotify，项目名称:hdfs2cass，代码行数:18，代码来源:Thrift.java

示例6: run

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds and runs a Crunch MapReduce pipeline that filters the "foo" input, inner-joins
 * it with the "bar" input, filters the joined rows and writes them out as text.
 *
 * <p>Positional arguments: {@code [0]} foo input path, {@code [1]} bar input path,
 * {@code [2]} output path, {@code [3]} bound handed to {@code FooFilter},
 * {@code [4]} bound handed to {@code JoinFilter}, {@code [5]} value handed to the
 * join strategy (named number of reducers).
 *
 * @param args the six positional arguments described above
 * @return {@code 0} if the pipeline reports success, {@code 1} otherwise
 * @throws Exception on malformed arguments or any failure raised while building the pipeline
 */
public int run(String[] args) throws Exception {
  // Arguments are read strictly in positional order.
  String fooPath = args[0];
  String barPath = args[1];
  String outPath = args[2];
  int fooLimit = Integer.parseInt(args[3]);
  int joinLimit = Integer.parseInt(args[4]);
  int reducerCount = Integer.parseInt(args[5]);

  Pipeline crunch = new MRPipeline(JoinFilterExampleCrunch.class, getConf());

  PCollection<String> fooText = crunch.readTextFile(fooPath);
  PCollection<String> barText = crunch.readTextFile(barPath);

  // Parse the foo lines into a keyed table, then drop the rows FooFilter rejects.
  PTable<Long, Pair<Long, Integer>> parsedFoo = fooText.parallelDo(
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.pairs(Avros.longs(), Avros.ints())));
  PTable<Long, Pair<Long, Integer>> keptFoo = parsedFoo.filter(new FooFilter(fooLimit));

  // Parse the bar lines into a keyed table.
  PTable<Long, Integer> parsedBar = barText.parallelDo(
      new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  // Inner-join foo with bar on the Long key.
  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> strategy =
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(reducerCount);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joined =
      strategy.join(keptFoo, parsedBar, JoinType.INNER_JOIN);

  // Apply the post-join filter and write the survivors, replacing any existing output.
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> keptJoined =
      joined.filter(new JoinFilter(joinLimit));
  keptJoined.write(At.textFile(outPath), WriteMode.OVERWRITE);

  PipelineResult outcome = crunch.done();
  if (outcome.succeeded()) {
    return 0;
  }
  return 1;
}

开发者ID:amitchmca，项目名称:hadooparchitecturebook，代码行数:41，代码来源:JoinFilterExampleCrunch.java

示例7: loadKeyedRecords

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads the input into {@code Variant} records and converts each one into a keyed
 * {@link SpecificRecord} using a {@code GA4GHToKeyedSpecificRecordFn}.
 *
 * @param inputFormat  format name of the input; forwarded to {@code readVariants}
 * @param inputPath    location of the input; forwarded to {@code readVariants}
 * @param conf         configuration; forwarded to {@code readVariants}
 * @param pipeline     pipeline the input is read through
 * @param variantsOnly option handed to the converter
 * @param flatten      option handed to the converter
 * @param sampleGroup  handed to both {@code readVariants} and the converter
 * @param samples      handed to the converter
 * @return a table typed with {@code KEY_PTYPE} keys and the converter's specific-record values
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  // Load first; the converter is only constructed once reading has been set up.
  PCollection<Variant> loaded =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  GA4GHToKeyedSpecificRecordFn keyingFn =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The value type of the output table is whatever record class the converter reports.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> recordPType = Avros.specifics(keyingFn.getSpecificRecordType());

  return loaded.parallelDo(
      "Convert to keyed SpecificRecords",
      keyingFn,
      Avros.tableOf(KEY_PTYPE, recordPType));
}

开发者ID:cloudera，项目名称:quince，代码行数:18，代码来源:GA4GHVariantsLoader.java

示例8: readVariants

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads {@code Variant} records from {@code inputPath} in one of three formats.
 *
 * <ul>
 *   <li>{@code "VCF"}: configures VCF headers on {@code conf}, reads the files with
 *       {@code VCFInputFormat} and converts each record with {@code VCFToGA4GHVariantFn}.</li>
 *   <li>{@code "AVRO"}: reads the records directly from an Avro file source.</li>
 *   <li>{@code "PARQUET"}: reads the records through an {@code AvroParquetFileSource}.</li>
 * </ul>
 *
 * @param inputFormat format name; compared case-sensitively against the names above
 * @param inputPath   location of the input
 * @param conf        configuration the VCF headers are registered on (VCF branch only)
 * @param pipeline    pipeline the source is read through
 * @param sampleGroup passed to the VCF header configuration (VCF branch only)
 * @return the variants read from the input
 * @throws IllegalStateException if the format is none of the three known names
 * @throws IOException declared for the helpers invoked while preparing the read
 */
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  if (inputFormat.equals("VCF")) {
    // Header configuration must happen before the source is read.
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> source = From.formattedFile(
        inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> contexts = pipeline.read(source).values();
    return contexts.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  }

  if (inputFormat.equals("AVRO")) {
    return pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  }

  if (inputFormat.equals("PARQUET")) {
    // The source is constructed as a raw type, hence the unchecked suppression.
    @SuppressWarnings("unchecked")
    Source<Variant> parquetSource =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    return pipeline.read(parquetSource);
  }

  throw new IllegalStateException("Unrecognized input format: " + inputFormat);
}

开发者ID:cloudera，项目名称:quince，代码行数:25，代码来源:GA4GHVariantsLoader.java

示例9: run

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Converts a text input into Avro generic records and appends them to a newly
 * created dataset.
 *
 * <p>Expects exactly three arguments: the input to read, the dataset repository URI
 * and the name of the dataset to create. The record schema is loaded from the
 * {@code combined_log_format.avsc} resource.
 *
 * @param args {@code <input> <dataset_uri> <dataset name>}
 * @return {@code 1} if the argument count is wrong (after printing usage to stderr),
 *         {@code 0} otherwise
 * @throws Exception if the schema cannot be read or any pipeline step fails
 */
@Override
public int run(String... args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: " + CombinedLogFormatConverter.class.getSimpleName() +
        " <input> <dataset_uri> <dataset name>");
    return 1;
  }
  String input = args[0];
  String datasetUri = args[1];
  String datasetName = args[2];

  // The stream opened here is owned by this method, so it is closed as soon as the
  // schema has been parsed, whether or not parsing succeeds.
  Schema schema;
  java.io.InputStream schemaStream =
      Resources.getResource("combined_log_format.avsc").openStream();
  try {
    schema = new Schema.Parser().parse(schemaStream);
  } finally {
    schemaStream.close();
  }

  // Create the dataset
  DatasetRepository repo = DatasetRepositories.open(datasetUri);
  DatasetDescriptor datasetDescriptor = new DatasetDescriptor.Builder()
      .schema(schema).build();
  Dataset<Object> dataset = repo.create(datasetName, datasetDescriptor);

  // Run the job
  // The schema is handed to the conversion function in its string form.
  final String schemaString = schema.toString();
  AvroType<GenericData.Record> outputType = Avros.generics(schema);
  PCollection<String> lines = readTextFile(input);
  PCollection<GenericData.Record> records = lines.parallelDo(
      new ConvertFn(schemaString), outputType);
  getPipeline().write(records, CrunchDatasets.asTarget(dataset),
      Target.WriteMode.APPEND);
  // NOTE(review): whatever run() reports is not inspected, so 0 is returned regardless
  // of the pipeline's outcome — confirm this is intended.
  run();
  return 0;
}

开发者ID:cloudera，项目名称:cdk，代码行数:32，代码来源:CombinedLogFormatConverter.java

示例10: createPipeline

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline for the normalize step: parses the generation's inbound text into
 * records, standardizes them into vectors, assigns cross-folds and writes the result
 * under the generation's {@code normalized/} key.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IOException declared for the helpers used while assembling the pipeline
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(
      stepConfig.getInstanceDir(), stepConfig.getGenerationID());

  String normalizedKey = generationPrefix + "normalized/";
  if (!validOutputPath(normalizedKey)) {
    return null;
  }

  Config defaults = ConfigUtils.getDefaultConfig();
  InboundSettings inboundSettings = InboundSettings.create(defaults);
  NormalizeSettings normalizeSettings = NormalizeSettings.create(defaults);

  MRPipeline pipeline = createBasicPipeline(StringSplitFn.class);
  PCollection<Record> parsed =
      toRecords(pipeline.read(textInput(generationPrefix + "inbound/")));
  StandardizeFn standardizer =
      getStandardizeFn(inboundSettings, normalizeSettings, generationPrefix + "summary/");
  PCollection<RealVector> normalized =
      parsed.parallelDo("normalize", standardizer, MLAvros.vector());

  // Cross-fold assignment happens here, right before the vectors are written out.
  Crossfold crossfold = new Crossfold(defaults.getInt("model.cross-folds"));
  crossfold.apply(normalized).write(avroOutput(normalizedKey));

  return pipeline;
}

开发者ID:apsaltis，项目名称:oryx，代码行数:31，代码来源:NormalizeStep.java

示例11: testSimple

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Standardizes three purely numeric CSV rows with a default-constructed
 * {@code StandardizeFn} and expects the parsed values to come through unchanged.
 */
@Test
public void testSimple() {
  PCollection<Record> parsed = StringSplitFn.apply(
      MemPipeline.typedCollectionOf(
          Avros.strings(),
          "1.0,2.0,3.0",
          "0.4,2.0,1.0",
          "3.2,17.0,29.0"));

  PCollection<RealVector> standardized =
      parsed.parallelDo(new StandardizeFn(), MLAvros.vector());

  assertEquals(
      ImmutableList.of(
          Vectors.of(1, 2, 3),
          Vectors.of(0.4, 2, 1),
          Vectors.of(3.2, 17, 29)),
      standardized.materialize());
}

开发者ID:apsaltis，项目名称:oryx，代码行数:13，代码来源:StringParsingTest.java

示例12: weightedSample

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Draws a weighted sample from an ungrouped collection by placing every element under
 * the single key {@code 0} and delegating to {@code groupedWeightedSample}.
 *
 * @param input      (element, weight) pairs to sample from
 * @param sampleSize forwarded to {@code groupedWeightedSample}
 * @param random     forwarded to {@code groupedWeightedSample}
 * @param <T>        element type
 * @param <N>        numeric weight type
 * @return the values of the table returned by {@code groupedWeightedSample}
 */
public static <T, N extends Number> PCollection<T> weightedSample(
    PCollection<Pair<T, N>> input,
    int sampleSize,
    RandomGenerator random) {
  PTypeFamily family = input.getTypeFamily();

  // Every element gets the same key, so the grouped sampler sees exactly one group.
  MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>> toSingleGroup =
      new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
        @Override
        public Pair<Integer, Pair<T, N>> map(Pair<T, N> weighted) {
          return Pair.of(0, weighted);
        }
      };

  PTable<Integer, Pair<T, N>> singleGroup = input.parallelDo(
      toSingleGroup, family.tableOf(family.ints(), input.getPType()));

  PTable<Integer, T> sampled = groupedWeightedSample(singleGroup, sampleSize, random);
  return sampled.values();
}

开发者ID:apsaltis，项目名称:oryx，代码行数:15，代码来源:ReservoirSampling.java

示例13: createPipeline

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline that merges the current generation's inbound data with the
 * previous generation's input (when there is one) and writes the merged result under
 * the current generation's {@code input/} key.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IllegalStateException if a previous generation is configured but its input
 *         path does not exist
 * @throws IOException declared for the helpers used while assembling the pipeline
 */
@Override
protected MRPipeline createPipeline() throws IOException {

  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long currentGeneration = config.getGenerationID();
  long previousGeneration = config.getLastGenerationID();

  String mergedKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "input/";
  if (!validOutputPath(mergedKey)) {
    return null;
  }

  MRPipeline pipeline = createBasicPipeline(MergeNewOldValuesFn.class);

  // New data: parse the inbound text, then re-key it into the join layout.
  String inboundKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "inbound/";
  PCollection<Pair<Long, NumericIDValue>> parsedInbound = pipeline.read(textInput(inboundKey))
      .parallelDo(
          "inboundParse",
          new DelimitedInputParseFn(),
          Avros.pairs(ALSTypes.LONGS, ALSTypes.IDVALUE));
  PTable<Pair<Long, Integer>, NumericIDValue> toMerge =
      parsedInbound.parallelDo("inbound", new InboundJoinFn(), JOIN_TYPE);

  // Old data: only present when the last generation ID is non-negative.
  if (previousGeneration >= 0) {
    String previousInputKey =
        Namespaces.getInstanceGenerationPrefix(instanceDir, previousGeneration) + "input/";
    Preconditions.checkState(
        Store.get().exists(previousInputKey, false),
        "Input path does not exist: %s",
        previousInputKey);
    PTable<Pair<Long, Integer>, NumericIDValue> previousRows =
        pipeline.read(input(previousInputKey, ALSTypes.VALUE_MATRIX))
            .parallelDo("lastGeneration", new JoinBeforeMapFn(), JOIN_TYPE);
    toMerge = toMerge.union(previousRows);
  }

  GroupingOptions.Builder grouping = GroupingOptions.builder();
  grouping.partitionerClass(JoinUtils.getPartitionerClass(toMerge.getTypeFamily()));
  grouping.numReducers(getNumReducers());

  toMerge
      .groupByKey(grouping.build())
      .parallelDo(new MergeNewOldValuesFn(), ALSTypes.VALUE_MATRIX)
      .write(output(mergedKey));
  return pipeline;
}

开发者ID:apsaltis，项目名称:oryx，代码行数:44，代码来源:MergeNewOldStep.java

示例14: createPipeline

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline for one iteration of k-sketch sampling: reads the normalized
 * vectors, computes each vector's distance to the closest k-sketch vector, draws a
 * weighted reservoir sample and writes the updated {@code KSketchIndex} under
 * {@code sketch/<iteration>/}.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IOException declared for the helpers used while assembling the pipeline
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  ClusterSettings clusterSettings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();
  int iteration = stepConfig.getIteration();
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);

  String sketchKey = generationPrefix + String.format("sketch/%d/", iteration);
  if (!validOutputPath(sketchKey)) {
    return null;
  }

  // Read the normalized vectors.
  MRPipeline pipeline = createBasicPipeline(DistanceToClosestFn.class);
  AvroType<Pair<Integer, RealVector>> vectorType =
      Avros.pairs(Avros.ints(), MLAvros.vector());
  PCollection<Pair<Integer, RealVector>> vectors =
      pipeline.read(avroInput(generationPrefix + "normalized/", vectorType));

  // The currently chosen k-sketch vectors live in a KSketchIndex, which is either
  // loaded from the previous iteration's output or created fresh.
  DistanceToClosestFn<RealVector> distanceFn;
  UpdateIndexFn indexUpdater;
  if (iteration != 1) {
    // Point both functions at the location written by the previous iteration.
    String previousSketchKey = generationPrefix + String.format("sketch/%d/", iteration - 1);
    distanceFn = new DistanceToClosestFn<RealVector>(previousSketchKey);
    indexUpdater = new UpdateIndexFn(previousSketchKey);
  } else {
    // Iteration 1 is the first real iteration; iteration 0 contains initial state.
    KSketchIndex initialIndex = createInitialIndex(clusterSettings, vectors);
    distanceFn = new DistanceToClosestFn<RealVector>(initialIndex);
    indexUpdater = new UpdateIndexFn(initialIndex);
  }

  // Distance of each vector in the dataset to the closest vector in the k-sketch.
  PTable<Integer, Pair<RealVector, Double>> withDistances = vectors.parallelDo(
      "computeDistances",
      distanceFn,
      Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

  // Weighted reservoir sampling selects another getSketchPoints() vectors to add
  // to the k-sketch.
  PTable<Integer, RealVector> sampled = ReservoirSampling.groupedWeightedSample(
      withDistances, clusterSettings.getSketchPoints(), RandomManager.getRandom());

  // Fold the newly chosen vectors into the KSketchIndex and persist it.
  sampled
      .parallelDo("updateIndex", indexUpdater, Serializables.avro(KSketchIndex.class))
      .write(avroOutput(sketchKey));

  return pipeline;
}

开发者ID:apsaltis，项目名称:oryx，代码行数:51，代码来源:KSketchSamplingStep.java

示例15: apply

import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Applies a {@code StringSplitFn} to every line of {@code in}, producing CSV records
 * typed with the input's type family and the delimiter from {@code DelimitedDataUtils}.
 *
 * @param in the text lines to split
 * @return the records produced by the split function
 */
public static PCollection<Record> apply(PCollection<String> in) {
  StringSplitFn splitter = new StringSplitFn();
  String delimiter = String.valueOf(DelimitedDataUtils.DELIMITER);
  return in.parallelDo(
      "string-split", splitter, MLRecords.csvRecord(in.getTypeFamily(), delimiter));
}

开发者ID:apsaltis，项目名称:oryx，代码行数:6，代码来源:StringSplitFn.java

注：本文中的org.apache.crunch.PCollection.parallelDo方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。