当前位置: 首页>>代码示例>>Java>>正文


Java MRPipeline.read方法代码示例

本文整理汇总了Java中org.apache.crunch.impl.mr.MRPipeline.read方法的典型用法代码示例。如果您正苦于以下问题：Java MRPipeline.read方法的具体用法？Java MRPipeline.read怎么用？Java MRPipeline.read使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.crunch.impl.mr.MRPipeline的用法示例。


在下文中一共展示了MRPipeline.read方法的3个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: createPipeline

import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Assembles the pipeline that reads the weighted k-sketch vectors of the current generation,
 * replicates them, groups the replicas by key, clusters each group, and writes two compressed
 * text outputs under the generation's {@code eval/} location: the replica centers and the
 * per-replica stats.
 *
 * @return the assembled pipeline, or {@code null} when {@code validOutputPath} rejects the
 *     {@code eval/} output key
 * @throws IOException may be propagated from the helper calls used while building the pipeline
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig jobStepConfig = getConfig();
  Config defaultConfig = ConfigUtils.getDefaultConfig();
  EvaluationSettings evalSettings = EvaluationSettings.create(defaultConfig);

  String instanceDir = jobStepConfig.getInstanceDir();
  long generation = jobStepConfig.getGenerationID();
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generation);

  // Nothing to build when the evaluation output location is not acceptable.
  String evalOutputKey = generationPrefix + "eval/";
  if (!validOutputPath(evalOutputKey)) {
    return null;
  }

  String sketchVectorsKey = generationPrefix + "weighted/weightedKSketchVectors/";
  MRPipeline pipeline = createBasicPipeline(ClosestSketchVectorFn.class);

  PType<Pair<Integer, WeightedRealVector>> foldVectorType = KMeansTypes.FOLD_WEIGHTED_VECTOR;
  PCollection<Pair<Integer, WeightedRealVector>> sketchVectors =
      pipeline.read(avroInput(sketchVectorsKey, foldVectorType));

  // Replicate -> group by key -> cluster.
  ReplicateValuesFn<Pair<Integer, WeightedRealVector>> replicateFn =
      new ReplicateValuesFn<Pair<Integer, WeightedRealVector>>(
          evalSettings.getKValues(), evalSettings.getReplications());
  PCollection<KMeansEvaluationData> evaluationData = sketchVectors
      .parallelDo("replicate",
          replicateFn,
          Avros.tableOf(Avros.pairs(Avros.ints(), Avros.ints()), Avros.pairs(Avros.ints(), MLAvros.weightedVector())))
      .groupByKey(evalSettings.getParallelism())
      .parallelDo("cluster",
          new KMeansClusteringFn(evalSettings),
          Serializables.avro(KMeansEvaluationData.class));

  // The centers themselves go to a text file.
  PCollection<String> centerLines =
      evaluationData.parallelDo("replicaCenters", new CentersOutputFn(generationPrefix), Avros.strings());
  centerLines.write(compressedTextOutput(pipeline.getConfiguration(), evalOutputKey + "replicaCenters/"));

  // The per-replica stats go to a second text file.
  PCollection<String> statLines =
      evaluationData.parallelDo("replicaStats", new StatsOutputFn(), Avros.strings());
  statLines.write(compressedTextOutput(pipeline.getConfiguration(), evalOutputKey + "replicaStats/"));

  return pipeline;
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:39,代码来源:ClusteringStep.java

示例2: createPipeline

import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Assembles the pipeline that writes the current generation's {@code input/} data: the text
 * read from the generation's {@code inbound/} location, unioned with the previous generation's
 * {@code input/} text when the last generation ID is non-negative.
 *
 * @return the assembled pipeline, or {@code null} when {@code validOutputPath} rejects the
 *     {@code input/} output key
 * @throws IOException may be propagated from the helper calls used while building the pipeline
 * @throws IllegalStateException via {@code Preconditions.checkState} when a previous generation
 *     is indicated but its {@code input/} location does not exist in the store
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();

  String instanceDir = stepConfig.getInstanceDir();
  long currentGeneration = stepConfig.getGenerationID();
  long previousGeneration = stepConfig.getLastGenerationID();

  String mergedOutputKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "input/";
  if (!validOutputPath(mergedOutputKey)) {
    return null;
  }

  MRPipeline pipeline = createBasicPipeline(IdentityFn.class);

  // Start from whatever arrived for the current generation.
  String inboundKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "inbound/";
  PCollection<String> merged = pipeline.read(textInput(inboundKey));

  // A non-negative last generation ID means there is earlier input to carry forward.
  if (previousGeneration >= 0) {
    String previousInputKey =
        Namespaces.getInstanceGenerationPrefix(instanceDir, previousGeneration) + "input/";
    boolean previousInputPresent = Store.get().exists(previousInputKey, false);
    Preconditions.checkState(previousInputPresent, "Input path does not exist: %s", previousInputKey);
    merged = merged.union(pipeline.read(textInput(previousInputKey)));
  }

  merged.write(compressedTextOutput(pipeline.getConfiguration(), mergedOutputKey));
  return pipeline;
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:31,代码来源:MergeNewOldStep.java

示例3: createPipeline

import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Assembles the pipeline for one k-sketch sampling iteration: reads the normalized vectors,
 * computes each vector's distance to the closest vector of the current k-sketch, draws a
 * grouped weighted reservoir sample of {@code settings.getSketchPoints()} vectors, and writes
 * the updated {@code KSketchIndex} to {@code sketch/<iteration>/} under the generation prefix.
 *
 * @return the assembled pipeline, or {@code null} when {@code validOutputPath} rejects the
 *     output key for this iteration
 * @throws IOException may be propagated from the helper calls used while building the pipeline
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig jobStepConfig = getConfig();
  ClusterSettings clusterSettings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

  String instanceDir = jobStepConfig.getInstanceDir();
  long generation = jobStepConfig.getGenerationID();
  int iteration = jobStepConfig.getIteration();
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generation);
  String sketchOutputKey = generationPrefix + String.format("sketch/%d/", iteration);
  if (!validOutputPath(sketchOutputKey)) {
    return null;
  }

  // Source data: the normalized vectors of this generation.
  String normalizedKey = generationPrefix + "normalized/";
  MRPipeline pipeline = createBasicPipeline(DistanceToClosestFn.class);
  AvroType<Pair<Integer, RealVector>> vectorType = Avros.pairs(Avros.ints(), MLAvros.vector());
  PCollection<Pair<Integer, RealVector>> vectors = pipeline.read(avroInput(normalizedKey, vectorType));

  // The currently chosen k-sketch vectors live in a KSketchIndex; it is either loaded from
  // the previous iteration's output or created from scratch.
  DistanceToClosestFn<RealVector> distanceFn;
  UpdateIndexFn indexUpdater;
  if (iteration != 1) {
    // Later iterations point at the index location written by the previous iteration.
    String priorIndexKey = generationPrefix + String.format("sketch/%d/", iteration - 1);
    distanceFn = new DistanceToClosestFn<RealVector>(priorIndexKey);
    indexUpdater = new UpdateIndexFn(priorIndexKey);
  } else {
    // Iteration 1 is the first real iteration; iteration 0 contains initial state.
    KSketchIndex initialIndex = createInitialIndex(clusterSettings, vectors);
    distanceFn = new DistanceToClosestFn<RealVector>(initialIndex);
    indexUpdater = new UpdateIndexFn(initialIndex);
  }

  // Distance from every vector in the dataset to its closest k-sketch vector.
  PTable<Integer, Pair<RealVector, Double>> distanceWeighted = vectors.parallelDo(
      "computeDistances",
      distanceFn,
      Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

  // Weighted reservoir sampling picks the next group of getSketchPoints() vectors
  // that will be added to the k-sketch.
  PTable<Integer,RealVector> sampledVectors = ReservoirSampling.groupedWeightedSample(
      distanceWeighted, clusterSettings.getSketchPoints(), RandomManager.getRandom());

  // Fold the newly chosen vectors into the KSketchIndex and persist the result.
  PCollection<KSketchIndex> updatedIndex =
      sampledVectors.parallelDo("updateIndex", indexUpdater, Serializables.avro(KSketchIndex.class));
  updatedIndex.write(avroOutput(sketchOutputKey));

  return pipeline;
}
 
开发者ID:apsaltis,项目名称:oryx,代码行数:51,代码来源:KSketchSamplingStep.java


注:本文中的org.apache.crunch.impl.mr.MRPipeline.read方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。