本文整理汇总了Java中org.apache.crunch.impl.mr.MRPipeline.read方法的典型用法代码示例。如果您正苦于以下问题：Java MRPipeline.read方法的具体用法？Java MRPipeline.read怎么用？Java MRPipeline.read使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.crunch.impl.mr.MRPipeline的用法示例。
在下文中一共展示了MRPipeline.read方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Builds the pipeline that reads the weighted k-sketch vectors of the current
 * generation, replicates and clusters them, and writes two text outputs
 * (replica centers and replica stats) under the generation's {@code eval/} key.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IOException if any of the invoked helpers reports an I/O failure
 */
@Override
protected MRPipeline createPipeline() throws IOException {
    JobStepConfig jobStep = getConfig();
    EvaluationSettings evalSettings = EvaluationSettings.create(ConfigUtils.getDefaultConfig());

    String generationPrefix =
        Namespaces.getInstanceGenerationPrefix(jobStep.getInstanceDir(), jobStep.getGenerationID());
    String evalKey = generationPrefix + "eval/";
    if (!validOutputPath(evalKey)) {
        // Nothing to do for this step; callers receive null instead of a pipeline.
        return null;
    }

    String sketchVectorsKey = generationPrefix + "weighted/weightedKSketchVectors/";
    String centersKey = evalKey + "replicaCenters/";
    String statsKey = evalKey + "replicaStats/";

    MRPipeline pipeline = createBasicPipeline(ClosestSketchVectorFn.class);

    PType<Pair<Integer, WeightedRealVector>> vectorType = KMeansTypes.FOLD_WEIGHTED_VECTOR;
    PCollection<Pair<Integer, WeightedRealVector>> sketchVectors =
        pipeline.read(avroInput(sketchVectorsKey, vectorType));

    // replicate -> group -> cluster: yields one evaluation record per clustering run
    PCollection<KMeansEvaluationData> evaluationData = sketchVectors
        .parallelDo(
            "replicate",
            new ReplicateValuesFn<Pair<Integer, WeightedRealVector>>(
                evalSettings.getKValues(), evalSettings.getReplications()),
            Avros.tableOf(
                Avros.pairs(Avros.ints(), Avros.ints()),
                Avros.pairs(Avros.ints(), MLAvros.weightedVector())))
        .groupByKey(evalSettings.getParallelism())
        .parallelDo(
            "cluster",
            new KMeansClusteringFn(evalSettings),
            Serializables.avro(KMeansEvaluationData.class));

    // The centers themselves go to a compressed text output
    PCollection<String> centerLines =
        evaluationData.parallelDo("replicaCenters", new CentersOutputFn(generationPrefix), Avros.strings());
    centerLines.write(compressedTextOutput(pipeline.getConfiguration(), centersKey));

    // Per-replica stats go to a second compressed text output
    PCollection<String> statLines =
        evaluationData.parallelDo("replicaStats", new StatsOutputFn(), Avros.strings());
    statLines.write(compressedTextOutput(pipeline.getConfiguration(), statsKey));

    return pipeline;
}
示例2: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Builds the pipeline that writes the current generation's {@code input/} data:
 * the generation's {@code inbound/} text, unioned with the previous generation's
 * {@code input/} text when a previous generation exists (ID &gt;= 0).
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IOException if any of the invoked helpers reports an I/O failure
 * @throws IllegalStateException if a previous generation is configured but its
 *         {@code input/} path does not exist in the store
 */
@Override
protected MRPipeline createPipeline() throws IOException {
    JobStepConfig step = getConfig();
    String instanceDir = step.getInstanceDir();
    long currentGeneration = step.getGenerationID();
    long previousGeneration = step.getLastGenerationID();

    String mergedOutputKey =
        Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "input/";
    if (!validOutputPath(mergedOutputKey)) {
        return null;
    }

    MRPipeline pipeline = createBasicPipeline(IdentityFn.class);

    String newDataKey =
        Namespaces.getInstanceGenerationPrefix(instanceDir, currentGeneration) + "inbound/";
    PCollection<String> newData = pipeline.read(textInput(newDataKey));

    PCollection<String> merged;
    if (lastGenerationAbsent(previousGeneration)) {
        merged = newData;
    } else {
        // Carry the previous generation's input forward alongside the new inbound data
        String previousInputKey =
            Namespaces.getInstanceGenerationPrefix(instanceDir, previousGeneration) + "input/";
        Preconditions.checkState(
            Store.get().exists(previousInputKey, false),
            "Input path does not exist: %s",
            previousInputKey);
        PCollection<String> previousData = pipeline.read(textInput(previousInputKey));
        merged = newData.union(previousData);
    }

    merged.write(compressedTextOutput(pipeline.getConfiguration(), mergedOutputKey));
    return pipeline;
}

/** Returns true when the given generation ID is negative, i.e. there is no previous generation. */
private static boolean lastGenerationAbsent(long previousGeneration) {
    return previousGeneration < 0;
}
示例3: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; //导入方法依赖的package包/类
/**
 * Builds the pipeline for one k-sketch iteration: reads the normalized vectors,
 * computes each vector's distance to the closest vector in the current k-sketch,
 * draws a weighted reservoir sample of new sketch points, and writes the updated
 * {@code KSketchIndex} under {@code sketch/<iteration>/}.
 *
 * <p>Storage keys are built with plain string concatenation rather than
 * {@code String.format("%d")}: the latter formats with the JVM's default locale
 * and can emit non-ASCII digits, which would produce keys that do not match the
 * paths written by other steps.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output key
 * @throws IOException if any of the invoked helpers reports an I/O failure
 */
@Override
protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

    String instanceDir = stepConfig.getInstanceDir();
    long generationID = stepConfig.getGenerationID();
    int iteration = stepConfig.getIteration();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    // Locale-independent: integer concatenation always yields ASCII digits.
    String outputKey = prefix + "sketch/" + iteration + "/";
    if (!validOutputPath(outputKey)) {
        return null;
    }

    // get normalized vectors
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(DistanceToClosestFn.class);
    AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector());
    PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType));

    // either create or load the set of currently chosen k-sketch vectors
    // they are stored in a KSketchIndex object
    DistanceToClosestFn<RealVector> distanceToClosestFn;
    UpdateIndexFn updateIndexFn;
    if (iteration == 1) { // Iteration 1 is the first real iteration; iteration 0 contains initial state
        KSketchIndex index = createInitialIndex(settings, in);
        distanceToClosestFn = new DistanceToClosestFn<RealVector>(index);
        updateIndexFn = new UpdateIndexFn(index);
    } else {
        // Get the index location from the previous iteration
        String previousIndexKey = prefix + "sketch/" + (iteration - 1) + "/";
        distanceToClosestFn = new DistanceToClosestFn<RealVector>(previousIndexKey);
        updateIndexFn = new UpdateIndexFn(previousIndexKey);
    }

    // compute distance of each vector in dataset to closest vector in k-sketch
    PTable<Integer, Pair<RealVector, Double>> weighted = in.parallelDo(
        "computeDistances",
        distanceToClosestFn,
        Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

    // run weighted reservoir sampling on the vector to select another group of
    // settings.getSketchPoints() to add to the k-sketch
    PTable<Integer, RealVector> kSketchSample = ReservoirSampling.groupedWeightedSample(
        weighted, settings.getSketchPoints(), RandomManager.getRandom());

    // update the KSketchIndex with the newly-chosen vectors
    kSketchSample.parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class))
        .write(avroOutput(outputKey));
    return p;
}