This article collects typical usage examples of the Java class org.apache.crunch.impl.mr.MRPipeline. If you are wondering what MRPipeline is for, how to use it, or where to find working examples, the selected code samples below may help.
The MRPipeline class belongs to the org.apache.crunch.impl.mr package. A total of 15 code examples of the MRPipeline class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
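Before the individual examples, the following minimal sketch shows the basic MRPipeline workflow end to end: construct a pipeline bound to a jar-carrying class and a Hadoop Configuration, read a text input, apply a DoFn, write the result, and finish with done(). The class name MinimalMRPipelineSketch, the upper-casing DoFn, and the use of args[0]/args[1] as input and output paths are illustrative assumptions and are not taken from the examples below.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;

public class MinimalMRPipelineSketch {
  public static void main(String[] args) {
    // Bind the pipeline to a jar-carrying class and a Hadoop Configuration.
    Pipeline pipeline = new MRPipeline(MinimalMRPipelineSketch.class, new Configuration());
    // Read lines of text, upper-case them, and write the result back out as text.
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    PCollection<String> upper = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String input, Emitter<String> emitter) {
        emitter.emit(input.toUpperCase());
      }
    }, Writables.strings());
    pipeline.writeTextFile(upper, args[1]);
    // done() plans and runs the MapReduce jobs, then cleans up temporary output.
    PipelineResult result = pipeline.done();
    System.exit(result.succeeded() ? 0 : 1);
  }
}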
Example 1: testGeneric
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();
  checkTestUsers(outputDataset, 10);
}
Example 2: testGenericParquet
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();
  checkTestUsers(outputDataset, 10);
}
Example 3: testPartitionedSourceAndTarget
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Test
@SuppressWarnings("deprecation")
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  writeTestUsers(inputDataset, 10);
  PartitionKey key = partitionStrategy.partitionKey(0);
  Dataset<Record> inputPart0 = inputDataset.getPartition(key, false);
  Dataset<Record> outputPart0 = outputDataset.getPartition(key, true);
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0, GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();
  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example 4: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();
  String inputKey = Namespaces.getTempPrefix(instanceDir, generationID) + "partialRecommend/";
  String outputKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "recommend/";
  if (!validOutputPath(outputKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(CollectRecommendFn.class);
  p.getConfiguration().set(IDMappingState.ID_MAPPING_KEY,
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "idMapping/");
  // Read the partial recommendations, group them by key, collect the final
  // recommendations, and write the result as compressed text
  PTables.asPTable(p.read(input(inputKey, ALSTypes.VALUE_MATRIX)))
      .groupByKey(groupingOptions())
      .parallelDo("collectRecommend", new CollectRecommendFn(), Avros.strings())
      .write(compressedTextOutput(p.getConfiguration(), outputKey));
  return p;
}
Example 5: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  IterationState iterationState = getIterationState();
  String iterationKey = iterationState.getIterationKey();
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();
  String tempPrefix = Namespaces.getTempPrefix(instanceDir, generationID);
  String outputKey = tempPrefix + "partialRecommend/";
  if (!validOutputPath(outputKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(RecommendReduceFn.class);
  Configuration conf = p.getConfiguration();
  conf.set(Y_KEY_KEY, iterationKey + "Y/");
  PTables.asPTable(p.read(input(tempPrefix + "distributeRecommend/", ALSTypes.REC_TYPE)))
      .groupByKey(groupingOptions())
      .parallelDo("recommend", new RecommendReduceFn(), ALSTypes.VALUE_MATRIX)
      .write(output(outputKey));
  return p;
}
Example 6: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected final MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  IterationState iterationState = getIterationState();
  String iterationKey = iterationState.getIterationKey();
  String xOrY = isX() ? "X/" : "Y/";
  String outputKeyPath =
      Namespaces.getInstanceGenerationPrefix(config.getInstanceDir(), config.getGenerationID()) + xOrY;
  if (!validOutputPath(outputKeyPath)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(PublishMapFn.class);
  p.read(input(iterationKey + xOrY, ALSTypes.DENSE_ROW_MATRIX))
      .parallelDo("publish", new PublishMapFn(), Avros.strings())
      .write(compressedTextOutput(p.getConfiguration(), outputKeyPath));
  return p;
}
Example 7: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected final MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();
  String inputKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "input/";
  String outputKey = Namespaces.getTempPrefix(instanceDir, generationID) + getSuffix();
  if (!validOutputPath(outputKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(ToVectorReduceFn.class);
  getMatrix(p, inputKey)
      .groupByKey(groupingOptions())
      .parallelDo("toVectors", new ToVectorReduceFn(), ALSTypes.SPARSE_ROW_MATRIX)
      .write(output(outputKey));
  return p;
}
Example 8: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();
  String outputKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "knownItems/";
  if (!validOutputPath(outputKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(CollectKnownItemsFn.class);
  // Really should read in and exclude tag IDs but doesn't really hurt much
  p.read(input(Namespaces.getTempPrefix(instanceDir, generationID) + "userVectors/", ALSTypes.SPARSE_ROW_MATRIX))
      .parallelDo("collectKnownItems", new CollectKnownItemsFn(), Avros.strings())
      .write(compressedTextOutput(p.getConfiguration(), outputKey));
  return p;
}
Example 9: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected final MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String tempPrefix = Namespaces.getTempPrefix(config.getInstanceDir(), config.getGenerationID());
  String outputPathKey = tempPrefix + getPopularPathDir() + '/';
  if (!validOutputPath(outputPathKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(PopularMapFn.class);
  p.read(input(tempPrefix + getSourceDir() + '/', ALSTypes.SPARSE_ROW_MATRIX))
      .parallelDo("popularMap", new PopularMapFn(), Avros.tableOf(ALSTypes.INTS, ALSTypes.ID_SET))
      .groupByKey(groupingOptions())
      //.combineValues(new FastIDSetAggregator())
      .parallelDo("popularReduce", new PopularReduceFn(), ALSTypes.LONGS)
      .write(output(outputPathKey));
  return p;
}
Example 10: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceDir = config.getInstanceDir();
  long generationID = config.getGenerationID();
  String tempPrefix = Namespaces.getTempPrefix(instanceDir, generationID);
  String outputPathKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "similarItems/";
  if (!validOutputPath(outputPathKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(SimilarReduceFn.class);
  p.getConfiguration().set(IDMappingState.ID_MAPPING_KEY,
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "idMapping/");
  PTables.asPTable(p.read(input(tempPrefix + "distributeSimilar/", ALSTypes.VALUE_MATRIX)))
      .groupByKey(groupingOptions())
      .parallelDo("similarReduce", new SimilarReduceFn(), Avros.strings())
      .write(compressedTextOutput(p.getConfiguration(), outputPathKey));
  return p;
}
Example 11: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();
  String outputKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "summary/";
  if (!validOutputPath(outputKey)) {
    return null;
  }
  String inboundKey = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "inbound/";
  InboundSettings settings = InboundSettings.create(ConfigUtils.getDefaultConfig());
  MRPipeline p = createBasicPipeline(StringSplitFn.class);
  PCollection<Record> records = toRecords(p.read(textInput(inboundKey)));
  PCollection<String> jsonSummary = getSummarizer(settings).buildJson(records);
  jsonSummary.write(compressedTextOutput(p.getConfiguration(), outputKey));
  return p;
}
Example 12: createPipeline
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig config = getConfig();
  String instanceGenerationPrefix =
      Namespaces.getInstanceGenerationPrefix(config.getInstanceDir(), config.getGenerationID());
  String outputPathKey = instanceGenerationPrefix + "trees/";
  if (!validOutputPath(outputPathKey)) {
    return null;
  }
  MRPipeline p = createBasicPipeline(DistributeExampleFn.class);
  // Distribute the inbound examples into groups, then build trees from each group
  p.read(textInput(instanceGenerationPrefix + "inbound/"))
      .parallelDo("distributeData",
          new DistributeExampleFn(),
          Avros.tableOf(Avros.ints(), Avros.strings()))
      .groupByKey(groupingOptions())
      .parallelDo("buildTrees", new BuildTreeFn(), Avros.strings())
      .write(compressedTextOutput(p.getConfiguration(), outputPathKey));
  return p;
}
Example 13: startMapReduce
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
@Override
public void startMapReduce(String taskName) {
  String outputDirectory = propertyConfig.getProperty("sqoop.task." + taskName + ".toJobConfig.outputDirectory");
  String hadoopAddress = propertyConfig.getProperty("sqoop.task." + taskName + ".tolink.linkConfig.uri");
  Pipeline pipeline = new MRPipeline(CrunchMapReduce.class, new Configuration());
  Class<AvroParquetFileSourceTarget> avroParquetFileSourceTargetClass = AvroParquetFileSourceTarget.class;
}
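Example 13 ends right after constructing the pipeline and looking up the AvroParquetFileSourceTarget class, so the snippet never actually reads or writes data. As a purely hypothetical continuation (not part of the original code), a helper like the one below could wire a Parquet source-target into that pipeline; the readAndDumpParquet name, the schema parameter, and the "-as-text" output suffix are illustrative assumptions.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.io.parquet.AvroParquetFileSourceTarget;
import org.apache.crunch.types.avro.Avros;
import org.apache.hadoop.fs.Path;

// Hypothetical helper, not in the original source: read the Parquet files under
// outputDirectory as generic Avro records and dump them back out as text.
private static void readAndDumpParquet(Pipeline pipeline, String outputDirectory, Schema schema) {
  PCollection<GenericData.Record> records = pipeline.read(
      new AvroParquetFileSourceTarget<GenericData.Record>(
          new Path(outputDirectory), Avros.generics(schema)));
  pipeline.writeTextFile(records, outputDirectory + "-as-text"); // assumed output location
  pipeline.done();
}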
Example 14: run
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
public int run(String[] args) throws Exception {
  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);
  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath); //<2>
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);
  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo( //<3>
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(),
          Avros.pairs(Avros.longs(), Avros.ints())));
  fooTable = fooTable.filter(new FooFilter(fooValMax)); //<4>
  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));
  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy = //<5>
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
      .join(fooTable, barTable, JoinType.INNER_JOIN);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax));
  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
Example 15: run
import org.apache.crunch.impl.mr.MRPipeline; // import the required package/class
public int run(String[] args) throws Exception {
  super.parseArguments(args);
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(ComputeReadDepthInInterval.class, getConf());
  // Set up source to read from BAMs/SAMs
  TableSource<Long, SAMRecordWritable> samSource = From.formattedFile(inputPath,
      AnySAMInputFormat.class,
      Writables.longs(),
      Writables.writables(SAMRecordWritable.class));
  // Read in SAMRecords
  PCollection<SAMRecordWritable> records = pipeline.read(samSource).values();
  // Filter reads to mapped reads
  PCollection<SAMRecordWritable> mappedReads = records.filter(new MappedReadFilter());
  // Map each mapped read to a (contig, interval) pair
  PCollection<Pair<String, Integer>> contigIntervals = mappedReads.parallelDo(
      "ComputeDepthInInterval",
      new ComputeDepthInInterval(intervalLength),
      Writables.pairs(Writables.strings(), Writables.ints()));
  // Compute read depth distribution
  PTable<Pair<String, Integer>, Long> contigIntervalCounts = contigIntervals.count();
  // Instruct the pipeline to write the resulting counts to a text file.
  pipeline.writeTextFile(contigIntervalCounts, outputPath);
  // Execute the pipeline as a MapReduce.
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}