本文整理汇总了 Java 中 org.apache.crunch.PCollection.parallelDo 方法的典型用法代码示例。如果您正苦于以下问题:Java PCollection.parallelDo 方法的具体用法是什么?Java PCollection.parallelDo 怎么用?有哪些 Java PCollection.parallelDo 的使用例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 org.apache.crunch.PCollection 的用法示例。
在下文中一共展示了 PCollection.parallelDo 方法的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: loadKeyedRecords
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Loads variants through {@code readVariants} and converts them into a table of
 * keyed {@code SpecificRecord}s.
 *
 * <p>The value type of the returned table is taken from the converter's
 * {@code getSpecificRecordType()}; the key type is {@code KEY_PTYPE}.
 *
 * @param inputFormat  input format name, forwarded to {@code readVariants}
 * @param inputPath    location of the input, forwarded to {@code readVariants}
 * @param conf         configuration, forwarded to {@code readVariants}
 * @param pipeline     pipeline the input is read with
 * @param variantsOnly forwarded to the {@code ADAMToKeyedSpecificRecordFn} constructor
 * @param flatten      forwarded to the {@code ADAMToKeyedSpecificRecordFn} constructor
 * @param sampleGroup  forwarded to both {@code readVariants} and the converter
 * @param samples      forwarded to the {@code ADAMToKeyedSpecificRecordFn} constructor
 * @return the keyed records produced by the converter
 * @throws IOException if {@code readVariants} fails
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  // Step 1: get the input as (ADAM variant, genotypes) pairs.
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> variantsWithGenotypes =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  // Step 2: build the function that turns each pair into keyed SpecificRecords.
  ADAMToKeyedSpecificRecordFn toKeyedRecordFn =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The record class is only known at runtime, hence the unchecked conversion.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> valueType = Avros.specifics(toKeyedRecordFn.getSpecificRecordType());

  return variantsWithGenotypes.parallelDo(
      "Convert to keyed SpecificRecords",
      toKeyedRecordFn,
      Avros.tableOf(KEY_PTYPE, valueType));
}
示例2: readVariants
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads the input as a collection of (variant, genotypes) pairs.
 *
 * <p>Only the {@code "VCF"} format is actually read. {@code "AVRO"} and
 * {@code "PARQUET"} are recognized but rejected, and anything else is treated as
 * unrecognized. The {@code conf} and {@code sampleGroup} parameters are not used
 * by this method body.
 *
 * @param inputFormat one of {@code "VCF"}, {@code "AVRO"}, {@code "PARQUET"}
 * @param inputPath   location of the VCF input
 * @param pipeline    pipeline used to read the input
 * @return the VCF records converted by {@code VCFToADAMVariantFn}
 * @throws UnsupportedOperationException for {@code "AVRO"} and {@code "PARQUET"}
 * @throws IllegalStateException         for any other non-VCF format
 * @throws IOException                   declared by the signature
 */
private static PCollection<Pair<Variant, Collection<Genotype>>> readVariants(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    String sampleGroup) throws IOException {
  // Reject everything that is not VCF up front so the happy path reads straight through.
  if (!inputFormat.equals("VCF")) {
    boolean knownButUnsupported =
        inputFormat.equals("AVRO") || inputFormat.equals("PARQUET");
    if (knownButUnsupported) {
      throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
    }
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }

  TableSource<LongWritable, VariantContextWritable> vcfSource = From.formattedFile(
      inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
  // Only the values of the (LongWritable, VariantContextWritable) table are kept.
  PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();

  PType<Pair<Variant, Collection<Genotype>>> pairType = Avros.pairs(
      Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
      Avros.collections(Avros.specifics(Genotype.class)));

  return vcfRecords.parallelDo("VCF to ADAM Variant", new VCFToADAMVariantFn(), pairType);
}
示例3: testCategorical
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Standardizes three CSV rows whose columns 1 and 3 are declared categorical and
 * checks the materialized vectors.
 *
 * <p>In the expected vectors each categorical column occupies three slots with a
 * single 1, while the numeric columns 0 and 2 keep their input values.
 */
@Test
public void testCategorical() {
  PCollection<String> csvLines = MemPipeline.typedCollectionOf(
      Avros.strings(),
      "1.0,a,3.0,y",
      "0.4,b,1.0,x",
      "3.2,c,29.0,z");
  PCollection<Record> records = StringSplitFn.apply(csvLines);

  // Summary built with columns 1 and 3 marked as categorical.
  Summary summary = new Summarizer()
      .categoricalColumns(1, 3)
      .build(records)
      .getValue();

  StandardizeFn standardize = new StandardizeFn(summary);
  PCollection<RealVector> standardized = records.parallelDo(standardize, MLAvros.vector());

  assertEquals(
      ImmutableList.of(
          Vectors.of(1.0, 1, 0, 0, 3.0, 0.0, 1.0, 0.0),
          Vectors.of(0.4, 0, 1, 0, 1.0, 1.0, 0.0, 0.0),
          Vectors.of(3.2, 0, 0, 1, 29.0, 0, 0, 1)),
      standardized.materialize());
}
示例4: apply
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Pairs every element of the input with an integer drawn from
 * {@code nextInt(numFolds)} on a generator seeded with {@code seed}.
 *
 * @param pcollect the collection to tag
 * @param <T>      element type of the collection
 * @return a collection of (drawn integer, element) pairs, produced by a step named
 *         {@code "crossfold"}
 */
public <T> PCollection<Pair<Integer, T>> apply(PCollection<T> pcollect) {
  PTypeFamily family = pcollect.getTypeFamily();
  PType<Pair<Integer, T>> taggedType = family.pairs(family.ints(), pcollect.getPType());

  MapFn<T, Pair<Integer, T>> assignFold = new MapFn<T, Pair<Integer, T>>() {
    // Transient: the generator is created in initialize() instead of being serialized.
    private transient RandomGenerator generator;

    @Override
    public void initialize() {
      // Keep an already-created generator; only create one on first use.
      if (generator != null) {
        return;
      }
      generator = RandomManager.getSeededRandom(seed);
    }

    @Override
    public Pair<Integer, T> map(T element) {
      return Pair.of(generator.nextInt(numFolds), element);
    }
  };

  return pcollect.parallelDo("crossfold", assignFold, taggedType);
}
示例5: byFieldNames
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Converts a collection of Avro specific records into {@code ThriftRecord}s using
 * the named fields.
 *
 * <p>The record schema is obtained by instantiating the collection's type class
 * through its public no-argument constructor and calling {@code getSchema()} on
 * that instance.
 *
 * @param collection         records to convert
 * @param rowKeyFieldName    forwarded to {@code ThriftByFieldNamesFn}
 * @param ttlFieldName       forwarded to {@code ThriftByFieldNamesFn}
 * @param timestampFieldName forwarded to {@code ThriftByFieldNamesFn}
 * @param <T>                the specific record type
 * @return the output of applying {@code ThriftByFieldNamesFn} to the collection
 * @throws RuntimeException if the record class cannot be instantiated; the original
 *                          exception is kept as the cause
 */
public static <T extends SpecificRecord> PCollection<ThriftRecord> byFieldNames(
    final PCollection<T> collection,
    final String rowKeyFieldName,
    final String ttlFieldName,
    final String timestampFieldName
) {
  final Class<T> avroClass = collection.getPType().getTypeClass();

  // A throwaway instance is needed only to read the schema off it.
  final T prototype;
  try {
    prototype = avroClass.getConstructor().newInstance();
  } catch (Exception e) {
    throw new RuntimeException("Could not create an instance of the record to determine it's schema", e);
  }

  return collection.parallelDo(
      new ThriftByFieldNamesFn<T>(
          prototype.getSchema(), rowKeyFieldName, ttlFieldName, timestampFieldName),
      ThriftRecord.PTYPE);
}
示例6: run
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds and runs a Crunch pipeline that filters the "foo" input, inner-joins it
 * with the "bar" input, filters the joined rows, and writes them as text.
 *
 * <p>Expected arguments, in order: foo input path, bar input path, output path,
 * foo value maximum, join value maximum, number of reducers. Extra trailing
 * arguments are ignored.
 *
 * @param args command-line arguments as described above
 * @return {@code 0} if the pipeline succeeded; {@code 1} if it failed or if fewer
 *         than six arguments were supplied
 * @throws NumberFormatException if any of the three numeric arguments is not an integer
 * @throws Exception             declared by the signature
 */
public int run(String[] args) throws Exception {
  // Fail with a usage message instead of an ArrayIndexOutOfBoundsException below.
  if (args == null || args.length < 6) {
    System.err.println("Usage: " + JoinFilterExampleCrunch.class.getSimpleName()
        + " <fooInputPath> <barInputPath> <outputPath>"
        + " <fooValMax> <joinValMax> <numberOfReducers>");
    return 1;
  }

  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  // MapReduce-backed pipeline configured from this tool's configuration.
  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf());

  // Both inputs are read as plain text lines.
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  // Turn foo lines into a (Long, (Long, Integer)) table, then filter it.
  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(),
          Avros.pairs(Avros.longs(), Avros.ints())));
  fooTable = fooTable.filter(new FooFilter(fooValMax));

  // Turn bar lines into a (Long, Integer) table.
  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  // Inner join of the two tables using the requested number of reducers.
  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable =
      joinStrategy.join(fooTable, barTable, JoinType.INNER_JOIN);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));

  // Any existing output at the target path is overwritten.
  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE);

  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
示例7: loadKeyedRecords
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Loads variants through {@code readVariants} and converts them into a table of
 * keyed {@code SpecificRecord}s.
 *
 * <p>The value type of the returned table is taken from the converter's
 * {@code getSpecificRecordType()}; the key type is {@code KEY_PTYPE}.
 *
 * @param inputFormat  input format name, forwarded to {@code readVariants}
 * @param inputPath    location of the input, forwarded to {@code readVariants}
 * @param conf         configuration, forwarded to {@code readVariants}
 * @param pipeline     pipeline the input is read with
 * @param variantsOnly forwarded to the {@code GA4GHToKeyedSpecificRecordFn} constructor
 * @param flatten      forwarded to the {@code GA4GHToKeyedSpecificRecordFn} constructor
 * @param sampleGroup  forwarded to both {@code readVariants} and the converter
 * @param samples      forwarded to the {@code GA4GHToKeyedSpecificRecordFn} constructor
 * @return the keyed records produced by the converter
 * @throws IOException if {@code readVariants} fails
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord> loadKeyedRecords(
    String inputFormat, Path inputPath, Configuration conf, Pipeline pipeline,
    boolean variantsOnly, boolean flatten, String sampleGroup, Set<String> samples)
    throws IOException {
  PCollection<Variant> ga4ghVariants =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  GA4GHToKeyedSpecificRecordFn toKeyedRecordFn =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The record class is only known at runtime, hence the unchecked conversion.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> valueType = Avros.specifics(toKeyedRecordFn.getSpecificRecordType());

  return ga4ghVariants.parallelDo(
      "Convert to keyed SpecificRecords",
      toKeyedRecordFn,
      Avros.tableOf(KEY_PTYPE, valueType));
}
示例8: readVariants
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Reads variants from the given path in one of three formats.
 *
 * <ul>
 *   <li>{@code "VCF"}: configures headers via
 *       {@code VCFToGA4GHVariantFn.configureHeaders}, reads the file with
 *       {@code VCFInputFormat}, and converts each record with
 *       {@code VCFToGA4GHVariantFn}.</li>
 *   <li>{@code "AVRO"}: reads {@code Variant} records directly from an Avro file.</li>
 *   <li>{@code "PARQUET"}: reads {@code Variant} records through an
 *       {@code AvroParquetFileSource}.</li>
 * </ul>
 *
 * @param inputFormat one of {@code "VCF"}, {@code "AVRO"}, {@code "PARQUET"}
 * @param inputPath   location of the input
 * @param conf        configuration; used only on the VCF path
 * @param pipeline    pipeline used to read the input
 * @param sampleGroup used only on the VCF path, when configuring headers
 * @return the variants read from the input
 * @throws IllegalStateException if {@code inputFormat} is none of the three names
 * @throws IOException           declared by the signature
 */
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  if (inputFormat.equals("VCF")) {
    // Header configuration happens before the source is created and read.
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> vcfSource = From.formattedFile(
        inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    return vcfRecords.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  }

  if (inputFormat.equals("AVRO")) {
    return pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  }

  if (inputFormat.equals("PARQUET")) {
    // AvroParquetFileSource is used as a raw type here, hence the suppression.
    @SuppressWarnings("unchecked")
    Source<Variant> parquetSource =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    return pipeline.read(parquetSource);
  }

  throw new IllegalStateException("Unrecognized input format: " + inputFormat);
}
示例9: run
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Converts a text input into records of the {@code combined_log_format.avsc} schema
 * and appends them to a newly created dataset.
 *
 * <p>Expected arguments, in order: input path, dataset repository URI, dataset name.
 *
 * @param args exactly three command-line arguments as described above
 * @return {@code 1} if the argument count is wrong, otherwise {@code 0}
 *         (NOTE(review): the result of the nested {@code run()} call is not
 *         inspected, so {@code 0} is returned regardless of it; confirm this is
 *         intended)
 * @throws Exception if the schema cannot be loaded or the dataset/pipeline calls fail
 */
@Override
public int run(String... args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: " + CombinedLogFormatConverter.class.getSimpleName() +
        " <input> <dataset_uri> <dataset name>");
    return 1;
  }
  String input = args[0];
  String datasetUri = args[1];
  String datasetName = args[2];

  // Schema.Parser.parse(InputStream) leaves the stream open, so close it here;
  // the original code leaked the resource stream.
  final Schema schema;
  java.io.InputStream schemaStream =
      Resources.getResource("combined_log_format.avsc").openStream();
  try {
    schema = new Schema.Parser().parse(schemaStream);
  } finally {
    schemaStream.close();
  }

  // Create the dataset
  DatasetRepository repo = DatasetRepositories.open(datasetUri);
  DatasetDescriptor datasetDescriptor = new DatasetDescriptor.Builder()
      .schema(schema).build();
  Dataset<Object> dataset = repo.create(datasetName, datasetDescriptor);

  // Run the job
  // The schema is handed to ConvertFn in its string form.
  final String schemaString = schema.toString();
  AvroType<GenericData.Record> outputType = Avros.generics(schema);
  PCollection<String> lines = readTextFile(input);
  PCollection<GenericData.Record> records = lines.parallelDo(
      new ConvertFn(schemaString), outputType);
  getPipeline().write(records, CrunchDatasets.asTarget(dataset),
      Target.WriteMode.APPEND);
  run();
  return 0;
}
示例10: createPipeline
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline that reads the generation's {@code inbound/} text input,
 * standardizes it into vectors, applies {@code Crossfold}, and writes Avro output
 * under {@code normalized/}.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output location
 * @throws IOException declared by the signature
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig jobStep = getConfig();
  String instanceDir = jobStep.getInstanceDir();
  long generationID = jobStep.getGenerationID();

  // All keys for this step live under the same instance/generation prefix.
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
  String normalizedKey = generationPrefix + "normalized/";
  if (!validOutputPath(normalizedKey)) {
    return null;
  }
  String inboundKey = generationPrefix + "inbound/";
  String summaryKey = generationPrefix + "summary/";

  Config defaults = ConfigUtils.getDefaultConfig();
  InboundSettings inboundSettings = InboundSettings.create(defaults);
  NormalizeSettings normalizeSettings = NormalizeSettings.create(defaults);

  MRPipeline pipeline = createBasicPipeline(StringSplitFn.class);
  PCollection<Record> records = toRecords(pipeline.read(textInput(inboundKey)));

  StandardizeFn standardize = getStandardizeFn(inboundSettings, normalizeSettings, summaryKey);
  PCollection<RealVector> normalized =
      records.parallelDo("normalize", standardize, MLAvros.vector());

  // The number of cross-folds comes from the "model.cross-folds" setting.
  Crossfold crossfold = new Crossfold(defaults.getInt("model.cross-folds"));
  crossfold.apply(normalized).write(avroOutput(normalizedKey));
  return pipeline;
}
示例11: testSimple
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Runs a default-constructed {@code StandardizeFn} over three all-numeric CSV rows
 * and checks that the materialized vectors carry the same values as the input.
 */
@Test
public void testSimple() {
  PCollection<String> csvLines = MemPipeline.typedCollectionOf(
      Avros.strings(),
      "1.0,2.0,3.0",
      "0.4,2.0,1.0",
      "3.2,17.0,29.0");
  PCollection<Record> records = StringSplitFn.apply(csvLines);

  StandardizeFn standardize = new StandardizeFn();
  PCollection<RealVector> standardized = records.parallelDo(standardize, MLAvros.vector());

  assertEquals(
      ImmutableList.of(
          Vectors.of(1, 2, 3),
          Vectors.of(0.4, 2, 1),
          Vectors.of(3.2, 17, 29)),
      standardized.materialize());
}
示例12: weightedSample
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Samples from a collection of (element, weight) pairs by putting every pair under
 * the single key {@code 0} and delegating to {@code groupedWeightedSample}.
 *
 * @param input      (element, weight) pairs to sample from
 * @param sampleSize forwarded to {@code groupedWeightedSample}
 * @param random     forwarded to {@code groupedWeightedSample}
 * @param <T>        element type
 * @param <N>        weight type
 * @return the values of the table returned by {@code groupedWeightedSample}
 */
public static <T, N extends Number> PCollection<T> weightedSample(
    PCollection<Pair<T, N>> input,
    int sampleSize,
    RandomGenerator random) {
  PTypeFamily family = input.getTypeFamily();

  // Every pair gets the same key, so the grouped sampler sees one single group.
  MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>> keyByZero =
      new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
        @Override
        public Pair<Integer, Pair<T, N>> map(Pair<T, N> weighted) {
          return Pair.of(0, weighted);
        }
      };

  PTable<Integer, Pair<T, N>> singleGroup = input.parallelDo(
      keyByZero, family.tableOf(family.ints(), input.getPType()));
  return groupedWeightedSample(singleGroup, sampleSize, random).values();
}
示例13: createPipeline
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline that parses this generation's {@code inbound/} text input,
 * unions it with the previous generation's {@code input/} data when a previous
 * generation exists, groups by key, and writes the merged result under
 * {@code input/}.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output location
 * @throws IllegalStateException if a previous generation is configured but its
 *                               {@code input/} path does not exist
 * @throws IOException           declared by the signature
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();
  long lastGenerationID = stepConfig.getLastGenerationID();

  String outputKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "input/";
  if (!validOutputPath(outputKey)) {
    return null;
  }

  MRPipeline pipeline = createBasicPipeline(MergeNewOldValuesFn.class);

  // Newly arrived data for the current generation.
  String inboundKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "inbound/";
  PCollection<Pair<Long, NumericIDValue>> parsedInbound = pipeline.read(textInput(inboundKey))
      .parallelDo("inboundParse", new DelimitedInputParseFn(),
          Avros.pairs(ALSTypes.LONGS, ALSTypes.IDVALUE));
  PTable<Pair<Long, Integer>, NumericIDValue> inbound =
      parsedInbound.parallelDo("inbound", new InboundJoinFn(), JOIN_TYPE);

  // A non-negative last generation ID means there is earlier data to merge in.
  boolean hasPreviousGeneration = lastGenerationID >= 0;
  if (hasPreviousGeneration) {
    String previousInputKey =
        Namespaces.getInstanceGenerationPrefix(instanceDir, lastGenerationID) + "input/";
    Preconditions.checkState(Store.get().exists(previousInputKey, false),
        "Input path does not exist: %s", previousInputKey);
    PTable<Pair<Long, Integer>, NumericIDValue> previous =
        pipeline.read(input(previousInputKey, ALSTypes.VALUE_MATRIX))
            .parallelDo("lastGeneration", new JoinBeforeMapFn(), JOIN_TYPE);
    inbound = inbound.union(previous);
  }

  GroupingOptions grouping = GroupingOptions.builder()
      .partitionerClass(JoinUtils.getPartitionerClass(inbound.getTypeFamily()))
      .numReducers(getNumReducers())
      .build();

  inbound
      .groupByKey(grouping)
      .parallelDo(new MergeNewOldValuesFn(), ALSTypes.VALUE_MATRIX)
      .write(output(outputKey));
  return pipeline;
}
示例14: createPipeline
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Builds the pipeline for one k-sketch iteration: reads the normalized vectors,
 * computes each vector's distance to the current sketch, draws a weighted sample,
 * and writes the updated {@code KSketchIndex} under {@code sketch/<iteration>/}.
 *
 * <p>On iteration 1 the index is created via {@code createInitialIndex}; on every
 * other iteration the functions are given the key of the previous iteration's
 * output instead.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         rejects the output location
 * @throws IOException declared by the signature
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig jobStep = getConfig();
  ClusterSettings clusterSettings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

  String instanceDir = jobStep.getInstanceDir();
  long generationID = jobStep.getGenerationID();
  int iteration = jobStep.getIteration();
  String generationPrefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
  String sketchKey = generationPrefix + String.format("sketch/%d/", iteration);
  if (!validOutputPath(sketchKey)) {
    return null;
  }

  // Input: the normalized vectors, stored as (Integer, RealVector) pairs.
  String normalizedKey = generationPrefix + "normalized/";
  MRPipeline pipeline = createBasicPipeline(DistanceToClosestFn.class);
  AvroType<Pair<Integer, RealVector>> pairType = Avros.pairs(Avros.ints(), MLAvros.vector());
  PCollection<Pair<Integer, RealVector>> vectors =
      pipeline.read(avroInput(normalizedKey, pairType));

  // The currently chosen k-sketch vectors live in a KSketchIndex, which is either
  // created now or located through the previous iteration's output key.
  DistanceToClosestFn<RealVector> distanceFn;
  UpdateIndexFn updateFn;
  if (iteration != 1) {
    // Point both functions at the index written by the previous iteration.
    String previousSketchKey = generationPrefix + String.format("sketch/%d/", iteration - 1);
    distanceFn = new DistanceToClosestFn<RealVector>(previousSketchKey);
    updateFn = new UpdateIndexFn(previousSketchKey);
  } else {
    // Iteration 1 is the first real iteration; iteration 0 contains initial state.
    KSketchIndex initialIndex = createInitialIndex(clusterSettings, vectors);
    distanceFn = new DistanceToClosestFn<RealVector>(initialIndex);
    updateFn = new UpdateIndexFn(initialIndex);
  }

  // Distance of each vector in the dataset to the closest vector in the k-sketch.
  PTable<Integer, Pair<RealVector, Double>> weighted = vectors.parallelDo(
      "computeDistances",
      distanceFn,
      Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

  // Weighted reservoir sampling selects another group of getSketchPoints() vectors
  // to add to the k-sketch.
  PTable<Integer, RealVector> sampled = ReservoirSampling.groupedWeightedSample(
      weighted, clusterSettings.getSketchPoints(), RandomManager.getRandom());

  // Fold the newly chosen vectors into the index and persist it.
  sampled.parallelDo("updateIndex", updateFn, Serializables.avro(KSketchIndex.class))
      .write(avroOutput(sketchKey));
  return pipeline;
}
示例15: apply
import org.apache.crunch.PCollection; //导入方法依赖的package包/类
/**
 * Applies a {@code StringSplitFn} to every input string in a step named
 * {@code "string-split"}.
 *
 * <p>The output type is the CSV record type from {@code MLRecords.csvRecord}, built
 * for the input's type family with {@code DelimitedDataUtils.DELIMITER} as the
 * delimiter string.
 *
 * @param in the strings to split
 * @return the resulting records
 */
public static PCollection<Record> apply(PCollection<String> in) {
  StringSplitFn splitter = new StringSplitFn();
  String delimiter = String.valueOf(DelimitedDataUtils.DELIMITER);
  return in.parallelDo(
      "string-split",
      splitter,
      MLRecords.csvRecord(in.getTypeFamily(), delimiter));
}