本文整理汇总了Java中org.apache.crunch.types.avro.Avros类的典型用法代码示例。如果您正苦于以下问题：Java Avros类的具体用法？Java Avros怎么用？Java Avros使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
Avros类属于org.apache.crunch.types.avro包，在下文中一共展示了Avros类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: loadKeyedRecords
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Reads variants through {@code readVariants} and converts each one into a keyed
 * {@code SpecificRecord} using an {@code ADAMToKeyedSpecificRecordFn}.
 *
 * @param inputFormat the input format name, forwarded to {@code readVariants}
 * @param inputPath the location of the input data
 * @param conf the Hadoop configuration, forwarded to {@code readVariants}
 * @param pipeline the Crunch pipeline the records are read into
 * @param variantsOnly passed to the converter function
 * @param flatten passed to the converter function
 * @param sampleGroup passed to both {@code readVariants} and the converter function
 * @param samples passed to the converter function
 * @return a table of converted records typed as {@code Avros.tableOf(KEY_PTYPE, ...)}
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
    loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
        Set<String> samples)
    throws IOException {
  PCollection<Pair<org.bdgenomics.formats.avro.Variant, Collection<Genotype>>> variantsWithGenotypes =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  // At this point the data are ADAM variant objects; the function below re-keys them
  // and turns them into SpecificRecords.
  ADAMToKeyedSpecificRecordFn toKeyedRecord =
      new ADAMToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The concrete record class is only known to the converter, hence the unchecked PType.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> recordPType =
      Avros.specifics(toKeyedRecord.getSpecificRecordType());

  return variantsWithGenotypes.parallelDo(
      "Convert to keyed SpecificRecords",
      toKeyedRecord,
      Avros.tableOf(KEY_PTYPE, recordPType));
}
示例2: readVariants
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Reads variant data into a collection of ADAM (variant, genotypes) pairs.
 *
 * Only the {@code "VCF"} format is actually read. {@code "AVRO"} and {@code "PARQUET"}
 * are recognised but rejected, and any other value is treated as unrecognised.
 * The {@code conf} and {@code sampleGroup} parameters are not used by this implementation.
 *
 * @param inputFormat the input format name ({@code "VCF"}, {@code "AVRO"} or {@code "PARQUET"})
 * @param inputPath the location of the input data
 * @param conf the Hadoop configuration (unused here)
 * @param pipeline the Crunch pipeline used to read the input
 * @param sampleGroup the sample group identifier (unused here)
 * @return the variants converted by {@code VCFToADAMVariantFn}
 * @throws UnsupportedOperationException if the format is {@code "AVRO"} or {@code "PARQUET"}
 * @throws IllegalStateException if the format is none of the three known names
 * @throws IOException declared for callers; not thrown directly by the visible code
 */
private static PCollection<Pair<Variant, Collection<Genotype>>>
    readVariants(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, String sampleGroup) throws IOException {
  // Reject everything that is not VCF up front so the happy path reads straight through.
  if (!inputFormat.equals("VCF")) {
    if (inputFormat.equals("AVRO") || inputFormat.equals("PARQUET")) {
      throw new UnsupportedOperationException("Unsupported input format: " + inputFormat);
    }
    throw new IllegalStateException("Unrecognized input format: " + inputFormat);
  }

  TableSource<LongWritable, VariantContextWritable> vcfSource =
      From.formattedFile(
          inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
  // Only the values of the (LongWritable, VariantContextWritable) table are needed.
  PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();

  PType<Pair<Variant, Collection<Genotype>>> variantWithGenotypesType =
      Avros.pairs(
          Avros.specifics(org.bdgenomics.formats.avro.Variant.class),
          Avros.collections(Avros.specifics(Genotype.class)));

  return vcfRecords.parallelDo(
      "VCF to ADAM Variant", new VCFToADAMVariantFn(), variantWithGenotypesType);
}
示例3: testDetach
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Runs a {@code DoFns.detach}-wrapped {@code CollectingMapFn} over a single
 * ("key", {@code AvroIterable}) pair and checks that the first emitted collection
 * equals three distinct {@code TestAvroRecord}s.
 */
@Test
public void testDetach() {
  // Arrange: the wrapped function, its single input pair and an in-memory sink.
  DoFn<Pair<String, Iterable<TestAvroRecord>>, Collection<TestAvroRecord>> detachingFn =
      DoFns.detach(new CollectingMapFn(), Avros.specifics(TestAvroRecord.class));
  Iterable<TestAvroRecord> values = (Iterable<TestAvroRecord>) new AvroIterable();
  Pair<String, Iterable<TestAvroRecord>> keyedValues = Pair.of("key", values);
  InMemoryEmitter<Collection<TestAvroRecord>> sink =
      new InMemoryEmitter<Collection<TestAvroRecord>>();

  // Act: drive the DoFn through its lifecycle by hand.
  detachingFn.configure(new Configuration());
  detachingFn.initialize();
  detachingFn.process(keyedValues, sink);
  detachingFn.cleanup(sink);

  // Assert: the collected records differ only in their second field.
  Collection<TestAvroRecord> expected = Lists.newArrayList(
      new TestAvroRecord(new Utf8("something"), new Utf8("*"), 1L),
      new TestAvroRecord(new Utf8("something"), new Utf8("**"), 1L),
      new TestAvroRecord(new Utf8("something"), new Utf8("***"), 1L));
  assertEquals(expected, sink.getOutput().get(0));
}
示例4: asSource
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Wraps the given {@link Dataset} in a Crunch {@link ReadableSource}.
 *
 * The dataset must resolve to a directory through {@code Accessor}; the original
 * documentation describes this as being a FileSystem {@code Dataset}. Its descriptor's
 * format must be {@code Formats.PARQUET} or {@code Formats.AVRO}.
 *
 * @param dataset the dataset to read from
 * @param type the Java type of the entities in the dataset
 * @param <E> the type of entity produced by the source
 * @return the {@link ReadableSource}, or {@code null} when no directory can be
 *         resolved for the dataset (i.e. it is not filesystem-based)
 * @throws UnsupportedOperationException if the dataset's format is neither
 *         {@code Formats.PARQUET} nor {@code Formats.AVRO}
 */
@SuppressWarnings("unchecked")
public static <E> ReadableSource<E> asSource(Dataset<E> dataset, Class<E> type) {
  Path directory = Accessor.getDefault().getDirectory(dataset);
  if (directory == null) {
    return null;
  }

  List<Path> paths = Lists.newArrayList(
      Accessor.getDefault().getPathIterator(dataset));

  // Types that GenericData.Record can be assigned to use the dataset's own schema;
  // every other type gets its Avro type from Avros.records.
  AvroType<E> avroType;
  if (!type.isAssignableFrom(GenericData.Record.class)) {
    avroType = Avros.records(type);
  } else {
    avroType = (AvroType<E>) Avros.generics(dataset.getDescriptor().getSchema());
  }

  final Format format = dataset.getDescriptor().getFormat();
  if (Formats.PARQUET.equals(format)) {
    return new AvroParquetFileSource<E>(paths, avroType);
  }
  if (Formats.AVRO.equals(format)) {
    return new AvroFileSource<E>(paths, avroType);
  }
  throw new UnsupportedOperationException(
      "Not a supported format: " + format);
}
示例5: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds the pipeline that reads the {@code partialRecommend/} data under the temp prefix,
 * groups it by key, applies {@code CollectRecommendFn} and writes the resulting strings
 * to {@code recommend/} under the instance/generation prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output key
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();

  String partialRecommendKey =
      Namespaces.getTempPrefix(instanceDir, generationID) + "partialRecommend/";
  String recommendKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "recommend/";

  if (validOutputPath(recommendKey)) {
    MRPipeline pipeline = createBasicPipeline(CollectRecommendFn.class);
    // Record the idMapping/ location in the job configuration under ID_MAPPING_KEY.
    pipeline.getConfiguration().set(
        IDMappingState.ID_MAPPING_KEY,
        Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "idMapping/");
    PTables.asPTable(pipeline.read(input(partialRecommendKey, ALSTypes.VALUE_MATRIX)))
        .groupByKey(groupingOptions())
        .parallelDo("collectRecommend", new CollectRecommendFn(), Avros.strings())
        .write(compressedTextOutput(pipeline.getConfiguration(), recommendKey));
    return pipeline;
  }
  return null;
}
示例6: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds the pipeline that reads the current iteration's {@code X/} or {@code Y/} matrix
 * (chosen by {@code isX()}), applies {@code PublishMapFn} and writes the resulting strings
 * to the same-named directory under the instance/generation prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output path
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected final MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  IterationState currentIteration = getIterationState();
  String iterationKey = currentIteration.getIterationKey();

  // The same suffix is used for both the input and the output location.
  String matrixDir;
  if (isX()) {
    matrixDir = "X/";
  } else {
    matrixDir = "Y/";
  }

  String publishPath =
      Namespaces.getInstanceGenerationPrefix(
          stepConfig.getInstanceDir(), stepConfig.getGenerationID())
      + matrixDir;

  if (validOutputPath(publishPath)) {
    MRPipeline pipeline = createBasicPipeline(PublishMapFn.class);
    pipeline.read(input(iterationKey + matrixDir, ALSTypes.DENSE_ROW_MATRIX))
        .parallelDo("publish", new PublishMapFn(), Avros.strings())
        .write(compressedTextOutput(pipeline.getConfiguration(), publishPath));
    return pipeline;
  }
  return null;
}
示例7: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds the pipeline that reads the {@code userVectors/} sparse row matrix under the
 * temp prefix, applies {@code CollectKnownItemsFn} and writes the resulting strings to
 * {@code knownItems/} under the instance/generation prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output key
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();

  String knownItemsKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "knownItems/";

  if (validOutputPath(knownItemsKey)) {
    MRPipeline pipeline = createBasicPipeline(CollectKnownItemsFn.class);
    // Tag IDs are not filtered out here. Ideally they would be read in and excluded,
    // but leaving them in does not hurt much.
    String userVectorsKey =
        Namespaces.getTempPrefix(instanceDir, generationID) + "userVectors/";
    pipeline.read(input(userVectorsKey, ALSTypes.SPARSE_ROW_MATRIX))
        .parallelDo("collectKnownItems", new CollectKnownItemsFn(), Avros.strings())
        .write(compressedTextOutput(pipeline.getConfiguration(), knownItemsKey));
    return pipeline;
  }
  return null;
}
示例8: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds a two-stage pipeline over the sparse row matrix in {@code getSourceDir()}:
 * {@code PopularMapFn} emits an (int, ID set) table, which is grouped by key and reduced
 * by {@code PopularReduceFn} to longs written under {@code getPopularPathDir()}.
 * Both directories live under the temp prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output path
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected final MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String tempPrefix =
      Namespaces.getTempPrefix(stepConfig.getInstanceDir(), stepConfig.getGenerationID());
  String popularOutputKey = tempPrefix + getPopularPathDir() + '/';

  if (validOutputPath(popularOutputKey)) {
    MRPipeline pipeline = createBasicPipeline(PopularMapFn.class);
    String sourceKey = tempPrefix + getSourceDir() + '/';
    pipeline.read(input(sourceKey, ALSTypes.SPARSE_ROW_MATRIX))
        .parallelDo(
            "popularMap",
            new PopularMapFn(),
            Avros.tableOf(ALSTypes.INTS, ALSTypes.ID_SET))
        .groupByKey(groupingOptions())
        //.combineValues(new FastIDSetAggregator())
        .parallelDo("popularReduce", new PopularReduceFn(), ALSTypes.LONGS)
        .write(output(popularOutputKey));
    return pipeline;
  }
  return null;
}
示例9: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds the pipeline that reads the {@code distributeSimilar/} data under the temp prefix,
 * groups it by key, applies {@code SimilarReduceFn} and writes the resulting strings to
 * {@code similarItems/} under the instance/generation prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output path
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String instanceDir = stepConfig.getInstanceDir();
  long generationID = stepConfig.getGenerationID();

  String tempPrefix = Namespaces.getTempPrefix(instanceDir, generationID);
  String similarItemsKey =
      Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "similarItems/";

  if (validOutputPath(similarItemsKey)) {
    MRPipeline pipeline = createBasicPipeline(SimilarReduceFn.class);
    // Record the idMapping/ location in the job configuration under ID_MAPPING_KEY.
    pipeline.getConfiguration().set(
        IDMappingState.ID_MAPPING_KEY,
        Namespaces.getInstanceGenerationPrefix(instanceDir, generationID) + "idMapping/");
    PTables.asPTable(
            pipeline.read(input(tempPrefix + "distributeSimilar/", ALSTypes.VALUE_MATRIX)))
        .groupByKey(groupingOptions())
        .parallelDo("similarReduce", new SimilarReduceFn(), Avros.strings())
        .write(compressedTextOutput(pipeline.getConfiguration(), similarItemsKey));
    return pipeline;
  }
  return null;
}
示例10: testCategorical
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Feeds three CSV lines through {@code StringSplitFn}, summarises them with columns
 * 1 and 3 marked categorical, and checks the vectors produced by {@code StandardizeFn}.
 */
@Test
public void testCategorical() {
  PCollection<String> csvLines = MemPipeline.typedCollectionOf(
      Avros.strings(),
      "1.0,a,3.0,y",
      "0.4,b,1.0,x",
      "3.2,c,29.0,z");
  PCollection<Record> records = StringSplitFn.apply(csvLines);

  // Columns 1 (a/b/c) and 3 (y/x/z) hold labels rather than numbers.
  Summary summary = new Summarizer()
      .categoricalColumns(1, 3)
      .build(records)
      .getValue();

  PCollection<RealVector> standardized =
      records.parallelDo(new StandardizeFn(summary), MLAvros.vector());

  // In the expected vectors each categorical column becomes three indicator slots,
  // while the two numeric columns keep their input values.
  assertEquals(
      ImmutableList.of(
          Vectors.of(1.0, 1, 0, 0, 3.0, 0.0, 1.0, 0.0),
          Vectors.of(0.4, 0, 1, 0, 1.0, 1.0, 0.0, 0.0),
          Vectors.of(3.2, 0, 0, 1, 29.0, 0, 0, 1)),
      standardized.materialize());
}
示例11: createPipeline
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Builds a two-stage pipeline over the text input in {@code inbound/}:
 * {@code DistributeExampleFn} emits an (int, string) table, which is grouped by key and
 * passed to {@code BuildTreeFn}; the resulting strings are written to {@code trees/}.
 * Both directories live under the instance/generation prefix.
 *
 * @return the configured pipeline, or {@code null} when {@code validOutputPath}
 *         returns false for the output path
 * @throws IOException declared for the helpers this method calls
 */
@Override
protected MRPipeline createPipeline() throws IOException {
  JobStepConfig stepConfig = getConfig();
  String generationPrefix =
      Namespaces.getInstanceGenerationPrefix(
          stepConfig.getInstanceDir(), stepConfig.getGenerationID());
  String treesKey = generationPrefix + "trees/";

  if (validOutputPath(treesKey)) {
    MRPipeline pipeline = createBasicPipeline(DistributeExampleFn.class);
    String inboundKey = generationPrefix + "inbound/";
    pipeline.read(textInput(inboundKey))
        .parallelDo(
            "distributeData",
            new DistributeExampleFn(),
            Avros.tableOf(Avros.ints(), Avros.strings()))
        .groupByKey(groupingOptions())
        .parallelDo("buildTrees", new BuildTreeFn(), Avros.strings())
        .write(compressedTextOutput(pipeline.getConfiguration(), treesKey));
    return pipeline;
  }
  return null;
}
示例12: run
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Runs the join-and-filter example as a Crunch MapReduce pipeline.
 *
 * Expected arguments, in order:
 * <ol>
 *   <li>path of the "foo" text input</li>
 *   <li>path of the "bar" text input</li>
 *   <li>output path (existing output is overwritten)</li>
 *   <li>threshold passed to {@code FooFilter}</li>
 *   <li>threshold passed to {@code JoinFilter}</li>
 *   <li>number of reducers used for the join</li>
 * </ol>
 *
 * @param args the command-line arguments described above
 * @return 0 if the pipeline succeeded, 1 if it failed, 2 if fewer than six
 *         arguments were supplied
 * @throws Exception if a numeric argument cannot be parsed or the pipeline setup fails
 */
public int run(String[] args) throws Exception {
  // Fail with a readable usage message instead of an ArrayIndexOutOfBoundsException.
  if (args == null || args.length < 6) {
    System.err.println("Usage: JoinFilterExampleCrunch <fooInputPath> <barInputPath> "
        + "<outputPath> <fooValMax> <joinValMax> <numberOfReducers>");
    return 2;
  }

  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  // Create the pipeline on top of this tool's Hadoop configuration.
  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf());

  // Read both inputs as plain text lines.
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  // Parse the foo lines into a (long -> (long, int)) table and filter it.
  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(),
          Avros.pairs(Avros.longs(), Avros.ints())));
  fooTable = fooTable.filter(new FooFilter(fooValMax));

  // Parse the bar lines into a (long -> int) table.
  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  // Inner-join the two tables on their keys with the requested number of reducers.
  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable =
      joinStrategy.join(fooTable, barTable, JoinType.INNER_JOIN);

  // Filter the joined rows, then write them out as text, replacing existing output.
  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));
  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE);

  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
示例13: loadKeyedRecords
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Reads variants through {@code readVariants} and converts each one into a keyed
 * {@code SpecificRecord} using a {@code GA4GHToKeyedSpecificRecordFn}.
 *
 * @param inputFormat the input format name, forwarded to {@code readVariants}
 * @param inputPath the location of the input data
 * @param conf the Hadoop configuration, forwarded to {@code readVariants}
 * @param pipeline the Crunch pipeline the records are read into
 * @param variantsOnly passed to the converter function
 * @param flatten passed to the converter function
 * @param sampleGroup passed to both {@code readVariants} and the converter function
 * @param samples passed to the converter function
 * @return a table of converted records typed as {@code Avros.tableOf(KEY_PTYPE, ...)}
 * @throws IOException declared by {@code readVariants}
 */
@Override
public PTable<Tuple3<String, Long, String>, SpecificRecord>
    loadKeyedRecords(String inputFormat, Path inputPath, Configuration conf,
        Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
        Set<String> samples)
    throws IOException {
  PCollection<Variant> loadedVariants =
      readVariants(inputFormat, inputPath, conf, pipeline, sampleGroup);

  GA4GHToKeyedSpecificRecordFn toKeyedRecord =
      new GA4GHToKeyedSpecificRecordFn(variantsOnly, flatten, sampleGroup, samples);

  // The concrete record class is only known to the converter, hence the unchecked PType.
  @SuppressWarnings("unchecked")
  PType<SpecificRecord> recordPType =
      Avros.specifics(toKeyedRecord.getSpecificRecordType());

  return loadedVariants.parallelDo(
      "Convert to keyed SpecificRecords",
      toKeyedRecord,
      Avros.tableOf(KEY_PTYPE, recordPType));
}
示例14: readVariants
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Reads {@code Variant} records from VCF, Avro or Parquet input.
 *
 * For {@code "VCF"} the headers are configured first via
 * {@code VCFToGA4GHVariantFn.configureHeaders}, then each VCF record is converted with
 * {@code VCFToGA4GHVariantFn}. {@code "AVRO"} and {@code "PARQUET"} inputs are read
 * directly as {@code Variant} records.
 *
 * @param inputFormat the input format name ({@code "VCF"}, {@code "AVRO"} or {@code "PARQUET"})
 * @param inputPath the location of the input data
 * @param conf the Hadoop configuration; only used for the VCF header setup
 * @param pipeline the Crunch pipeline used to read the input
 * @param sampleGroup the sample group identifier; only used for the VCF header setup
 * @return the variants read from the input
 * @throws IllegalStateException if the format is none of the three known names
 * @throws IOException declared for the helpers this method calls
 */
private static PCollection<Variant> readVariants(String inputFormat, Path inputPath,
    Configuration conf, Pipeline pipeline, String sampleGroup) throws IOException {
  if (inputFormat.equals("VCF")) {
    VCFToGA4GHVariantFn.configureHeaders(
        conf, FileUtils.findVcfs(inputPath, conf), sampleGroup);
    TableSource<LongWritable, VariantContextWritable> vcfSource =
        From.formattedFile(
            inputPath, VCFInputFormat.class, LongWritable.class, VariantContextWritable.class);
    // Only the values of the (LongWritable, VariantContextWritable) table are needed.
    PCollection<VariantContextWritable> vcfRecords = pipeline.read(vcfSource).values();
    return vcfRecords.parallelDo(
        "VCF to GA4GH Variant", new VCFToGA4GHVariantFn(), Avros.specifics(Variant.class));
  }

  if (inputFormat.equals("AVRO")) {
    return pipeline.read(From.avroFile(inputPath, Avros.specifics(Variant.class)));
  }

  if (inputFormat.equals("PARQUET")) {
    // NOTE(review): the source is built through a raw type, which is why the
    // unchecked warning is suppressed on this declaration.
    @SuppressWarnings("unchecked")
    Source<Variant> parquetSource =
        new AvroParquetFileSource(inputPath, Avros.specifics(Variant.class));
    return pipeline.read(parquetSource);
  }

  throw new IllegalStateException("Unrecognized input format: " + inputFormat);
}
示例15: loadPartitionedVariants
import org.apache.crunch.types.avro.Avros; //导入依赖的package包/类
/**
 * Loads variants and keys them by partition.
 *
 * Records are first loaded with {@code loadKeyedRecords}, keyed by
 * (contig, pos, sample_group) with a Variant/Call Avro object as the value. If
 * {@code redistribute} is set they are re-keyed, repartitioned and sorted through
 * {@code SecondarySort}; otherwise they are used as loaded. Finally each key is mapped
 * to a string partition key by {@code LocusSampleToPartitionFn}.
 *
 * @param inputFormat the format of the input data (VCF, AVRO, or PARQUET)
 * @param inputPath the input data path
 * @param conf the Hadoop configuration
 * @param pipeline the Crunch pipeline
 * @param variantsOnly whether to ignore samples and only load variants
 * @param flatten whether to flatten the data types
 * @param sampleGroup an identifier for the group of samples being loaded
 * @param samples the samples to include
 * @param redistribute whether to repartition the data by locus/sample group
 * @param segmentSize the number of base pairs in each segment partition
 * @param numReducers the number of reducers to use
 * @return the keyed variant or call records
 * @throws IOException if an I/O error is encountered during loading
 */
public PTable<String, SpecificRecord> loadPartitionedVariants(
    String inputFormat, Path inputPath, Configuration conf,
    Pipeline pipeline, boolean variantsOnly, boolean flatten, String sampleGroup,
    Set<String> samples, boolean redistribute, long segmentSize, int numReducers)
    throws IOException {
  PTable<Tuple3<String, Long, String>, SpecificRecord> keyedRecords =
      loadKeyedRecords(inputFormat, inputPath, conf, pipeline, variantsOnly, flatten,
          sampleGroup, samples);

  PTable<Tuple3<String, Long, String>, SpecificRecord> orderedRecords;
  if (!redistribute) {
    // Without redistribution the input is assumed to be globally sorted already.
    orderedRecords = keyedRecords;
  } else {
    // Equivalent of a DISTRIBUTE BY. The re-keyed table has the shape
    //   partitionKey(chr, chrSeg, sampleGroup) -> (secondaryKey/pos, original (key, value)).
    PTableType<Tuple3<String, Long, String>,
        Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> distributeByType =
        Avros.tableOf(
            Avros.triples(Avros.strings(), Avros.longs(), Avros.strings()),
            Avros.pairs(
                Avros.longs(),
                Avros.pairs(keyedRecords.getKeyType(), keyedRecords.getValueType())));

    PTable<Tuple3<String, Long, String>,
        Pair<Long, Pair<Tuple3<String, Long, String>, SpecificRecord>>> distributed =
        keyedRecords.parallelDo(
            "Re-keying for redistribution",
            new ReKeyDistributeByFn(segmentSize),
            distributeByType);

    // Repartition, sort by position, and restore the original table type.
    orderedRecords = SecondarySort.sortAndApply(
        distributed,
        new UnKeyForDistributeByFn(),
        keyedRecords.getPTableType(),
        numReducers);
  }

  // Replace the (contig, pos, sample_group) keys with string partition keys.
  return orderedRecords.mapKeys(
      "Generate partition keys",
      new LocusSampleToPartitionFn(segmentSize, sampleGroup),
      Avros.strings());
}