本文整理汇总了Java中org.apache.spark.api.java.function.FlatMapFunction类的典型用法代码示例。如果您正苦于以下问题：Java FlatMapFunction类的具体用法？Java FlatMapFunction怎么用？Java FlatMapFunction使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
FlatMapFunction类属于org.apache.spark.api.java.function包，在下文中一共展示了FlatMapFunction类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getAllWordsInDoc
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Collects the distinct terms that occur in the term lists of all documents.
 *
 * @param docwordRDD pair RDD in which each key is a doc and each value is the list of terms
 *        extracted from that doc; only the values are read here
 * @return an RDD containing every term of every value list exactly once
 */
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
    // Flatten each per-document term list into its individual terms ...
    final FlatMapFunction<List<String>, String> flattenTerms = List::iterator;
    // ... and drop the duplicates across all documents.
    return docwordRDD.values().flatMap(flattenTerms).distinct();
}
示例2: loadClickStremFromTxt
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Loads click-stream records from a text file.
 *
 * <p>Every line of the file is passed to {@code ClickStream.parseFromTextLine} and the records
 * obtained from it are flattened into the resulting RDD.
 *
 * @param clickthroughFile
 *            path of the text file to read
 * @param sc
 *            the spark context used to read the file
 * @return the parsed click streams as a {@link JavaRDD} of {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
    return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
        private static final long serialVersionUID = 1L;

        @SuppressWarnings("unchecked")
        @Override
        public Iterator<ClickStream> call(String line) throws Exception {
            // NOTE(review): this cast assumes parseFromTextLine yields a List<ClickStream>; its
            // declaration is not visible from here - confirm.
            List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
            // A List is not an Iterator: casting it (as the previous code did) compiles but fails
            // with a ClassCastException at runtime, so return the list's iterator instead.
            return clickthroughs.iterator();
        }
    });
}
示例3: checkByRateInParallel
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Runs {@code checkByRate} for every entry of the user RDD of {@code this.httpType} and logs the
 * sum of the returned values.
 *
 * <p>The work is done per partition: each partition opens its own {@code ESDriver} with a bulk
 * processor, checks all of its users, and releases the driver again - also when a check fails.
 *
 * @throws InterruptedException declared by the signature; the raising call is not visible in this
 *         method - presumably one of the helpers, confirm
 * @throws IOException declared by the signature; see the note on InterruptedException
 */
void checkByRateInParallel() throws InterruptedException, IOException {
    JavaRDD<String> userRDD = getUserRDD(this.httpType);
    LOG.info("Original User count: {}", userRDD.count());

    int userCount = userRDD.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
        ESDriver tmpES = new ESDriver(props);
        try {
            tmpES.createBulkProcessor();
            try {
                // Collect eagerly: the driver is released before Spark consumes the result.
                List<Integer> realUserNums = new ArrayList<>();
                while (iterator.hasNext()) {
                    realUserNums.add(checkByRate(tmpES, iterator.next()));
                }
                return realUserNums.iterator();
            } finally {
                tmpES.destroyBulkProcessor();
            }
        } finally {
            // Always release the driver, even if a check or the bulk processor threw.
            tmpES.close();
        }
    }).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);

    LOG.info("User count: {}", Integer.toString(userCount));
}
示例4: fromPairFlatMapFunction
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Creates a flatmap function that converts an iterator of {@link Tuple2} pairs into an iterator
 * of {@link KV}s carrying the same key and value.
 */
static <K, V> FlatMapFunction<Iterator<Tuple2<K, V>>, KV<K, V>> fromPairFlatMapFunction() {
    // Each tuple is mapped to KV.of(first element, second element).
    return tuples -> Iterators.transform(tuples, pair -> KV.of(pair._1(), pair._2()));
}
示例5: functionToFlatMapFunction
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Adapts a per-element {@link Function} to a {@link FlatMapFunction} that takes an {@link
 * Iterator} as input, so that a function written for a map operation can be reused where a
 * flatmap function over iterators is required.
 *
 * <p>An exception thrown by {@code func} for an element is rethrown wrapped in a {@link
 * RuntimeException}.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} and applies {@code func} to
 *     every element of it.
 */
public static <InputT, OutputT>
    FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
    return inputs ->
        Iterators.transform(
            inputs,
            input -> {
                try {
                    return func.call(input);
                } catch (Exception e) {
                    // The transforming function cannot propagate checked exceptions: wrap them.
                    throw new RuntimeException(e);
                }
            });
}
示例6: queryChronixChunks
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Low-level chunked query.
 *
 * <p>The shard list of the collection is fetched first; the query is then sent to every shard
 * in parallel and the returned time series chunks are combined into one RDD.
 *
 * @param query Solr query
 * @param zkHost Zookeeper host
 * @param collection the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks); empty when the collection lists no shard
 * @throws SolrServerException declared by the signature; presumably raised while obtaining the
 *         shard list - confirm against ChronixSolrCloudStorage
 * @throws IOException declared by the signature; presumably raised while obtaining the shard
 *         list - confirm against ChronixSolrCloudStorage
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {
    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);
    // One partition per shard. Spark rejects a non-positive number of slices, so an empty shard
    // list falls back to a single (empty) partition instead of failing when the RDD is evaluated.
    int numSlices = Math.max(1, shards.size());
    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, numSlices).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
示例7: toObservations
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Transformation: Transforms the ChronixRDD into a RDD of MetricObservations (pair of timestamp &amp; value + dimensions).
 *
 * <p>Every point of a time series becomes one observation that carries the metric name and the
 * dimension values (host, measurement series, process, metric group, aggregation level) of its
 * series. A dimension missing from the series attributes is passed on as {@code null}.
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> {
        // The dimensions belong to the series, not to a single point: read them once per series
        // instead of once per point. Objects.toString(x, null) is the null-safe read - it yields
        // null for an absent value and x.toString() otherwise.
        final String host = java.util.Objects.toString(ts.attributes().get(MetricDimension.HOST), null);
        final String series =
                java.util.Objects.toString(ts.attributes().get(MetricDimension.MEASUREMENT_SERIES), null);
        final String process = java.util.Objects.toString(ts.attributes().get(MetricDimension.PROCESS), null);
        final String group = java.util.Objects.toString(ts.attributes().get(MetricDimension.METRIC_GROUP), null);
        final String ag =
                java.util.Objects.toString(ts.attributes().get(MetricDimension.AGGREGATION_LEVEL), null);

        //convert Point/MetricTimeSeries to MetricObservation
        return ts.points().map(point -> new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        )).iterator();
    });
}
示例8: main
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Word-count demo: reads {@code data/test1.txt} on a local Spark master, splits every line with
 * {@code tokenize}, counts the lower-cased tokens and prints the counts twice - once as a map
 * and once as a list of tuples.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // JavaSparkContext is Closeable: try-with-resources stops the context even if a job fails.
    try (JavaSparkContext sc = new JavaSparkContext("local", "Hello Spark")) {
        JavaRDD<String> lines = sc.textFile("data/test1.txt");
        // The plain lambda "line -> tokenize(line)" worked for version 1.5 but not for 2.0;
        // here call() returns an Iterator, hence the explicit .iterator().
        JavaRDD<String> tokens = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) {
                return tokenize(s).iterator();
            }
        });
        JavaPairRDD<String, Integer> counts =
                tokens.mapToPair(token -> new Tuple2<String, Integer>(token.toLowerCase(), 1))
                        .reduceByKey((count1, count2) -> count1 + count2);
        // Typed map instead of the raw Map used before.
        Map<String, Integer> countMap = counts.collectAsMap();
        System.out.println(countMap);
        List<Tuple2<String, Integer>> collection = counts.collect();
        System.out.println(collection);
    }
}
示例9: execute
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Counts the words of the tweets stored in {@code inputFile}.
 *
 * <p>Every line of the file is parsed as the raw JSON of one tweet, the tweet text is split on
 * single spaces, the words are lower-cased and the occurrences of each word are summed up.
 *
 * @param spark the Spark context used to read {@code inputFile}
 * @return the per-word counts wrapped by {@code SortedCounts.create}
 */
@SuppressWarnings("serial")
@Override
public SortedCounts<String> execute(final JavaSparkContext spark) {
    final JavaRDD<String> textFile = spark.textFile(inputFile);
    final JavaRDD<String> words = textFile.flatMap(new FlatMapFunction<String, String>() {
        // Returns an Iterator (not an Iterable) to match the FlatMapFunction signature used by
        // the other flatmap functions; fully qualified so no additional import is needed.
        @Override
        public java.util.Iterator<String> call(final String rawJSON) throws TwitterException {
            final Status tweet = TwitterObjectFactory.createStatus(rawJSON);
            final String text = tweet.getText();
            return Arrays.asList(text.split(" ")).iterator();
        }
    });
    // Pair every lower-cased word with an initial count of 1 ...
    final JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(final String s) {
            return new Tuple2<String, Integer>(s.toLowerCase(), 1);
        }
    });
    // ... and add the counts up per word.
    final JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(final Integer a, final Integer b) {
            return a + b;
        }
    });
    return SortedCounts.create(counts);
}
示例10: partition
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Redistributes the objects of {@code rawSpatialRDD} according to the given partitioner.
 *
 * <p>Each object is first expanded by {@code partitioner.placeObject} into (Integer key, object)
 * pairs, the pairs are partitioned with the partitioner, and finally the keys are stripped again
 * so that only the objects remain.
 *
 * @param partitioner the spatial partitioner that places the objects and partitions the pairs
 * @return the objects of the repartitioned pair RDD
 */
private JavaRDD<T> partition(final SpatialPartitioner partitioner) {
    // Step 1: let the partitioner emit the (key, object) pairs for every object.
    final PairFlatMapFunction<T, Integer, T> placeObjects = new PairFlatMapFunction<T, Integer, T>() {
        @Override
        public Iterator<Tuple2<Integer, T>> call(T spatialObject) throws Exception {
            return partitioner.placeObject(spatialObject);
        }
    };

    // Step 2: expose only the object (second element) of every pair of a partition, without
    // copying the partition; removal is not supported.
    final FlatMapFunction<Iterator<Tuple2<Integer, T>>, T> dropKeys =
            new FlatMapFunction<Iterator<Tuple2<Integer, T>>, T>() {
                @Override
                public Iterator<T> call(final Iterator<Tuple2<Integer, T>> keyedObjects) throws Exception {
                    return new Iterator<T>() {
                        @Override
                        public boolean hasNext() {
                            return keyedObjects.hasNext();
                        }

                        @Override
                        public T next() {
                            return keyedObjects.next()._2();
                        }

                        @Override
                        public void remove() {
                            throw new UnsupportedOperationException();
                        }
                    };
                }
            };

    // The trailing "true" is passed through to mapPartitions unchanged
    // (presumably Spark's preservesPartitioning flag - confirm).
    return this.rawSpatialRDD
            .flatMapToPair(placeObjects)
            .partitionBy(partitioner)
            .mapPartitions(dropKeys, true);
}
示例11: main
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Splits every line of the input file on single spaces and prints how often each word occurs,
 * one {@code word:count} pair per line.
 *
 * @param args {@code args[0]} is the Spark master, {@code args[1]} the input file
 * @throws IllegalArgumentException if not exactly two arguments are given
 * @throws Exception declared by the signature for anything the Spark job may raise
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new IllegalArgumentException("Usage BasicFlatMap sparkMaster inputFile");
    }
    // JavaSparkContext is Closeable: try-with-resources stops the context even if the job fails.
    try (JavaSparkContext sc = new JavaSparkContext(
            args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"))) {
        JavaRDD<String> rdd = sc.textFile(args[1]);
        JavaRDD<String> words = rdd.flatMap(
                new FlatMapFunction<String, String>() {
                    // Returns an Iterator (not an Iterable) to match the FlatMapFunction
                    // signature used by the other flatmap functions; fully qualified so no
                    // additional import is needed.
                    @Override
                    public java.util.Iterator<String> call(String x) {
                        return Arrays.asList(x.split(" ")).iterator();
                    }
                });
        Map<String, Long> result = words.countByValue();
        for (Entry<String, Long> entry : result.entrySet()) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    }
}
示例12: getReadsFunction
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the flatmap function that turns a shard of reads into one {@link ReadWalkerContext} per
 * read.
 *
 * @param bReferenceSource broadcast reference source, or {@code null} to create the contexts
 *        with a {@code null} reference
 * @param bFeatureManager broadcast feature manager, or {@code null} to create the contexts with
 *        a {@code null} feature manager
 * @param sequenceDictionary dictionary used to pad the shard interval and to build the in-memory
 *        reference
 * @param readShardPadding padding passed to {@code expandWithinContig} for the shard interval
 * @return the flatmap function described above
 */
private static FlatMapFunction<Shard<GATKRead>, ReadWalkerContext> getReadsFunction(
        Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, int readShardPadding) {
    return (FlatMapFunction<Shard<GATKRead>, ReadWalkerContext>) readShard -> {
        // Reference bases are fetched for the padded shard interval.
        final SimpleInterval paddedInterval =
                readShard.getInterval().expandWithinContig(readShardPadding, sequenceDictionary);

        final ReferenceDataSource reference;
        if (bReferenceSource == null) {
            reference = null;
        } else {
            reference = new ReferenceMemorySource(
                    bReferenceSource.getValue().getReferenceBases(paddedInterval), sequenceDictionary);
        }

        final FeatureManager features;
        if (bFeatureManager == null) {
            features = null;
        } else {
            features = bFeatureManager.getValue();
        }

        // One context per read, each scoped to the interval of its read.
        return StreamSupport.stream(readShard.spliterator(), false)
                .map(read -> {
                    final SimpleInterval readInterval = getReadInterval(read);
                    final ReferenceContext referenceContext = new ReferenceContext(reference, readInterval);
                    final FeatureContext featureContext = new FeatureContext(features, readInterval);
                    return new ReadWalkerContext(read, referenceContext, featureContext);
                })
                .iterator();
    };
}
示例13: getVariantsFunction
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the flatmap function that turns a shard of variants into one {@link
 * VariantWalkerContext} per variant that starts inside the shard.
 *
 * @param bReferenceSource broadcast reference source, or {@code null} to create the contexts
 *        with a {@code null} reference
 * @param bFeatureManager broadcast feature manager, or {@code null} to create the contexts with
 *        a {@code null} feature manager
 * @param sequenceDictionary dictionary used to pad the shard interval and to build the in-memory
 *        reference
 * @param variantShardPadding padding passed to {@code expandWithinContig} for the shard interval
 * @return the flatmap function described above
 */
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final Broadcast<ReferenceMultiSource> bReferenceSource,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMSequenceDictionary sequenceDictionary, final int variantShardPadding) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) variantShard -> {
        // Reference bases are fetched for the padded shard interval.
        final SimpleInterval paddedInterval =
                variantShard.getInterval().expandWithinContig(variantShardPadding, sequenceDictionary);

        final ReferenceDataSource reference;
        if (bReferenceSource == null) {
            reference = null;
        } else {
            reference = new ReferenceMemorySource(
                    bReferenceSource.getValue().getReferenceBases(paddedInterval), sequenceDictionary);
        }

        final FeatureManager features;
        if (bFeatureManager == null) {
            features = null;
        } else {
            features = bFeatureManager.getValue();
        }

        return StreamSupport.stream(variantShard.spliterator(), false)
                // only include variants that start in the shard
                .filter(variant -> {
                    if (variant.getStart() < variantShard.getStart()) {
                        return false;
                    }
                    return variant.getStart() <= variantShard.getEnd();
                })
                .map(variant -> {
                    final SimpleInterval variantInterval = new SimpleInterval(variant);
                    final ReadsContext noReads = new ReadsContext(); // empty
                    final ReferenceContext referenceContext = new ReferenceContext(reference, variantInterval);
                    final FeatureContext featureContext = new FeatureContext(features, variantInterval);
                    return new VariantWalkerContext(variant, noReads, referenceContext, featureContext);
                })
                .iterator();
    };
}
示例14: shardsToAssemblyRegions
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the flatmap function that converts the read shards delivered by an iterator into
 * assembly regions.
 *
 * @param reference broadcast reference source
 * @param hcArgsBroadcast broadcast HaplotypeCaller arguments used to create the engine
 * @param assemblyArgs sharding arguments; a positive {@code maxReadsPerAlignmentStart} enables
 *        positional downsampling of the reads
 * @param header the SAM header handed to the engine and the downsampler
 * @param annotatorEngineBroadcast broadcast annotator engine used to create the engine
 * @return a flatmap function emitting {@code Tuple2<AssemblyRegion, SimpleInterval>} elements,
 *         each pairing an AssemblyRegion with the interval it was generated in
 */
private static FlatMapFunction<Iterator<Shard<GATKRead>>, Tuple2<AssemblyRegion, SimpleInterval>> shardsToAssemblyRegions(
        final Broadcast<ReferenceMultiSource> reference,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final ShardingArgumentCollection assemblyArgs,
        final SAMFileHeader header,
        final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return shardIterator -> {
        // Reference adapter, engine and downsampler are created once per invocation and are
        // shared by all shards the iterator delivers.
        final ReferenceMultiSourceAdapter referenceSource = new ReferenceMultiSourceAdapter(reference.value());
        final HaplotypeCallerArgumentCollection hcArgs = hcArgsBroadcast.value();
        final VariantAnnotatorEngine annotatorEngine = annotatorEngineBroadcast.getValue();
        final HaplotypeCallerEngine hcEngine =
                new HaplotypeCallerEngine(hcArgs, false, false, header, referenceSource, annotatorEngine);

        // No downsampler (null) unless a positive per-start read limit is configured.
        final ReadsDownsampler readsDownsampler;
        if (assemblyArgs.maxReadsPerAlignmentStart > 0) {
            readsDownsampler = new PositionalDownsampler(assemblyArgs.maxReadsPerAlignmentStart, header);
        } else {
            readsDownsampler = null;
        }

        return Utils.stream(shardIterator)
                //TODO we've hacked multi interval shards here with a shim, but we should investigate as smarter approach https://github.com/broadinstitute/gatk/issues/4299
                .map(readShard -> new ShardToMultiIntervalShardAdapter<>(
                        new DownsampleableSparkReadShard(
                                new ShardBoundary(readShard.getInterval(), readShard.getPaddedInterval()),
                                readShard,
                                readsDownsampler)))
                .flatMap(shardToRegion(assemblyArgs, header, referenceSource, hcEngine))
                .iterator();
    };
}
示例15: testPutPairsInSamePartition
import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Checks that {@code putPairsInSamePartition} keeps the number of partitions and leaves the
 * expected number of reads in every partition of a queryname-sorted set of paired reads.
 *
 * @param numPairs number of read pairs to create
 * @param numPartitions number of partitions the reads are created with
 * @param expectedReadsPerPartition expected read count of each partition after the call
 */
@Test(dataProvider = "readPairsAndPartitions")
public void testPutPairsInSamePartition(int numPairs, int numPartitions, int[] expectedReadsPerPartition) throws IOException {
    final JavaSparkContext sparkContext = SparkContextFactory.getTestSparkContext();
    final SAMFileHeader samHeader = ArtificialReadUtils.createArtificialSamHeader();
    samHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);

    final JavaRDD<GATKRead> inputReads = createPairedReads(sparkContext, samHeader, numPairs, numPartitions);
    final JavaRDD<GATKRead> pairedReads =
            new ReadsSparkSource(sparkContext).putPairsInSamePartition(samHeader, inputReads);

    // Materialize every partition as one list so that its read count can be inspected.
    final FlatMapFunction<Iterator<GATKRead>, List<GATKRead>> partitionToList =
            readsOfPartition -> Iterators.singletonIterator(Lists.newArrayList(readsOfPartition));
    final List<List<GATKRead>> readsByPartition = pairedReads.mapPartitions(partitionToList).collect();

    assertEquals(readsByPartition.size(), numPartitions);
    for (int partition = 0; partition < numPartitions; partition++) {
        assertEquals(readsByPartition.get(partition).size(), expectedReadsPerPartition[partition]);
    }
    // Sanity check of the test data itself: every pair contributes two reads.
    assertEquals(Arrays.stream(expectedReadsPerPartition).sum(), numPairs * 2);
}