当前位置: 首页>>代码示例>>Java>>正文


Java FlatMapFunction类代码示例

本文整理汇总了Java中org.apache.spark.api.java.function.FlatMapFunction的典型用法代码示例。如果您正苦于以下问题:Java FlatMapFunction类的具体用法?Java FlatMapFunction怎么用?Java FlatMapFunction使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


FlatMapFunction类属于org.apache.spark.api.java.function包,下文一共展示了FlatMapFunction类的15个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getAllWordsInDoc

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * getAllWordsInDoc: Collects the distinct terms that occur across all docs.
 *
 * @param docwordRDD pair RDD whose key is a doc and whose value is the list of terms extracted
 *                   from that doc
 * @return an RDD in which every term appears once
 */
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
  // Each per-doc term list is flattened into individual terms; duplicates are dropped afterwards.
  final FlatMapFunction<List<String>, String> termsOfDoc = List::iterator;
  return docwordRDD.values().flatMap(termsOfDoc).distinct();
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:23,代码来源:RDDUtil.java

示例2: loadClickStremFromTxt

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * loadClickStremFromTxt: Load click stream from txt file.
 *
 * <p>Every line of the file is handed to {@code ClickStream.parseFromTextLine} and the parsed
 * entries are flattened into a single RDD.
 *
 * @param clickthroughFile
 *          txt file
 * @param sc
 *          the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
    private static final long serialVersionUID = 1L;

    @SuppressWarnings("unchecked")
    @Override
    public Iterator<ClickStream> call(String line) throws Exception {
      // NOTE(review): this cast assumes parseFromTextLine yields a List<ClickStream>; its
      // declaration is not visible here, so confirm against ClickStream.
      List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
      // A List is not an Iterator: casting it would fail with ClassCastException at runtime,
      // so ask the list for its iterator instead.
      return clickthroughs.iterator();
    }
  });
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:25,代码来源:SessionExtractor.java

示例3: checkByRateInParallel

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Runs {@code checkByRate} for every entry of the user RDD, partition by partition, and logs
 * the sum of the per-user results.
 *
 * <p>Each partition creates its own {@link ESDriver} with a bulk processor; both are released
 * in {@code finally} blocks so they are not leaked when {@code checkByRate} throws.
 *
 * @throws InterruptedException declared by the method signature
 * @throws IOException declared by the method signature
 */
void checkByRateInParallel() throws InterruptedException, IOException {

  JavaRDD<String> userRDD = getUserRDD(this.httpType);
  LOG.info("Original User count: {}", userRDD.count());

  // fold with a zero value (rather than reduce) so that an empty user RDD yields 0 instead of
  // failing with "empty collection".
  int userCount = userRDD.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
    ESDriver tmpES = new ESDriver(props);
    try {
      tmpES.createBulkProcessor();
      try {
        List<Integer> realUserNums = new ArrayList<>();
        while (iterator.hasNext()) {
          String s = iterator.next();
          Integer realUser = checkByRate(tmpES, s);
          realUserNums.add(realUser);
        }
        return realUserNums.iterator();
      } finally {
        // Only destroy the bulk processor once it has actually been created.
        tmpES.destroyBulkProcessor();
      }
    } finally {
      tmpES.close();
    }
  }).fold(0, (Function2<Integer, Integer, Integer>) (a, b) -> a + b);

  LOG.info("User count: {}", userCount);
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:23,代码来源:CrawlerDetection.java

示例4: fromPairFlatMapFunction

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Creates a flatmap function that lazily turns every {@link Tuple2} of the incoming iterator
 * into a {@link KV} carrying the same key and value.
 */
static <K, V> FlatMapFunction<Iterator<Tuple2<K, V>>, KV<K, V>> fromPairFlatMapFunction() {
  // Iterators.transform wraps the input iterator; elements are converted as they are consumed.
  return tuples -> Iterators.transform(tuples, pair -> KV.of(pair._1(), pair._2()));
}
 
开发者ID:apache,项目名称:beam,代码行数:19,代码来源:TranslationUtils.java

示例5: functionToFlatMapFunction

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Adapts a {@link Function} so it can be used where a {@link FlatMapFunction} over an {@link
 * Iterator} is required, which lets functions written for map operations be reused in flatmap
 * operations.
 *
 * <p>The returned function wraps the input iterator with {@code Iterators.transform}; any
 * {@link Exception} thrown by {@code func} is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} as an input and applies the
 *     {@link Function} on every element.
 */
public static <InputT, OutputT>
    FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
  return inputs ->
      Iterators.transform(
          inputs,
          element -> {
            try {
              return func.call(element);
            } catch (Exception e) {
              // The Guava function type cannot throw checked exceptions, so wrap them.
              throw new RuntimeException(e);
            }
          });
}
 
开发者ID:apache,项目名称:beam,代码行数:37,代码来源:TranslationUtils.java

示例6: queryChronixChunks

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Low-level chunked query.
 *
 * <p>The shard list of the collection is fetched from {@code chronixStorage}, parallelized with
 * one partition per shard, and every shard is then streamed individually.
 *
 * @param query          Solr query
 * @param zkHost         Zookeeper host
 * @param collection     the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks)
 * @throws SolrServerException declared by the method signature
 * @throws IOException declared by the method signature
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {

    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);

    // Spark rejects a partition count below 1, so an empty shard list must still request one
    // (empty) partition instead of shards.size() == 0.
    final int numPartitions = Math.max(1, shards.size());

    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, numPartitions).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
 
开发者ID:ChronixDB,项目名称:chronix.spark,代码行数:26,代码来源:ChronixSparkContext.java

示例7: toObservations

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Transformation: Transforms the ChronixRDD into a RDD of MetricObservations (pair of timestamp & value + dimensions).
 *
 * <p>The dimensional attributes are read once per time series and shared by all observations
 * created from its points; a missing attribute results in a {@code null} dimension.
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> {
        // The dimensions belong to the series, not to a point: look each one up a single time
        // here instead of repeating the lookups for every point.
        final Object hostAttr = ts.attributes().get(MetricDimension.HOST);
        final Object seriesAttr = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES);
        final Object processAttr = ts.attributes().get(MetricDimension.PROCESS);
        final Object groupAttr = ts.attributes().get(MetricDimension.METRIC_GROUP);
        final Object agAttr = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL);

        //null-safe conversion of dimensional values
        final String host = hostAttr == null ? null : hostAttr.toString();
        final String series = seriesAttr == null ? null : seriesAttr.toString();
        final String process = processAttr == null ? null : processAttr.toString();
        final String group = groupAttr == null ? null : groupAttr.toString();
        final String ag = agAttr == null ? null : agAttr.toString();

        //convert Point/MetricTimeSeries to MetricObservation
        return ts.points().map(point -> new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        )).iterator();
    });
}
 
开发者ID:ChronixDB,项目名称:chronix.spark,代码行数:28,代码来源:ChronixRDD.java

示例8: main

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Counts the lower-cased tokens of {@code data/test1.txt} on a local Spark context and prints
 * the result twice: as a map and as a list of tuples.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {
  // try-with-resources makes sure the Spark context is shut down even if a job fails.
  try (JavaSparkContext sc = new JavaSparkContext("local", "Hello Spark")) {
    JavaRDD<String> lines = sc.textFile("data/test1.txt");
    // The flatMap function has to return an Iterator here, so the result of tokenize(...) is
    // converted explicitly; passing it through unchanged only worked with the 1.5 API.
    JavaRDD<String> tokens = lines.flatMap(new FlatMapFunction<String, String>() {
      @Override
      public Iterator<String> call(String s) {
        return tokenize(s).iterator();
      }
    });
    JavaPairRDD<String, Integer> counts =
        tokens.mapToPair(
            token ->
                new Tuple2<String, Integer>(token.toLowerCase(), 1))
            .reduceByKey((count1, count2) -> count1 + count2);
    Map<String, Integer> countMap = counts.collectAsMap();
    System.out.println(countMap);
    List<Tuple2<String, Integer>> collection = counts.collect();
    System.out.println(collection);
  }
}
 
开发者ID:mark-watson,项目名称:power-java,代码行数:22,代码来源:HelloSpark.java

示例9: execute

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Word count over a file of raw tweet JSON: every line is parsed into a tweet, its text is split
 * on single spaces, the words are lower-cased and their occurrences are summed up.
 *
 * @param spark the Spark context used to read {@code inputFile}
 * @return the counts wrapped by {@code SortedCounts.create}
 */
@SuppressWarnings("serial")
@Override
public SortedCounts<String> execute(final JavaSparkContext spark) {
	// Step 1: one tweet per input line -> the words of its text.
	// NOTE(review): call() returns an Iterable here, which presumably targets the Spark 1.x
	// FlatMapFunction signature - confirm against the Spark version in use.
	final FlatMapFunction<String, String> tweetToWords = new FlatMapFunction<String, String>() {
		@Override
		public Iterable<String> call(final String rawJSON) throws TwitterException {
			final String tweetText = TwitterObjectFactory.createStatus(rawJSON).getText();
			return Arrays.asList(tweetText.split(" "));
		}
	};
	// Step 2: word -> (lower-cased word, 1).
	final PairFunction<String, String, Integer> wordToOne = new PairFunction<String, String, Integer>() {
		@Override
		public Tuple2<String, Integer> call(final String word) {
			return new Tuple2<String, Integer>(word.toLowerCase(), 1);
		}
	};
	// Step 3: add up the ones per word.
	final Function2<Integer, Integer, Integer> sum = new Function2<Integer, Integer, Integer>() {
		@Override
		public Integer call(final Integer left, final Integer right) {
			return left + right;
		}
	};
	return SortedCounts.create(
			spark.textFile(inputFile).flatMap(tweetToWords).mapToPair(wordToOne).reduceByKey(sum));
}
 
开发者ID:Zuehlke,项目名称:SHMACK,代码行数:27,代码来源:WordCount.java

示例10: partition

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Redistributes the raw spatial RDD according to the given partitioner.
 *
 * <p>Every object is first keyed through {@code partitioner.placeObject}, the keyed pairs are
 * partitioned by the partitioner, and finally the keys are stripped again so that only the
 * objects remain.
 *
 * @param partitioner supplies the keys for each object and partitions the keyed pairs
 * @return the objects of the raw spatial RDD, laid out as decided by the partitioner
 */
private JavaRDD<T> partition(final SpatialPartitioner partitioner) {
	// Keys each object with whatever the partitioner returns for it.
	final PairFlatMapFunction<T, Integer, T> assignKeys = new PairFlatMapFunction<T, Integer, T>() {
		@Override
		public Iterator<Tuple2<Integer, T>> call(T spatialObject) throws Exception {
			return partitioner.placeObject(spatialObject);
		}
	};

	// Lazily unwraps the value of every (key, object) pair of a partition.
	final FlatMapFunction<Iterator<Tuple2<Integer, T>>, T> dropKeys =
		new FlatMapFunction<Iterator<Tuple2<Integer, T>>, T>() {
			@Override
			public Iterator<T> call(final Iterator<Tuple2<Integer, T>> keyedObjects) throws Exception {
				return new Iterator<T>() {
					@Override
					public boolean hasNext() {
						return keyedObjects.hasNext();
					}

					@Override
					public T next() {
						return keyedObjects.next()._2();
					}

					@Override
					public void remove() {
						throw new UnsupportedOperationException();
					}
				};
			}
		};

	// "true" is passed as the second mapPartitions argument, as before.
	return this.rawSpatialRDD
		.flatMapToPair(assignKeys)
		.partitionBy(partitioner)
		.mapPartitions(dropKeys, true);
}
 
开发者ID:DataSystemsLab,项目名称:GeoSpark,代码行数:31,代码来源:SpatialRDD.java

示例11: main

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Splits every line of the input file on single spaces and prints how often each word occurs,
 * one {@code word:count} pair per line.
 *
 * @param args {@code args[0]} is the Spark master, {@code args[1]} the input file
 * @throws IllegalArgumentException if not exactly two arguments are given
 * @throws Exception declared by the method signature
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new IllegalArgumentException("Usage BasicFlatMap sparkMaster inputFile");
  }

  JavaSparkContext sc = new JavaSparkContext(
      args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<String> rdd = sc.textFile(args[1]);
    // NOTE(review): call() returns an Iterable, which presumably targets the Spark 1.x
    // FlatMapFunction signature - confirm against the Spark version in use.
    JavaRDD<String> words = rdd.flatMap(
        new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String x) {
            return Arrays.asList(x.split(" "));
          }
        });
    Map<String, Long> result = words.countByValue();
    for (Entry<String, Long> entry : result.entrySet()) {
      System.out.println(entry.getKey() + ":" + entry.getValue());
    }
  } finally {
    // Release the context even when the job fails.
    sc.stop();
  }
}
 
开发者ID:holdenk,项目名称:learning-spark-examples,代码行数:19,代码来源:BasicFlatMap.java

示例12: getReadsFunction

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the function that expands a shard of reads into one {@link ReadWalkerContext} per read.
 *
 * <p>For every shard the interval is padded by {@code readShardPadding} within its contig. If a
 * reference broadcast is present, the reference bases of the padded interval back the
 * {@link ReferenceContext} of each read; otherwise the reference is {@code null}. The same
 * applies to the feature manager broadcast and the {@link FeatureContext}.
 *
 * @param bReferenceSource broadcast reference source, may be {@code null}
 * @param bFeatureManager broadcast feature manager, may be {@code null}
 * @param sequenceDictionary dictionary used for padding and for the in-memory reference
 * @param readShardPadding padding applied to the shard interval
 * @return the flatmap function from shard to read contexts
 */
private static FlatMapFunction<Shard<GATKRead>, ReadWalkerContext> getReadsFunction(
        Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, int readShardPadding) {
    return shard -> {
        // The padded interval is computed up front, regardless of whether a reference exists.
        final SimpleInterval shardWithPadding =
                shard.getInterval().expandWithinContig(readShardPadding, sequenceDictionary);

        final ReferenceDataSource referenceForShard;
        if (bReferenceSource == null) {
            referenceForShard = null;
        } else {
            referenceForShard = new ReferenceMemorySource(
                    bReferenceSource.getValue().getReferenceBases(shardWithPadding), sequenceDictionary);
        }

        final FeatureManager featureManager;
        if (bFeatureManager == null) {
            featureManager = null;
        } else {
            featureManager = bFeatureManager.getValue();
        }

        return StreamSupport.stream(shard.spliterator(), false)
                .map(read -> {
                    final SimpleInterval span = getReadInterval(read);
                    return new ReadWalkerContext(
                            read,
                            new ReferenceContext(referenceForShard, span),
                            new FeatureContext(featureManager, span));
                })
                .iterator();
    };
}
 
开发者ID:broadinstitute,项目名称:gatk,代码行数:18,代码来源:ReadWalkerSpark.java

示例13: getVariantsFunction

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the function that expands a shard of variants into one {@link VariantWalkerContext}
 * per variant that starts inside the shard.
 *
 * <p>For every shard the interval is padded by {@code variantShardPadding} within its contig.
 * If a reference broadcast is present, the reference bases of the padded interval back the
 * {@link ReferenceContext} of each variant; otherwise the reference is {@code null}. The same
 * applies to the feature manager broadcast and the {@link FeatureContext}. The reads context of
 * every result is empty.
 *
 * @param bReferenceSource broadcast reference source, may be {@code null}
 * @param bFeatureManager broadcast feature manager, may be {@code null}
 * @param sequenceDictionary dictionary used for padding and for the in-memory reference
 * @param variantShardPadding padding applied to the shard interval
 * @return the flatmap function from shard to variant contexts
 */
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final Broadcast<ReferenceMultiSource> bReferenceSource,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMSequenceDictionary sequenceDictionary, final int variantShardPadding) {
    return shard -> {
        // The padded interval is computed up front, regardless of whether a reference exists.
        final SimpleInterval shardWithPadding =
                shard.getInterval().expandWithinContig(variantShardPadding, sequenceDictionary);

        final ReferenceDataSource referenceForShard;
        if (bReferenceSource == null) {
            referenceForShard = null;
        } else {
            referenceForShard = new ReferenceMemorySource(
                    bReferenceSource.getValue().getReferenceBases(shardWithPadding), sequenceDictionary);
        }

        final FeatureManager featureManager;
        if (bFeatureManager == null) {
            featureManager = null;
        } else {
            featureManager = bFeatureManager.getValue();
        }

        return StreamSupport.stream(shard.spliterator(), false)
                // skip variants that start before or after the shard
                .filter(variant -> !(variant.getStart() < shard.getStart() || variant.getStart() > shard.getEnd()))
                .map(variant -> {
                    final SimpleInterval span = new SimpleInterval(variant);
                    return new VariantWalkerContext(
                            variant,
                            new ReadsContext(), // empty
                            new ReferenceContext(referenceForShard, span),
                            new FeatureContext(featureManager, span));
                })
                .iterator();
    };
}
 
开发者ID:broadinstitute,项目名称:gatk,代码行数:23,代码来源:VariantWalkerSpark.java

示例14: shardsToAssemblyRegions

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Builds the per-partition function that turns shards of reads into assembly regions.
 *
 * <p>For each invocation a {@link HaplotypeCallerEngine} is created from the broadcast values.
 * Reads are downsampled with a {@link PositionalDownsampler} only when
 * {@code assemblyArgs.maxReadsPerAlignmentStart} is positive.
 *
 * @return a function producing {@code Tuple2<AssemblyRegion, SimpleInterval>} elements
 */
private static FlatMapFunction<Iterator<Shard<GATKRead>>, Tuple2<AssemblyRegion, SimpleInterval>> shardsToAssemblyRegions(
        final Broadcast<ReferenceMultiSource> reference,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final ShardingArgumentCollection assemblyArgs,
        final SAMFileHeader header,
        final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return shards -> {
        final ReferenceMultiSource broadcastReference = reference.value();
        final ReferenceMultiSourceAdapter referenceSource = new ReferenceMultiSourceAdapter(broadcastReference);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(
                hcArgsBroadcast.value(), false, false, header, referenceSource, annotatorEngineBroadcast.getValue());

        // No downsampler at all unless a positive per-start read limit is configured.
        final ReadsDownsampler readsDownsampler;
        if (assemblyArgs.maxReadsPerAlignmentStart > 0) {
            readsDownsampler = new PositionalDownsampler(assemblyArgs.maxReadsPerAlignmentStart, header);
        } else {
            readsDownsampler = null;
        }

        //TODO we've hacked multi interval shards here with a shim, but we should investigate as smarter approach https://github.com/broadinstitute/gatk/issues/4299
        return Utils.stream(shards)
                .map(shard -> new ShardToMultiIntervalShardAdapter<>(
                        new DownsampleableSparkReadShard(
                                new ShardBoundary(shard.getInterval(), shard.getPaddedInterval()),
                                shard,
                                readsDownsampler)))
                .flatMap(shardToRegion(assemblyArgs, header, referenceSource, hcEngine))
                .iterator();
    };
}
 
开发者ID:broadinstitute,项目名称:gatk,代码行数:25,代码来源:HaplotypeCallerSpark.java

示例15: testPutPairsInSamePartition

import org.apache.spark.api.java.function.FlatMapFunction; //导入依赖的package包/类
/**
 * Checks that {@code putPairsInSamePartition} keeps the partition count and yields the expected
 * number of reads in every partition for queryname-sorted artificial read pairs.
 */
@Test(dataProvider = "readPairsAndPartitions")
public void testPutPairsInSamePartition(int numPairs, int numPartitions, int[] expectedReadsPerPartition) throws IOException {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);

    final JavaRDD<GATKRead> reads = createPairedReads(ctx, header, numPairs, numPartitions);
    final JavaRDD<GATKRead> pairedReads = new ReadsSparkSource(ctx).putPairsInSamePartition(header, reads);

    // Collapse every partition into a single list so partition contents can be inspected.
    final FlatMapFunction<Iterator<GATKRead>, List<GATKRead>> partitionAsList =
            it -> Iterators.singletonIterator(Lists.newArrayList(it));
    final List<List<GATKRead>> readsByPartition = pairedReads.mapPartitions(partitionAsList).collect();

    assertEquals(readsByPartition.size(), numPartitions);
    for (int partition = 0; partition < numPartitions; partition++) {
        assertEquals(readsByPartition.get(partition).size(), expectedReadsPerPartition[partition]);
    }
    // Sanity check of the test data itself: every pair contributes two reads.
    assertEquals(Arrays.stream(expectedReadsPerPartition).sum(), numPairs * 2);
}
 
开发者ID:broadinstitute,项目名称:gatk,代码行数:17,代码来源:ReadsSparkSourceUnitTest.java


注:本文中的org.apache.spark.api.java.function.FlatMapFunction类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。