This article collects typical usage examples of the Java class org.apache.spark.api.java.function.Function. If you are wondering what the Function class is for, how it is used, or what real code using it looks like, the examples selected here may help.
The Function class belongs to the org.apache.spark.api.java.function package. A total of 15 code examples of the Function class are shown below, sorted by popularity by default.
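Before the examples, here is a minimal, self-contained sketch of the pattern they all share: Function<T, R> is Spark's serializable one-argument function interface, passed to transformations such as JavaRDD.map. Everything in this sketch (the LineLengths class name, the sample data, the local master) is illustrative and not taken from the examples below.
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class LineLengths {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("LineLengths").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("spark", "java function", "example"));
        // Function<T, R>: T is the input element type, R is the result type of call().
        JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
            @Override
            public Integer call(String s) throws Exception {
                return s.length();
            }
        });
        System.out.println(lengths.collect()); // [5, 13, 7]
        sc.stop();
    }
}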
Example 1: createNGramDataFrame
import org.apache.spark.api.java.function.Function; // import the required package/class
/**
 * Creates an n-gram data frame from text lines.
 * @param lines an RDD of text lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;
        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words",
            DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty()) });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // extract bigrams with an NGram transformer (n = 2)
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
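Because Function is a single-abstract-method interface, the anonymous class in the map call above can be replaced by a lambda on Java 8+. A minimal sketch of the same mapping written as a lambda, using the variables from the example above:

JavaRDD<Row> rows = lines.map(line -> RowFactory.create(Arrays.asList(line.split("\\s+"))));

One practical difference: an anonymous inner class declared in an instance method keeps an implicit reference to the enclosing object, which then also has to be serializable, whereas a lambda that does not touch this does not; the explicit serialVersionUID also becomes unnecessary.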
Example 2: parse
import org.apache.spark.api.java.function.Function; // import the required package/class
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc a Java Spark context
 * @param sentences a list of PoS-tagged sentences
 * @param outputFileName the output file name
 * @param outputFormat the output format
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    JavaRDD<String> input = jsc.parallelize(sentences);
    JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
    JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
    JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
        private static final long serialVersionUID = -812004521983071103L;
        public Row call(DependencyGraph graph) {
            return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
        }
    });
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame df = sqlContext.createDataFrame(rows, schema);
    if (outputFormat == OutputFormat.TEXT)
        df.select("dependency").write().text(outputFileName);
    else
        df.repartition(1).write().json(outputFileName);
}
Example 3: main
import org.apache.spark.api.java.function.Function; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: JavaSleep <seconds>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaSleep");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer seconds = Integer.parseInt(args[0]);
    Integer[] init_val = new Integer[parallel];
    Arrays.fill(init_val, seconds);
    JavaRDD<Integer> workload = ctx.parallelize(Arrays.asList(init_val), parallel).map(new Function<Integer, Integer>() {
        @Override
        public Integer call(Integer s) throws InterruptedException {
            Thread.sleep(s * 1000);
            return 0;
        }
    });
    List<Integer> output = workload.collect();
    ctx.stop();
}
Example 4: main
import org.apache.spark.api.java.function.Function; // import the required package/class
public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);
    // Initialize the Spark configuration
    SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
    SQLContext sqlContext = new SQLContext(sc);
    // Initialize parameters
    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());
    // Read data from the Kafka stream
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
            StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        private static final long serialVersionUID = 5266880065425088203L;
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });
    JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
        List<ApacheAccessLog> list = new ArrayList<>();
        try {
            // Map each line to a parsed log entry
            list.add(ApacheAccessLog.parseFromLogLine(line));
            return list;
        } catch (RuntimeException e) {
            return list;
        }
    }).cache();
    accessLogsDStream.foreachRDD(rdd -> {
        // Convert the RDD to a DataFrame
        DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
        // Write to Parquet files
        df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());
        return null;
    });
    // Start the streaming server
    jssc.start(); // start the computation
    jssc.awaitTermination(); // wait for termination
}
Example 5: predictForOutput_LogisticRegressionModel
import org.apache.spark.api.java.function.Function; // import the required package/class
public static JavaRDD<Tuple2<Object, Object>> predictForOutput_LogisticRegressionModel(LogisticRegressionModel model, JavaRDD<LabeledPoint> data) {
    JavaRDD<Tuple2<Object, Object>> FeaturesAndPrediction = data.map(
        new Function<LabeledPoint, Tuple2<Object, Object>>() {
            private static final long serialVersionUID = 1L;
            public Tuple2<Object, Object> call(LabeledPoint p) {
                Double prediction = model.predict(p.features());
                return new Tuple2<Object, Object>(p.features(), prediction);
            }
        }
    );
    return FeaturesAndPrediction;
}
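A possible way to inspect the output of this method, assuming a trained LogisticRegressionModel (model) and a JavaRDD<LabeledPoint> test set (testData) are already available; both names and the limit of five rows are illustrative only:

JavaRDD<Tuple2<Object, Object>> predictions = predictForOutput_LogisticRegressionModel(model, testData);
for (Tuple2<Object, Object> t : predictions.take(5)) {
    // t._1() holds the feature vector, t._2() the predicted label
    System.out.println(t._1() + " -> " + t._2());
}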
Example 6: main
import org.apache.spark.api.java.function.Function; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaTeraSort <HDFS_INPUT> <HDFS_OUTPUT>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaTeraSort");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer reducer = Integer.parseInt(IOCommon.getProperty("hibench.default.shuffle.parallelism").get());
    JavaPairRDD<String, String> words = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) throws Exception {
            return new Tuple2<String, String>(s.substring(0, 10), s.substring(10));
        }
    });
    JavaPairRDD<String, String> sorted = words.sortByKey(true, reducer);
    JavaRDD<String> result = sorted.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> e) throws Exception {
            return e._1() + e._2();
        }
    });
    result.saveAsTextFile(args[1]);
    ctx.stop();
}
Example 7: performJavaStream
import org.apache.spark.api.java.function.Function; // import the required package/class
static List<StreamVectors> performJavaStream(String appName, List<StreamVectors> input, int noIters) {
    JavaRDD<StreamVectors> streamVectorsJavaRDD = ExampleUtils.getSparkContext(appName).parallelize(input);
    for (int i = 0; i < noIters; i++) {
        streamVectorsJavaRDD = streamVectorsJavaRDD.map(new Function<StreamVectors, StreamVectors>() {
            @Override
            public StreamVectors call(StreamVectors streamVectors) throws Exception {
                streamVectors.setStartRun(System.nanoTime());
                for (int idx = 0; idx < streamVectors.A.length; idx++) {
                    streamVectors.C[idx] = streamVectors.A[idx];
                }
                for (int idx = 0; idx < streamVectors.A.length; idx++) {
                    streamVectors.B[idx] = streamVectors.scaling_constant * streamVectors.C[idx];
                }
                for (int idx = 0; idx < streamVectors.A.length; idx++) {
                    streamVectors.C[idx] = streamVectors.A[idx] + streamVectors.B[idx];
                }
                for (int idx = 0; idx < streamVectors.A.length; idx++) {
                    streamVectors.A[idx] = streamVectors.B[idx] + streamVectors.scaling_constant * streamVectors.C[idx];
                }
                streamVectors.setEndRun(System.nanoTime());
                return streamVectors;
            }
        });
    }
    return streamVectorsJavaRDD.collect();
}
Example 8: performJavaStream
import org.apache.spark.api.java.function.Function; // import the required package/class
private List<StreamVectors> performJavaStream(String appName, List<StreamVectors> input) {
    return ExampleUtils.getSparkContext(appName).parallelize(input).map(new Function<StreamVectors, StreamVectors>() {
        @Override
        public StreamVectors call(StreamVectors streamVectors) throws Exception {
            streamVectors.setStartRun(System.nanoTime());
            for (int idx = 0; idx < streamVectors.A.length; idx++) {
                streamVectors.C[idx] = streamVectors.A[idx];
            }
            for (int idx = 0; idx < streamVectors.A.length; idx++) {
                streamVectors.B[idx] = streamVectors.scaling_constant * streamVectors.C[idx];
            }
            for (int idx = 0; idx < streamVectors.A.length; idx++) {
                streamVectors.C[idx] = streamVectors.A[idx] + streamVectors.B[idx];
            }
            for (int idx = 0; idx < streamVectors.A.length; idx++) {
                streamVectors.A[idx] = streamVectors.B[idx] + streamVectors.scaling_constant * streamVectors.C[idx];
            }
            streamVectors.setEndRun(System.nanoTime());
            return streamVectors;
        }
    }).collect();
}
Example 9: call
import org.apache.spark.api.java.function.Function; // import the required package/class
@Override
public void call(JavaPairRDD<PublisherGeoKey, AggregationLog> logsRDD) throws Exception {
    if (logsRDD != null) {
        LOG.info(" Data to process in RDD:" + logsRDD.count());
        JavaRDD<AggregationResult> aggResRDD = logsRDD.map(new Function<Tuple2<PublisherGeoKey, AggregationLog>, AggregationResult>() {
            @Override
            public AggregationResult call(
                    Tuple2<PublisherGeoKey, AggregationLog> arg0)
                    throws Exception {
                PublisherGeoKey p = arg0._1;
                AggregationLog a = arg0._2;
                return new AggregationResult(new Timestamp(a.getTimestamp()),
                        p.getPublisher(), p.getGeo(), a.getImps(),
                        (int) a.getUniquesHll().estimatedSize(),
                        a.getSumBids() / a.getImps());
            }
        });
        LOG.info(" Call Data Process Partition");
        aggResRDD.foreachPartition(new SaveLogAggPartition());
    } else {
        LOG.error("Data to process:" + 0);
    }
}
Example 10: main
import org.apache.spark.api.java.function.Function; // import the required package/class
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Big Apple").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    class GetLength implements Function<String, Integer> {
        public Integer call(String s) {
            return s.length();
        }
    }
    class Sum implements Function2<Integer, Integer, Integer> {
        public Integer call(Integer a, Integer b) {
            return a + b;
        }
    }
    JavaRDD<String> lines = sc.textFile("src/main/resources/compressed.gz");
    JavaRDD<Integer> lineLengths = lines.map(new GetLength());
    // Printing an RDD
    lineLengths.foreach(x -> System.out.println(x));
    int totalLength = lineLengths.reduce(new Sum());
    System.out.println(totalLength);
}
Example 11: compile
import org.apache.spark.api.java.function.Function; // import the required package/class
/**
 * Creates an appropriate {@link Function}-based predicate for deploying the given {@link PredicateDescriptor}
 * on Apache Spark.
 *
 * @param predicateDescriptor describes the function
 * @param operator that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <Type> Function<Type, Boolean> compile(
        PredicateDescriptor<Type> predicateDescriptor,
        SparkExecutionOperator operator,
        OptimizationContext.OperatorContext operatorContext,
        ChannelInstance[] inputs) {
    final Predicate<Type> javaImplementation = predicateDescriptor.getJavaImplementation();
    if (javaImplementation instanceof PredicateDescriptor.ExtendedSerializablePredicate) {
        return new ExtendedPredicateAdapater<>(
                (PredicateDescriptor.ExtendedSerializablePredicate<Type>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new PredicateAdapter<>(javaImplementation);
    }
}
Example 12: toTaggedSentence
import org.apache.spark.api.java.function.Function; // import the required package/class
private JavaRDD<String> toTaggedSentence(DataFrame output) {
    return output.javaRDD().map(new Function<Row, String>() {
        private static final long serialVersionUID = 4208643510231783579L;
        @Override
        public String call(Row row) throws Exception {
            String[] tokens = row.getString(0).trim().split("\\s+");
            String[] tags = row.getString(1).trim().split("\\s+");
            if (tokens.length != tags.length) {
                System.err.println("Incompatible lengths!");
                return null;
            }
            StringBuilder sb = new StringBuilder(64);
            for (int j = 0; j < tokens.length; j++) {
                sb.append(tokens[j]);
                sb.append('/');
                sb.append(tags[j]);
                sb.append(' ');
            }
            return sb.toString().trim();
        }
    });
}
Example 13: numCharacters
import org.apache.spark.api.java.function.Function; // import the required package/class
/**
 * Counts the number of non-space characters in this data set. This utility method
 * is used to check the tokenization result.
 * @param lines an RDD of text lines
 * @return the number of non-space characters
 */
int numCharacters(JavaRDD<String> lines) {
    JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
        private static final long serialVersionUID = -2189399343462982586L;
        @Override
        public Integer call(String line) throws Exception {
            line = line.replaceAll("[\\s_]+", "");
            return line.length();
        }
    });
    return lengths.reduce(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = -8438072946884289401L;
        @Override
        public Integer call(Integer e0, Integer e1) throws Exception {
            return e0 + e1;
        }
    });
}
Example 14: toPairFlatMapFunction
import org.apache.spark.api.java.function.Function; // import the required package/class
/** {@link KV} to pair flatmap function. */
public static <K, V> PairFlatMapFunction<Iterator<KV<K, V>>, K, V> toPairFlatMapFunction() {
    return new PairFlatMapFunction<Iterator<KV<K, V>>, K, V>() {
        @Override
        public Iterator<Tuple2<K, V>> call(final Iterator<KV<K, V>> itr) {
            final Iterator<Tuple2<K, V>> outputItr =
                Iterators.transform(
                    itr,
                    new com.google.common.base.Function<KV<K, V>, Tuple2<K, V>>() {
                        @Override
                        public Tuple2<K, V> apply(KV<K, V> kv) {
                            return new Tuple2<>(kv.getKey(), kv.getValue());
                        }
                    });
            return outputItr;
        }
    };
}
Example 15: fromPairFlatMapFunction
import org.apache.spark.api.java.function.Function; // import the required package/class
/** A pair to {@link KV} flatmap function. */
static <K, V> FlatMapFunction<Iterator<Tuple2<K, V>>, KV<K, V>> fromPairFlatMapFunction() {
    return new FlatMapFunction<Iterator<Tuple2<K, V>>, KV<K, V>>() {
        @Override
        public Iterator<KV<K, V>> call(Iterator<Tuple2<K, V>> itr) {
            final Iterator<KV<K, V>> outputItr =
                Iterators.transform(
                    itr,
                    new com.google.common.base.Function<Tuple2<K, V>, KV<K, V>>() {
                        @Override
                        public KV<K, V> apply(Tuple2<K, V> t2) {
                            return KV.of(t2._1(), t2._2());
                        }
                    });
            return outputItr;
        }
    };
}