This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaPairRDD.mapToPair. If you have been wondering how to use JavaPairRDD.mapToPair in Java, or are looking for concrete examples of it, the hand-picked code samples below should help. You can also explore further usage examples of its enclosing class, org.apache.spark.api.java.JavaPairRDD.
The following presents 6 code examples of the JavaPairRDD.mapToPair method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
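As background, mapToPair applies a PairFunction to every element of an RDD and returns a new JavaPairRDD of the resulting key/value tuples. Below is a minimal, self-contained sketch of the call; the local master setting and the sample data are illustrative only and not taken from the examples that follow.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class MapToPairSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("mapToPair-sketch");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaPairRDD<String, String> pairs = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("a", "hello world"),
          new Tuple2<>("b", "spark example")));
      // Turn each (key, sentence) pair into a (key, wordCount) pair.
      JavaPairRDD<String, Integer> counts =
          pairs.mapToPair(p -> new Tuple2<>(p._1, p._2.split(" ").length));
      counts.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
    }
  }
}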
Example 1: tokenizeData
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
public JavaPairRDD<String, List<String>> tokenizeData(JavaPairRDD<String, String> datasetsContentRDD, String splitter) throws Exception {
  return datasetsContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(Tuple2<String, String> arg) throws Exception {
      String content = arg._2;
      List<String> tokens = getTokens(content, splitter);
      return new Tuple2<>(arg._1, tokens);
    }
  });
}
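Since PairFunction is a functional interface, the anonymous class above can be collapsed to a lambda on Java 8+ with the same behavior. A sketch, assuming the same getTokens helper and splitter parameter as in the example:

return datasetsContentRDD.mapToPair(
    arg -> new Tuple2<>(arg._1, getTokens(arg._2, splitter)));

With a lambda there is no serializable anonymous class, so the explicit serialVersionUID field is no longer needed.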
Example 2: removeRetiredDataset
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
/**
 * Filter out retired (out-of-date) datasets from user session data.
 *
 * @param es
 *          the Elasticsearch driver
 * @param userDatasetsRDD
 *          datasets extracted from user sessions
 * @return filtered session datasets
 */
public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) {
  Map<String, String> nameMap = this.getOnServiceMetadata(es);
  return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, List<String>> call(Tuple2<String, List<String>> arg0) throws Exception {
      List<String> oriDatasets = arg0._2;
      List<String> newDatasets = new ArrayList<>();
      int size = oriDatasets.size();
      for (int i = 0; i < size; i++) {
        String name = oriDatasets.get(i);
        if (nameMap.containsKey(name)) {
          newDatasets.add(nameMap.get(name));
        }
      }
      return new Tuple2<>(arg0._1, newDatasets);
    }
  });
}
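Note that nameMap is captured by the closure and shipped with every task. When the map is large, broadcasting it once per executor is a common alternative. A sketch under the assumption that a JavaSparkContext named jsc is in scope and org.apache.spark.broadcast.Broadcast is imported (neither appears in the original example):

Broadcast<Map<String, String>> bNameMap = jsc.broadcast(nameMap);
return userDatasetsRDD.mapToPair(arg0 -> {
  List<String> newDatasets = new ArrayList<>();
  for (String name : arg0._2) {
    // look the dataset name up in the broadcast copy of the map
    String mapped = bNameMap.value().get(name);
    if (mapped != null) {
      newDatasets.add(mapped);
    }
  }
  return new Tuple2<>(arg0._1, newDatasets);
});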
Example 3: parsedToRatingRDD
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
/**
* @param parsedRDD parsed input as {@code String[]}
* @return {@link Rating}s ordered by timestamp
*/
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
    try {
      return new Tuple2<>(
          Long.valueOf(tokens[3]),
          new Rating(bUserIDToIndex.value().get(tokens[0]),
                     bItemIDToIndex.value().get(tokens[1]),
                     // Empty value means 'delete'; propagate as NaN
                     tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(tokens));
      throw e;
    }
  });

  if (decayFactor < 1.0) {
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
      long timestamp = timestampRating._1();
      return new Tuple2<>(timestamp, decayRating(timestampRating._2(), timestamp, now, factor));
    });
  }

  if (decayZeroThreshold > 0.0) {
    double theThreshold = decayZeroThreshold;
    timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
  }

  return timestampRatingRDD.sortByKey().values();
}
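The two Broadcast arguments map external user and item IDs to the integer indices that the MLlib Rating class expects. A hedged sketch of how a caller might construct and pass them; the jsc context, the sample IDs, and the maps below are assumptions for illustration, not part of the original code:

Map<String, Integer> userIDToIndex = new HashMap<>();
userIDToIndex.put("user-1", 0);                       // hypothetical external user ID
Map<String, Integer> itemIDToIndex = new HashMap<>();
itemIDToIndex.put("item-9", 0);                       // hypothetical external item ID
Broadcast<Map<String, Integer>> bUserIDToIndex = jsc.broadcast(userIDToIndex);
Broadcast<Map<String, Integer>> bItemIDToIndex = jsc.broadcast(itemIDToIndex);
JavaRDD<Rating> ratings = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);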
Example 4: run
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
public void run() throws IOException {
  FileSystem fs = DistributedFileSystem.get(new Configuration());
  Path inpath = new Path(input);
  Path outpath = new Path(output);
  if (!fs.exists(inpath)) {
    throw new IllegalArgumentException("Input file not found: " + inpath);
  }
  if (fs.exists(outpath)) {
    throw new IllegalArgumentException("Output path already exists, not overwriting it: " + outpath);
  }
  SparkConf conf = new SparkConf();
  conf.setMaster(sparkMaster);
  conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
  JavaSparkContext ctx = new JavaSparkContext(conf);

  //STEP1: READ
  JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
  //.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));

  //STEP2: PARSE
  JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
      (PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
        Metadata md = new Metadata();
        try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
          String content = TikaHolder.tika.parseToString(stream, md);
          md.add("CONTENT", content);
        }
        return new Tuple2<>(rec._1(), md);
      });

  //STEP3: FORMAT
  JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
    String key = rec._1().toString();
    Metadata metadata = rec._2();
    JSONObject object = new JSONObject();
    for (String name : metadata.names()) {
      if (metadata.isMultiValued(name)) {
        JSONArray arr = new JSONArray();
        for (String val : metadata.getValues(name)) {
          arr.add(val);
        }
        object.put(name, arr);
      } else {
        object.put(name, metadata.get(name));
      }
    }
    return key + "\t\t" + object.toJSONString();
  });

  //STEP4: SAVE
  LOG.info("Saving at " + outpath);
  outRDD.saveAsTextFile(output);
  LOG.info("Stopping");
  ctx.stop();
}
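TikaHolder is referenced but not shown in the example; it presumably holds a single shared Tika facade so that every task in the executor JVM reuses one instance. A minimal sketch of what such a holder could look like; the class itself is an assumption, only the org.apache.tika.Tika facade and its parseToString(InputStream, Metadata) method are standard Tika API:

import org.apache.tika.Tika;

class TikaHolder {
  // One Tika facade per JVM, shared by all tasks (assumed shape of the helper).
  static final Tika tika = new Tika();
}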
Example 5: loadVectorFromCSV
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
/**
 * loadVectorFromCSV: Load term vectors from a CSV file.
 *
 * @param spark
 *          spark instance
 * @param csvFileName
 *          csv matrix file
 * @param skipNum
 *          the number of rows to skip; the top skipNum rows of the csv file
 *          are ignored
 * @return JavaPairRDD where each key is a term and the value is the term's
 *         vector in the feature space.
 */
public static JavaPairRDD<String, Vector> loadVectorFromCSV(SparkDriver spark, String csvFileName, int skipNum) {
  // skip the first skipNum lines (e.g. the header row), important!
  JavaRDD<String> importRDD = spark.sc.textFile(csvFileName);
  JavaPairRDD<String, Long> importIdRDD = importRDD.zipWithIndex().filter(new Function<Tuple2<String, Long>, Boolean>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Boolean call(Tuple2<String, Long> v1) throws Exception {
      // keep only rows whose index is >= skipNum
      return v1._2 > (skipNum - 1);
    }
  });
  if (importIdRDD.count() == 0) {
    return null;
  }
  return importIdRDD.mapToPair(new PairFunction<Tuple2<String, Long>, String, Vector>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<String, Vector> call(Tuple2<String, Long> t) throws Exception {
      String[] fields = t._1.split(",");
      String word = fields[0];
      int fieldsize = fields.length;
      int nStart = 1;
      int nEnd = fieldsize - 1;
      if (fieldsize < 2) {
        nStart = 0;
        nEnd = 0;
      }
      // copyOfRange's end index is exclusive, so this takes fields[nStart .. nEnd - 1]
      String[] numfields = Arrays.copyOfRange(fields, nStart, nEnd);
      double[] nums = Stream.of(numfields).mapToDouble(Double::parseDouble).toArray();
      Vector vec = Vectors.dense(nums);
      return new Tuple2<>(word, vec);
    }
  });
}
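A possible calling pattern, assuming spark is the project's SparkDriver wrapper, the file name is illustrative, and the CSV's first line is a header row (all three are assumptions, not shown in the original):

JavaPairRDD<String, Vector> termVectors = loadVectorFromCSV(spark, "term_matrix.csv", 1);
if (termVectors != null) {
  // print a few (term, vector) pairs to sanity-check the load
  termVectors.take(5).forEach(t -> System.out.println(t._1 + " -> " + t._2));
}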
Example 6: call
import org.apache.spark.api.java.JavaPairRDD; // import the package/class this method depends on
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {
  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
             "is the Hadoop config directory on the classpath?");
  }

  JavaPairRDD<K,M> pastData;
  Path inputPathPattern = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
  FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
  if (inputPathStatuses == null || inputPathStatuses.length == 0) {
    log.info("No past data at path(s) {}", inputPathPattern);
    pastData = null;
  } else {
    log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
    Configuration updatedConf = new Configuration(hadoopConf);
    updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(updatedConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);
    pastData = pastWritableData.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));
  }

  if (updateTopic == null || updateBroker == null) {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  } else {
    // This TopicProducer should not be async; sends one big model generally and
    // needs to occur before other updates reliably rather than be buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  }
}
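The final mapToPair here converts the Hadoop Writable key/value pairs back into the application's domain types; WritableToValueFunction belongs to the surrounding project and is not shown. As a standalone illustration of the same idea with plain Spark and Hadoop types, here is a sketch that reads a SequenceFile of Text/Text pairs and maps them to Strings; the jsc context and the path are assumptions for illustration:

// Read a SequenceFile of Text/Text pairs and convert the Writables to plain Strings.
JavaPairRDD<Text, Text> writables =
    jsc.sequenceFile("/data/past/part-00000", Text.class, Text.class);   // hypothetical path
// Copy out of the (possibly reused) Writable objects immediately via toString().
JavaPairRDD<String, String> strings =
    writables.mapToPair(t -> new Tuple2<>(t._1().toString(), t._2().toString()));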