This article collects typical usage examples of the Java method org.apache.spark.rdd.RDD.map. If you are wondering how exactly RDD.map is used, how to call it from Java, or what real-world examples look like, the curated code samples below should help. You can also explore further usage examples of the enclosing class, org.apache.spark.rdd.RDD.
The following presents 10 code examples of the RDD.map method, ordered by popularity by default.
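Before the individual examples, a minimal sketch may help clarify what calling the Scala-defined RDD.map from Java involves: the method takes a scala.Function1 plus a ClassTag describing the result type, so the mapper is typically a small serializable class extending AbstractFunction1. The class and method names below (RddMapSketch, LengthFunction, toLengths) are illustrative only and do not belong to any of the projects quoted later.

import java.io.Serializable;
import org.apache.spark.rdd.RDD;
import scala.reflect.ClassTag$;
import scala.runtime.AbstractFunction1;

public final class RddMapSketch {

    // The mapper handed to RDD.map must be a scala.Function1 and serializable,
    // because Spark serializes it and ships it to the executors.
    static class LengthFunction extends AbstractFunction1<String, Integer> implements Serializable {
        @Override
        public Integer apply(final String s) {
            return s.length();
        }
    }

    // Assuming an existing RDD<String>, e.g. obtained from SparkContext.textFile:
    static RDD<Integer> toLengths(final RDD<String> lines) {
        return lines.map(new LengthFunction(),
                ClassTag$.MODULE$.<Integer>apply(Integer.class));
    }
}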
Example 1: doOperation
import org.apache.spark.rdd.RDD; // import the package/class required by this method
private RDD<Element> doOperation(final GetRDDOfElements operation,
final Context context,
final AccumuloStore accumuloStore)
throws OperationException {
final Configuration conf = getConfiguration(operation);
final SparkContext sparkContext = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext();
sparkContext.hadoopConfiguration().addResource(conf);
// Use batch scan option when performing seeded operation
InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
addIterators(accumuloStore, conf, context.getUser(), operation);
addRanges(accumuloStore, conf, operation);
final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
ElementInputFormat.class,
Element.class,
NullWritable.class);
return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
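In Example 1, the mapper FirstElement extracts the Element from each Tuple2<Element, NullWritable> produced by newAPIHadoopRDD. Gaffer's actual FirstElement class is not reproduced here; a hypothetical equivalent, using the same Element type as the example above (import omitted), would look roughly like this:

import org.apache.hadoop.io.NullWritable;
import scala.Tuple2;
import scala.runtime.AbstractFunction1;

// Hypothetical sketch (not Gaffer's actual FirstElement): return the first
// component of each (Element, NullWritable) pair emitted by ElementInputFormat.
public class FirstElementSketch extends AbstractFunction1<Tuple2<Element, NullWritable>, Element>
        implements java.io.Serializable {
    @Override
    public Element apply(final Tuple2<Element, NullWritable> pair) {
        return pair._1();
    }
}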
Example 2: buildScan
import org.apache.spark.rdd.RDD; // import the package/class required by this method
/**
* Creates a {@code DataFrame} of all {@link Element}s from the specified groups.
*
* @return An {@link RDD} of {@link Row}s containing {@link Element}s whose group is in {@code groups}.
*/
@Override
public RDD<Row> buildScan() {
try {
LOGGER.info("Building GetRDDOfAllElements with view set to groups {}", StringUtils.join(groups, ','));
final GetRDDOfAllElements operation = new GetRDDOfAllElements();
operation.setView(view);
operation.setOptions(options);
final RDD<Element> rdd = store.execute(operation, context);
return rdd.map(new ConvertElementToRow(usedProperties, propertyNeedsConversion, converterByProperty),
ClassTagConstants.ROW_CLASS_TAG);
} catch (final OperationException e) {
LOGGER.error("OperationException while executing operation: {}", e);
return null;
}
}
Example 3: doOperationUsingElementInputFormat
import org.apache.spark.rdd.RDD; // import the package/class required by this method
private RDD<Element> doOperationUsingElementInputFormat(final GetRDDOfAllElements operation,
final Context context,
final AccumuloStore accumuloStore)
throws OperationException {
final Configuration conf = getConfiguration(operation);
addIterators(accumuloStore, conf, context.getUser(), operation);
final String useBatchScannerRDD = operation.getOption(USE_BATCH_SCANNER_RDD);
if (Boolean.parseBoolean(useBatchScannerRDD)) {
InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
}
final RDD<Tuple2<Element, NullWritable>> pairRDD = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext().newAPIHadoopRDD(conf,
ElementInputFormat.class,
Element.class,
NullWritable.class);
return pairRDD.map(new FirstElement(), ELEMENT_CLASS_TAG);
}
Example 4: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
PODistinct poDistinct) throws IOException {
SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
RDD<Tuple> rdd = predecessors.get(0);
ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
.<Tuple, Object> getTuple2Manifest();
RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION,
tuple2ClassManifest);
PairRDDFunctions<Tuple, Object> pairRDDFunctions
= new PairRDDFunctions<Tuple, Object>(
rddPairs, SparkUtil.getManifest(Tuple.class),
SparkUtil.getManifest(Object.class), null);
int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
.map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
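Example 4 implements DISTINCT by pairing every tuple with a dummy value, reducing by key, and then mapping back to just the value. Stripped of the Pig-specific operators, the same pattern can be sketched with the Java-friendly Spark API; every name below is illustrative rather than taken from the Pig code above.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

// Illustrative only: the same "pair up, reduce by key, drop the payload" idea,
// using JavaRDD/JavaPairRDD instead of the raw Scala RDD and PairRDDFunctions.
static JavaRDD<String> distinctViaReduceByKey(final JavaRDD<String> input, final int parallelism) {
    JavaPairRDD<String, Boolean> paired =
            input.mapToPair(value -> new Tuple2<>(value, Boolean.TRUE)); // key = value, dummy payload
    JavaPairRDD<String, Boolean> reduced =
            paired.reduceByKey((a, b) -> a, parallelism);                // keep one entry per key
    return reduced.keys();                                               // discard the dummy payload
}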
Example 5: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad)
throws IOException {
// if (predecessors.size()!=0) {
// throw new
// RuntimeException("Should not have predecessors for Load. Got : "+predecessors);
// }
JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
configureLoader(physicalPlan, poLoad, loadJobConf);
// don't know why but just doing this cast for now
RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
Text.class, Tuple.class, loadJobConf);
registerUdfFiles();
// map to get just RDD<Tuple>
return hadoopRDD.map(TO_TUPLE_FUNCTION,
SparkUtil.getManifest(Tuple.class));
}
Example 6: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
throws IOException {
SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
RDD<Tuple> rdd = predecessors.get(0);
RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
SparkUtil.<Tuple, Object> getTuple2Manifest());
JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
SparkUtil.getManifest(Tuple.class),
SparkUtil.getManifest(Object.class));
JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
sortOperator.new SortComparator(), true);
JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);
return mapped.rdd();
}
Example 7: doCql3SaveToCassandra
import org.apache.spark.rdd.RDD; // import the package/class required by this method
public static <W> void doCql3SaveToCassandra(RDD<W> rdd, ICassandraDeepJobConfig<W> writeConfig,
Function1<W, Tuple2<Cells, Cells>> transformer) {
if (!writeConfig.getIsWriteConfig()) {
throw new IllegalArgumentException("Provided configuration object is not suitable for writing");
}
Tuple2<Map<String, ByteBuffer>, Map<String, ByteBuffer>> tuple = new Tuple2<>(null, null);
RDD<Tuple2<Cells, Cells>> mappedRDD = rdd.map(transformer,
ClassTag$.MODULE$.<Tuple2<Cells, Cells>>apply(tuple.getClass()));
((CassandraDeepJobConfig) writeConfig).createOutputTableIfNeeded(mappedRDD.first());
final int pageSize = writeConfig.getBatchSize();
int offset = 0;
List<Tuple2<Cells, Cells>> elements = Arrays.asList((Tuple2<Cells, Cells>[]) mappedRDD.collect());
List<Tuple2<Cells, Cells>> split;
do {
split = elements.subList(pageSize * (offset++), Math.min(pageSize * offset, elements.size()));
Batch batch = QueryBuilder.batch();
for (Tuple2<Cells, Cells> t : split) {
Tuple2<String[], Object[]> bindVars = Utils.prepareTuple4CqlDriver(t);
Insert insert = QueryBuilder
.insertInto(quote(writeConfig.getKeyspace()), quote(writeConfig.getTable()))
.values(bindVars._1(), bindVars._2());
batch.add(insert);
}
writeConfig.getSession().execute(batch);
} while (!split.isEmpty() && split.size() == pageSize);
}
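The do/while loop in Example 7 pages through the collected tuples in batches of pageSize via List.subList. A standalone sketch of that slicing logic, with no Cassandra involved and illustrative names only, makes the boundary handling easier to follow:

import java.util.List;
import java.util.function.Consumer;

// Illustrative pagination over an in-memory list, mirroring the offset/pageSize
// arithmetic above: each pass takes at most pageSize elements, and the loop
// stops as soon as a short (or empty) page is produced.
static <T> void forEachPage(final List<T> elements, final int pageSize, final Consumer<List<T>> action) {
    int offset = 0;
    List<T> page;
    do {
        page = elements.subList(pageSize * (offset++), Math.min(pageSize * offset, elements.size()));
        if (!page.isEmpty()) {
            action.accept(page);
        }
    } while (page.size() == pageSize);
}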
Example 8: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
POLocalRearrange physicalOperator) throws IOException {
SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
RDD<Tuple> rdd = predecessors.get(0);
// call local rearrange to get key and value
return rdd.map(new LocalRearrangeFunction(physicalOperator),
SparkUtil.getManifest(Tuple.class));
}
Example 9: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
POPackage physicalOperator) throws IOException {
SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
RDD<Tuple> rdd = predecessors.get(0);
// package will generate the group from the result of the local
// rearrange
return rdd.map(new PackageFunction(physicalOperator, this.confBytes),
SparkUtil.getManifest(Tuple.class));
}
Example 10: convert
import org.apache.spark.rdd.RDD; // import the package/class required by this method
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
POSkewedJoin poSkewedJoin) throws IOException {
SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
LRs = new POLocalRearrange[2];
this.poSkewedJoin = poSkewedJoin;
createJoinPlans(poSkewedJoin.getJoinPlans());
// extract the two RDDs
RDD<Tuple> rdd1 = predecessors.get(0);
RDD<Tuple> rdd2 = predecessors.get(1);
// make (key, value) pairs, key has type Object, value has type Tuple
RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());
// the join operation is provided by the JavaPairRDD class
JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
rdd1Pair, SparkUtil.getManifest(Object.class),
SparkUtil.getManifest(Tuple.class));
JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
rdd2Pair, SparkUtil.getManifest(Object.class),
SparkUtil.getManifest(Tuple.class));
// do the join
JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
.join(rdd2Pair_javaRDD);
// map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
// ignoring the key (of type Object) and appending the values (the
// Tuples)
JavaRDD<Tuple> result = result_KeyValue
.mapPartitions(new ToValueFunction());
// return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
return result.rdd();
}
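Example 10's key steps, wrapping the two keyed RDDs as JavaPairRDDs, joining them, and then discarding the keys, can be sketched generically with the Java API. The method below is illustrative and not part of the Pig converter shown above.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

// Illustrative only: join two keyed RDDs and keep just the paired values,
// mirroring the "join, then drop the key" shape of the skewed-join converter.
static <K, A, B> JavaRDD<Tuple2<A, B>> joinAndDropKeys(final JavaPairRDD<K, A> left,
                                                       final JavaPairRDD<K, B> right) {
    return left.join(right) // JavaPairRDD<K, Tuple2<A, B>>
               .values();   // JavaRDD<Tuple2<A, B>>
}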