This page collects typical usage examples of the org.apache.spark.rdd.RDD class in Java. If you are wondering what the Java RDD class does, how to use it, or want examples of it in practice, the curated class code examples below may help.
The RDD class belongs to the org.apache.spark.rdd package. A total of 15 code examples of the RDD class are shown below, sorted by popularity by default.
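Most of the examples that follow move between Spark's Scala-side RDD and its Java wrappers. As a minimal, self-contained sketch (the class and variable names here are illustrative, not taken from the examples below), converting between JavaRDD and RDD looks roughly like this:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

public class RddInteropSketch {
    public static void main(String[] args) {
        // Local context purely for illustration.
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "rdd-interop-sketch");

        JavaRDD<String> javaRdd = jsc.parallelize(Arrays.asList("a", "b", "c"));

        // The Java API wraps a Scala RDD; rdd() unwraps it ...
        RDD<String> scalaRdd = javaRdd.rdd();

        // ... and JavaRDD.fromRDD wraps it again (a ClassTag is required from Java).
        ClassTag<String> tag = ClassTag$.MODULE$.apply(String.class);
        JavaRDD<String> wrappedAgain = JavaRDD.fromRDD(scalaRdd, tag);

        System.out.println(wrappedAgain.count()); // 3

        jsc.stop();
    }
}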
Example 1: rmse
import org.apache.spark.rdd.RDD; // import the required package/class
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
    JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
        testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object,Object>> testUserProducts =
        (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
    JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
    double mse = predictions.mapToPair(
        rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
    ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
        double diff = valuePrediction._1() - valuePrediction._2();
        return diff * diff;
    }).mean();
    return Math.sqrt(mse);
}
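A hedged usage sketch of the helper above (the variables trainRatings and testRatings, and the ALS hyperparameter values, are assumptions, not part of the original example): the method would typically be called after training an ALS model on a held-out split.

// Sketch only: assumes trainRatings/testRatings are JavaRDD<Rating> splits prepared elsewhere.
MatrixFactorizationModel model =
    ALS.train(trainRatings.rdd(), /* rank */ 10, /* iterations */ 10, /* lambda */ 0.01);
double error = rmse(model, testRatings);
System.out.println("RMSE on held-out data: " + error);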
Example 2: execute
import org.apache.spark.rdd.RDD; // import the required package/class
@Override
public void execute() {
    if (op.getSubOp() != null) {
        RDD<SolutionMapping> oldResult = IntermediateResultsModel
                .getInstance().getResultRDD(op.getSubOp().hashCode());
        Set<String> vars = IntermediateResultsModel.getInstance()
                .getResultVariables(op.getSubOp().hashCode());
        RDD<SolutionMapping> result = SparkFacade.distinct(oldResult);
        IntermediateResultsModel.getInstance().removeResult(
                op.getSubOp().hashCode());
        IntermediateResultsModel.getInstance().putResult(op.hashCode(),
                result, vars);
    }
}
Example 3: doOperation
import org.apache.spark.rdd.RDD; // import the required package/class
private RDD<Element> doOperation(final GetRDDOfElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore)
        throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);
    // Use the batch scan option when performing a seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class,
            Element.class,
            NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
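Several of these examples pass a ClassTag constant (ClassTagConstants.ELEMENT_CLASS_TAG, ROW_CLASS_TAG, TUPLE2_CLASS_TAG) to RDD.map or flatMap, because the Scala RDD API needs a ClassTag for the result element type when called from Java. A minimal sketch of how such a constant might be defined (the body shown here is an assumption, not the project's actual source):

import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

public final class ClassTagConstants {
    // ClassTag$.MODULE$.apply(...) is the standard way to build a ClassTag from Java.
    public static final ClassTag<Element> ELEMENT_CLASS_TAG =
            ClassTag$.MODULE$.apply(Element.class);

    private ClassTagConstants() {
    }
}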
Example 4: execute
import org.apache.spark.rdd.RDD; // import the required package/class
@Override
public void execute() {
    long limit = op.getLength();
    long offset = op.getStart();
    RDD<SolutionMapping> result = null;
    if (limit > 0 && offset > 0) {
        // Limit and offset
        result = SparkFacade.limitOffset(IntermediateResultsModel
                .getInstance().getResultRDD(op.getSubOp().hashCode()),
                (int) limit, (int) offset);
    } else if (limit > 0 && offset < 0) {
        // Limit only
        result = SparkFacade.limit(IntermediateResultsModel.getInstance()
                .getResultRDD(op.getSubOp().hashCode()), (int) limit);
    } else if (limit < 0 && offset > 0) {
        throw new UnsupportedOperationException(
                "Offset only is not supported yet");
    }
    Set<String> resultVars = IntermediateResultsModel.getInstance()
            .getResultVariables(op.getSubOp().hashCode());
    IntermediateResultsModel.getInstance().removeResult(
            op.getSubOp().hashCode());
    IntermediateResultsModel.getInstance().putResult(op.hashCode(), result,
            resultVars);
}
Example 5: execute
import org.apache.spark.rdd.RDD; // import the required package/class
@Override
public void execute() {
    if (op.getSubOp() != null) {
        RDD<SolutionMapping> result = IntermediateResultsModel
                .getInstance().getResultRDD(op.getSubOp().hashCode());
        RDD<SolutionMapping> filteredRes = SparkFacade.filter(result,
                this.expressions);
        IntermediateResultsModel.getInstance().putResult(
                op.hashCode(),
                filteredRes,
                IntermediateResultsModel.getInstance().getResultVariables(
                        op.getSubOp().hashCode()));
        IntermediateResultsModel.getInstance().removeResult(
                op.getSubOp().hashCode());
    }
}
Example 6: buildScan
import org.apache.spark.rdd.RDD; // import the required package/class
@Override
public RDD<Row> buildScan() {
    log.debug("-> buildScan()");
    schema();
    // I have isolated the work to a method to keep the plumbing code as simple as possible.
    List<PhotoMetadata> table = collectData();
    @SuppressWarnings("resource")
    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
            .map(photo -> SparkBeanUtils.getRowFromBean(schema, photo));
    return rowRDD.rdd();
}
Example 7: readAndConvertFeatureRDD
import org.apache.spark.rdd.RDD; // import the required package/class
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
        JavaPairRDD<String,float[]> javaRDD,
        Broadcast<Map<String,Integer>> bIdToIndex) {
    RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
        new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
    ).mapValues(f -> {
        double[] d = new double[f.length];
        for (int i = 0; i < d.length; i++) {
            d[i] = f[i];
        }
        return d;
    }).rdd();
    // This mimics the persistence level established by the ALS training methods
    scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
    return objKeyRDD;
}
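The cast through RDD<?> is needed because, from Java, MatrixFactorizationModel expects RDD<Tuple2<Object,double[]>> (Scala's Int keys erase to Object). A hedged sketch of how the converted RDDs might then be assembled into a model (the rank value and the two input variables are assumptions, not part of the example above):

// Sketch: userFeatures / itemFeatures are JavaPairRDD<String,float[]> read elsewhere,
// and bUserIdToIndex / bItemIdToIndex are the corresponding broadcast ID-to-index maps.
MatrixFactorizationModel model = new MatrixFactorizationModel(
        10, // rank, illustrative only
        readAndConvertFeatureRDD(userFeatures, bUserIdToIndex),
        readAndConvertFeatureRDD(itemFeatures, bItemIdToIndex));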
Example 8: buildScan
import org.apache.spark.rdd.RDD; // import the required package/class
@Override
public RDD<Row> buildScan() {
    SPARQLRepository repository = new SPARQLRepository(serviceField);
    try {
        repository.initialize();
        List<Row> rows = tupleQueryModifiedToWorkWithVirtuoso(repository, queryField.getSourceString(), tuple -> {
            List<Row> result = new ArrayList<>();
            while (tuple.hasNext()) {
                result.add(convertTupleResultToRow(tuple.getBindingNames(), tuple.next(), this.schemaField));
            }
            return result;
        });
        // The unmapped Spark context must be closed by the owner of this
        // class when it is no longer needed; it is created here only to
        // allow the Scala code to be used from Java.
        @SuppressWarnings("resource")
        JavaSparkContext sc = new JavaSparkContext(sqlContext().sparkContext());
        return sc.parallelize(rows).rdd();
    } finally {
        repository.shutDown();
    }
}
Example 9: buildScan
import org.apache.spark.rdd.RDD; // import the required package/class
/**
 * Creates a {@code DataFrame} of all {@link Element}s from the specified groups.
 *
 * @return An {@link RDD} of {@link Row}s containing {@link Element}s whose group is in {@code groups}.
 */
@Override
public RDD<Row> buildScan() {
    try {
        LOGGER.info("Building GetRDDOfAllElements with view set to groups {}", StringUtils.join(groups, ','));
        final GetRDDOfAllElements operation = new GetRDDOfAllElements();
        operation.setView(view);
        operation.setOptions(options);
        final RDD<Element> rdd = store.execute(operation, context);
        return rdd.map(new ConvertElementToRow(usedProperties, propertyNeedsConversion, converterByProperty),
                ClassTagConstants.ROW_CLASS_TAG);
    } catch (final OperationException e) {
        LOGGER.error("OperationException while executing operation: {}", e);
        return null;
    }
}
Example 10: doOperation
import org.apache.spark.rdd.RDD; // import the required package/class
private RDD<Element> doOperation(final GetRDDOfAllElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore)
        throws OperationException {
    SparkSession sparkSession = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties());
    if (sparkSession == null) {
        throw new OperationException("This operation requires an active SparkSession.");
    }
    sparkSession.sparkContext().hadoopConfiguration().addResource(getConfiguration(operation));
    final String useRFileReaderRDD = operation.getOption(USE_RFILE_READER_RDD);
    if (Boolean.parseBoolean(useRFileReaderRDD)) {
        return doOperationUsingRFileReaderRDD(operation, context, accumuloStore);
    } else {
        return doOperationUsingElementInputFormat(operation, context, accumuloStore);
    }
}
Example 11: doOperationUsingElementInputFormat
import org.apache.spark.rdd.RDD; // import the required package/class
private RDD<Element> doOperationUsingElementInputFormat(final GetRDDOfAllElements operation,
                                                        final Context context,
                                                        final AccumuloStore accumuloStore)
        throws OperationException {
    final Configuration conf = getConfiguration(operation);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    final String useBatchScannerRDD = operation.getOption(USE_BATCH_SCANNER_RDD);
    if (Boolean.parseBoolean(useBatchScannerRDD)) {
        InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    }
    final RDD<Tuple2<Element, NullWritable>> pairRDD = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext().newAPIHadoopRDD(conf,
            ElementInputFormat.class,
            Element.class,
            NullWritable.class);
    return pairRDD.map(new FirstElement(), ELEMENT_CLASS_TAG);
}
Example 12: doOperation
import org.apache.spark.rdd.RDD; // import the required package/class
public void doOperation(final ImportRDDOfElements operation, final Context context, final AccumuloStore store) throws OperationException {
    final String outputPath = operation.getOption(OUTPUT_PATH);
    if (null == outputPath || outputPath.isEmpty()) {
        throw new OperationException("Option outputPath must be set for this option to be run against the accumulostore");
    }
    final String failurePath = operation.getOption(FAILURE_PATH);
    if (null == failurePath || failurePath.isEmpty()) {
        throw new OperationException("Option failurePath must be set for this option to be run against the accumulostore");
    }
    final ElementConverterFunction func = new ElementConverterFunction(SparkContextUtil.getSparkSession(context, store.getProperties()).sparkContext().broadcast(store.getKeyPackage().getKeyConverter(), ACCUMULO_ELEMENT_CONVERTER_CLASS_TAG));
    final RDD<Tuple2<Key, Value>> rdd = operation.getInput().flatMap(func, TUPLE2_CLASS_TAG);
    final ImportKeyValuePairRDDToAccumulo op =
            new ImportKeyValuePairRDDToAccumulo.Builder()
                    .input(rdd)
                    .failurePath(failurePath)
                    .outputPath(outputPath)
                    .build();
    store.execute(new OperationChain<>(op), context);
}
Example 13: checkHadoopConfIsPassedThrough
import org.apache.spark.rdd.RDD; // import the required package/class
@Test
public void checkHadoopConfIsPassedThrough() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .config(new GraphConfig.Builder()
                    .graphId("graphId")
                    .build())
            .addSchema(getClass().getResourceAsStream("/schema/elements.json"))
            .addSchema(getClass().getResourceAsStream("/schema/types.json"))
            .addSchema(getClass().getResourceAsStream("/schema/serialisation.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final User user = new User();
    final Configuration conf = new Configuration();
    conf.set("AN_OPTION", "A_VALUE");
    final String encodedConf = AbstractGetRDDHandler.convertConfigurationToString(conf);
    final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
            .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, encodedConf)
            .build();
    final RDD<Element> rdd = graph1.execute(rddQuery, user);
    assertEquals(encodedConf, rddQuery.getOption(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY));
    assertEquals("A_VALUE", rdd.sparkContext().hadoopConfiguration().get("AN_OPTION"));
}
Example 14: testGetAllElementsInRDD
import org.apache.spark.rdd.RDD; // import the required package/class
private void testGetAllElementsInRDD(final Graph graph, final GetRDDOfAllElements getRDD) throws OperationException,
        IOException, InterruptedException, AccumuloSecurityException, AccumuloException {
    final Set<Element> expectedElements = new HashSet<>(getElements());
    final RDD<Element> rdd = graph.execute(getRDD, USER);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>();
    final Element[] returnedElements = (Element[]) rdd.collect();
    // Check that the number of elements returned is correct, to ensure edges
    // aren't returned twice
    assertEquals(30, returnedElements.length);
    for (int i = 0; i < returnedElements.length; i++) {
        results.add(returnedElements[i]);
    }
    assertEquals(expectedElements, results);
}
Example 15: testGetAllElementsInRDDWithIngestAggregationApplied
import org.apache.spark.rdd.RDD; // import the required package/class
private void testGetAllElementsInRDDWithIngestAggregationApplied(final Graph graph, final GetRDDOfAllElements getRDD)
        throws OperationException {
    final RDD<Element> rdd = graph.execute(getRDD, USER);
    if (rdd == null) {
        fail("No RDD returned");
    }
    // Should get aggregated data
    final Element[] returnedElements = (Element[]) rdd.collect();
    assertEquals(1, returnedElements.length);
    final Entity entity1 = new Entity.Builder()
            .group(TestGroups.ENTITY)
            .vertex("A")
            .property("count", 2)
            .build();
    assertEquals(entity1, returnedElements[0]);
}