This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.broadcast. If you are unsure what JavaSparkContext.broadcast does, how to call it, or where to find working examples, the curated code samples below may help; you can also read further about the enclosing class, org.apache.spark.api.java.JavaSparkContext.
The listing below contains 15 code examples of JavaSparkContext.broadcast, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
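Before the collected examples, here is a minimal, self-contained sketch of the basic pattern (the app name and lookup data are invented for illustration and are not taken from any example below): the driver broadcasts a small read-only map once with jsc.broadcast, and each task reads it through value() instead of capturing its own serialized copy in every closure.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("BroadcastSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // A small read-only lookup table, built on the driver.
        Map<String, String> countryByCode = new HashMap<>();
        countryByCode.put("IN", "India");
        countryByCode.put("UK", "United Kingdom");

        // Ship the table to the executors once.
        Broadcast<Map<String, String>> countriesBV = jsc.broadcast(countryByCode);

        JavaRDD<String> codes = jsc.parallelize(Arrays.asList("IN", "UK", "IN"));
        // Each task reads the broadcast value instead of a captured copy.
        JavaRDD<String> names = codes.map(code -> countriesBV.value().getOrDefault(code, "unknown"));

        System.out.println(names.collect());
        jsc.close();
    }
}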
Example 1: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 *
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    // Ship the in-neighbor map to the executors once, instead of serializing it with every task.
    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
Example 2: run2
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 *
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    // Ship the in-neighbor map to the executors once as a read-only broadcast variable.
    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates = getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
Example 3: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 *
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    // Ship the in-neighbor map to the executors once, instead of serializing it with every task.
    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsOld(neighborSims, K);
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
Example 4: GetD_COORD
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static CoordinateMatrix GetD_COORD(CoordinateMatrix A, boolean inverseValues, JavaSparkContext jsc) {
    JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();
    // Broadcast the flag so every partition can read it without re-serializing the closure.
    final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
    JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
        @Override
        public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
            List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();
            boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();
            while (matrixEntryIterator.hasNext()) {
                MatrixEntry currentEntry = matrixEntryIterator.next();
                if (currentEntry.i() == currentEntry.j()) {
                    if (inverseValuesValue) {
                        newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 1.0 / currentEntry.value()));
                    } else {
                        newLowerEntries.add(currentEntry);
                    }
                } else {
                    newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                }
            }
            return newLowerEntries.iterator();
        }
    });
    CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());
    return newMatrix;
}
Example 5: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
    JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"),
                    new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"),
                    new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"),
                    new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103")));
    JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"),
                    new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA")));
    // Broadcast the small lookup table so the join is done map-side, without a shuffle.
    Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap());
    JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map(
            v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2())));
    System.out.println(joined.collect());
}
Author: PacktPublishing | Project: Apache-Spark-2x-for-Java-Developers | Lines: 26 | Source: MapSideJoinBroadcast.java
Example 6: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();
    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();
    JavaSparkContext sc = SaprkUtils.getSparkContext();
    // Broadcast the collected mutation rows so they are shipped to each executor only once.
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);
    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2 != null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
Example 7: pushUdf
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {
    JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());
    // Broadcast the value sets once so the registered UDF can read them on the executors.
    Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);
    spark.udf()
        .register("in_valueset",
            new InValuesetUdf(broadcast),
            DataTypes.BooleanType);
    // Push the broadcast variable
    valueSetStack.push(broadcast);
}
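Once pushUdf has registered the function, it can be referenced from Spark SQL. The following is only a hedged usage sketch: the temp view name observations, the coding column code, and the value-set reference string are assumptions for illustration and are not taken from the example above.

// Hypothetical call site, assuming a temp view "observations" with a coding
// column "code" and a value-set reference that exists in the broadcast
// BroadcastableValueSets.
spark.sql("SELECT * FROM observations WHERE in_valueset(code, 'hypertension')").show();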
Example 8: GetD_IRW
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static IndexedRowMatrix GetD_IRW(IndexedRowMatrix A, boolean inverseValues, JavaSparkContext jsc) {
    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD().cache();
    // Broadcast the flag so every row mapper can read it without re-serializing the closure.
    final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
    JavaRDD<IndexedRow> LURows = rows.map(new Function<IndexedRow, IndexedRow>() {
        @Override
        public IndexedRow call(IndexedRow indexedRow) throws Exception {
            long index = indexedRow.index();
            DenseVector vect = indexedRow.vector().toDense();
            boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();
            double newValues[] = new double[vect.size()];
            for (int i = 0; i < vect.size(); i++) {
                if (i == index) {
                    if (inverseValuesValue) {
                        newValues[i] = 1.0 / vect.apply(i);
                    } else {
                        newValues[i] = vect.apply(i);
                    }
                } else {
                    newValues[i] = 0.0;
                }
            }
            DenseVector newVector = new DenseVector(newValues);
            return new IndexedRow(index, newVector);
        }
    });
    IndexedRowMatrix newMatrix = new IndexedRowMatrix(LURows.rdd());
    return newMatrix;
}
Example 9: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();
    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }
    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();
    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }
    // Create the Spark context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Broadcast the S3 client factory so every executor reuses the same shared, cached factory.
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());
    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();
    // Generate new manifest files including the new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);
    sc.close();
}
Author: awslabs | Project: s3-inventory-usage-examples | Lines: 54 | Source: ReducedRedundancyLocatorExampleMain.java
Example 10: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Returns a map of topN neighbors per entity (reversed to point to the in-neighbors (values) that have the key entity as their top out-neighbor).
 * @param rawTriples the rdf triples of the entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples file
 * @param entityIdsRDD the mapping of entity urls to entity ids, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold below which relations are discarded from the top relations
 * @param N topN neighbors per entity
 * @param positiveIds true if the entity ids of this collection are positive (first collection), false otherwise
 * @param jsc the java spark context used to load files and broadcast variables
 * @return a map of topN in-neighbors per entity
 */
public Map<Integer, IntArrayList> run(JavaRDD<String> rawTriples, String SEPARATOR, JavaRDD<String> entityIdsRDD, float MIN_SUPPORT_THRESHOLD, int N, boolean positiveIds, JavaSparkContext jsc) {
    //rawTriples.persist(StorageLevel.MEMORY_AND_DISK_SER());
    //List<String> subjects = Utils.getEntityUrlsFromEntityRDDInOrder(rawTriples, SEPARATOR); //a list of (distinct) subject URLs, keeping insertion order (from original triples file)
    //Object2IntOpenHashMap<String> subjects = Utils.getEntityIdsMapping(rawTriples, SEPARATOR);
    Object2IntOpenHashMap<String> entityIds = Utils.readEntityIdsMapping(entityIdsRDD, positiveIds);
    System.out.println("Found " + entityIds.size() + " entities in collection " + (positiveIds ? "1" : "2"));
    long numEntitiesSquared = (long) entityIds.keySet().size();
    numEntitiesSquared *= numEntitiesSquared;
    // Broadcast the entity-id mapping so the relation index can be built without shipping it per task.
    Broadcast<Object2IntOpenHashMap<String>> entityIds_BV = jsc.broadcast(entityIds);
    JavaPairRDD<String, List<Tuple2<Integer, Integer>>> relationIndex = getRelationIndex(rawTriples, SEPARATOR, entityIds_BV); //a list of (s,o) for each predicate
    //rawTriples.unpersist();
    relationIndex.persist(StorageLevel.MEMORY_AND_DISK_SER());
    List<String> relationsRank = getRelationsRank(relationIndex, MIN_SUPPORT_THRESHOLD, numEntitiesSquared);
    System.out.println("Top-5 relations in collection " + (positiveIds ? "1: " : "2: ") + Arrays.toString(relationsRank.subList(0, Math.min(5, relationsRank.size())).toArray()));
    JavaPairRDD<Integer, IntArrayList> topOutNeighbors = getTopOutNeighborsPerEntity(relationIndex, relationsRank, N, positiveIds); //action
    relationIndex.unpersist();
    //reverse the outNeighbors, to get in-neighbors
    Map<Integer, IntArrayList> inNeighbors =
        topOutNeighbors.flatMapToPair(x -> { //reverse the neighbor pairs from (in,[out1,out2,out3]) to (out1,in), (out2,in), (out3,in)
            List<Tuple2<Integer, Integer>> inNeighbs = new ArrayList<>();
            for (int outNeighbor : x._2()) {
                inNeighbs.add(new Tuple2<>(outNeighbor, x._1()));
            }
            return inNeighbs.iterator();
        })
        .aggregateByKey(new IntOpenHashSet(),
            (x, y) -> {x.add(y); return x;},
            (x, y) -> {x.addAll(y); return x;})
        .mapValues(x -> new IntArrayList(x))
        .collectAsMap();
    return inNeighbors;
}
Example 11: broadcast
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Broadcast mappings stored in the given conceptMaps instance that match the given
 * conceptMapUris.
 *
 * @param conceptMaps the {@link ConceptMaps} instance with the content to broadcast
 * @param conceptMapUriToVersion map of the concept map URIs to broadcast to their versions.
 * @return a broadcast variable containing a mappings object usable in UDFs.
 */
public static Broadcast<BroadcastableMappings> broadcast(ConceptMaps conceptMaps,
        Map<String, String> conceptMapUriToVersion) {
    Map<String, ConceptMap> mapsToLoad = conceptMaps.getMaps()
        .collectAsList()
        .stream()
        .filter(conceptMap ->
            conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
        .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));
    // Expand the concept maps to load and sort them so dependencies are before
    // their dependents in the list.
    List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);
    // Since this is used to map from one system to another, we use only targets
    // that don't introduce inaccurate meanings. (For instance, we can't map
    // general condition code to a more specific type, since that is not
    // representative of the source data.)
    Dataset<Mapping> mappings = conceptMaps.getMappings(conceptMapUriToVersion)
        .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");
    // Group mappings by their concept map URI
    Map<String, List<Mapping>> groupedMappings = mappings
        .collectAsList()
        .stream()
        .collect(Collectors.groupingBy(Mapping::getConceptMapUri));
    Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();
    for (String conceptMapUri : sortedMapsToLoad) {
        ConceptMap map = mapsToLoad.get(conceptMapUri);
        Set<String> children = getMapChildren(map);
        List<BroadcastableConceptMap> childMaps = children.stream()
            .map(child -> broadcastableMaps.get(child))
            .collect(Collectors.toList());
        BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
            groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()),
            childMaps);
        broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
    }
    JavaSparkContext ctx = new JavaSparkContext(conceptMaps.getMaps()
        .sparkSession()
        .sparkContext());
    // Broadcast the assembled mappings so UDFs can look them up on the executors.
    return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
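For context, a hedged sketch of how Example 11 might be invoked; the variable names and the concept-map URI/version pair are illustrative assumptions, not taken from the example above.

// Hypothetical call site; assumes a ConceptMaps instance is already available
// and contains the URI/version pair below.
Map<String, String> versions = new HashMap<>();
versions.put("urn:example:conceptmap:conditions", "1");
Broadcast<BroadcastableMappings> mappingsBV = broadcast(conceptMaps, versions);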
Example 12: buildModel
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int features = (Integer) hyperParameters.get(0);
    double lambda = (Double) hyperParameters.get(1);
    double alpha = (Double) hyperParameters.get(2);
    double epsilon = Double.NaN;
    if (logStrength) {
        epsilon = (Double) hyperParameters.get(3);
    }
    Preconditions.checkArgument(features > 0);
    Preconditions.checkArgument(lambda >= 0.0);
    Preconditions.checkArgument(alpha > 0.0);
    if (logStrength) {
        Preconditions.checkArgument(epsilon > 0.0);
    }
    JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
    parsedRDD.cache();
    Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
    Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);
    log.info("Broadcasting ID-index mappings for {} users, {} items",
             userIDIndexMap.size(), itemIDIndexMap.size());
    // Broadcast the ID-to-index dictionaries once; the rating parser reads them on the executors.
    Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
    Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);
    JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
    trainRatingData = aggregateScores(trainRatingData, epsilon);
    ALS als = new ALS()
        .setRank(features)
        .setIterations(iterations)
        .setLambda(lambda)
        .setCheckpointInterval(5);
    if (implicit) {
        als = als.setImplicitPrefs(true).setAlpha(alpha);
    }
    RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
    trainingRatingDataRDD.cache();
    MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
    trainingRatingDataRDD.unpersist(false);
    // Release the broadcast dictionaries once training no longer needs them.
    bUserIDToIndex.unpersist();
    bItemIDToIndex.unpersist();
    parsedRDD.unpersist();
    Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
    Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));
    PMML pmml = mfModelToPMML(model,
                              features,
                              lambda,
                              alpha,
                              epsilon,
                              implicit,
                              logStrength,
                              candidatePath,
                              bUserIndexToID,
                              bItemIndexToID);
    unpersist(model);
    bUserIndexToID.unpersist();
    bItemIndexToID.unpersist();
    return pmml;
}
Example 13: evaluate
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
    JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
    parsedTestRDD.cache();
    Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
    Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);
    log.info("Broadcasting ID-index mappings for {} users, {} items",
             userIDToIndex.size(), itemIDToIndex.size());
    // Broadcast the ID-to-index dictionaries used to translate the test data into Ratings.
    Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
    Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);
    JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
    double epsilon = Double.NaN;
    if (logStrength) {
        epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
    }
    testRatingData = aggregateScores(testRatingData, epsilon);
    MatrixFactorizationModel mfModel =
        pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);
    parsedTestRDD.unpersist();
    double eval;
    if (implicit) {
        double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
        log.info("AUC: {}", auc);
        eval = auc;
    } else {
        double rmse = Evaluation.rmse(mfModel, testRatingData);
        log.info("RMSE: {}", rmse);
        eval = -rmse;
    }
    unpersist(mfModel);
    bUserIDToIndex.unpersist();
    bItemIDToIndex.unpersist();
    return eval;
}
Example 14: SCAL_IRW
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static IndexedRowMatrix SCAL_IRW(double alpha, IndexedRowMatrix A, IndexedRowMatrix B, JavaSparkContext jsc) {
    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD();
    // Broadcast the scalar so every row mapper reads the same value without capturing it in the closure.
    final Broadcast<Double> alphaBC = jsc.broadcast(alpha);
    JavaRDD<IndexedRow> newRows = rows.map(new Function<IndexedRow, IndexedRow>() {
        @Override
        public IndexedRow call(IndexedRow indexedRow) throws Exception {
            double alphaValue = alphaBC.getValue().doubleValue();
            long index = indexedRow.index();
            double values[] = new double[indexedRow.vector().size()];
            for (int i = 0; i < values.length; i++) {
                values[i] = indexedRow.vector().apply(i) * alphaValue;
            }
            return new IndexedRow(index, new DenseVector(values));
        }
    });
    B = new IndexedRowMatrix(newRows.rdd());
    return B;
}
Example 15: SCAL_BCK
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static BlockMatrix SCAL_BCK(double alpha, BlockMatrix A, BlockMatrix B, JavaSparkContext jsc) {
    JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> blocks = A.blocks().toJavaRDD();
    // Broadcast the scalar once; each block multiplication reads it from the broadcast variable.
    final Broadcast<Double> alphaBC = jsc.broadcast(alpha);
    JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> newBlocks = blocks.map(new Function<Tuple2<Tuple2<Object, Object>, Matrix>, Tuple2<Tuple2<Object, Object>, Matrix>>() {
        @Override
        public Tuple2<Tuple2<Object, Object>, Matrix> call(Tuple2<Tuple2<Object, Object>, Matrix> block) throws Exception {
            double alphaBCRec = alphaBC.getValue().doubleValue();
            Integer row = (Integer) block._1._1; //Integer.parseInt(block._1._1.toString());
            Integer col = (Integer) block._1._2;
            Matrix matrixBlock = block._2;
            for (int i = 0; i < matrixBlock.numRows(); i++) {
                for (int j = 0; j < matrixBlock.numCols(); j++) {
                    matrixBlock.update(i, j, matrixBlock.apply(i, j) * alphaBCRec);
                }
            }
            return new Tuple2<Tuple2<Object, Object>, Matrix>(new Tuple2<Object, Object>(row, col), matrixBlock);
        }
    });
    B = new BlockMatrix(newBlocks.rdd(), A.rowsPerBlock(), A.colsPerBlock());
    return B;
}