

Java JavaSparkContext.broadcast Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.broadcast, drawn from open-source projects. If you are wondering what JavaSparkContext.broadcast does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore other usage examples of org.apache.spark.api.java.JavaSparkContext for further context.

The following 15 code examples of JavaSparkContext.broadcast are sorted by popularity by default.
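Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: collect a small read-only lookup structure on the driver, broadcast it once with JavaSparkContext.broadcast, and read it inside transformations via Broadcast.value(). The local master URL and the sample data are illustrative assumptions, not taken from any of the examples below.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastSketch {
    public static void main(String[] args) {
        // local[*] is an assumption for demonstration; use your cluster's master in practice
        SparkConf conf = new SparkConf().setAppName("BroadcastSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // A small read-only lookup table, shipped to each executor exactly once
        Map<String, String> countryByCode = new HashMap<>();
        countryByCode.put("DE", "Germany");
        countryByCode.put("IN", "India");
        Broadcast<Map<String, String>> countriesBV = jsc.broadcast(countryByCode);

        JavaRDD<String> codes = jsc.parallelize(Arrays.asList("DE", "IN", "DE"));
        // Tasks read the broadcast value instead of capturing the map in every closure
        JavaRDD<String> names = codes.map(code -> countriesBV.value().getOrDefault(code, "unknown"));
        System.out.println(names.collect()); // [Germany, India, Germany]

        countriesBV.unpersist(); // release executor-side copies once they are no longer needed
        jsc.close();
    }
}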

Example 1: run

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Author: vefthym, Project: MinoanER, Lines of code: 35, Source file: CNPNeighborsUnnormalized.java

Example 2: run2

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =  getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Author: vefthym, Project: MinoanER, Lines of code: 33, Source file: CNPARCS.java

Example 3: run

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    
    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);        
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsOld(neighborSims, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Author: vefthym, Project: MinoanER, Lines of code: 36, Source file: CNPNeighbors.java

Example 4: GetD_COORD

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static CoordinateMatrix GetD_COORD(CoordinateMatrix A, boolean inverseValues, JavaSparkContext jsc) {

        JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();

        final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);

        JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
            @Override
            public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
                List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();

                boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();

                while(matrixEntryIterator.hasNext()) {
                    MatrixEntry currentEntry = matrixEntryIterator.next();

                    if(currentEntry.i() == currentEntry.j()) {
                        if(inverseValuesValue) {
                            newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 1.0/currentEntry.value()));
                        }
                        else {
                            newLowerEntries.add(currentEntry);
                        }

                    }
                    else {
                        newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                    }

                }

                return newLowerEntries.iterator();
            }
        });

        CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());

        return newMatrix;
    }
 
Author: jmabuin, Project: BLASpark, Lines of code: 40, Source file: OtherOperations.java

Example 5: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {

		SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());

		JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs(
				Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"),
						new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"),
						new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"),
						new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103")));

		JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs(
				Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"),
						new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA")));

		Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap());

		JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map(
				v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2())));

		System.out.println(joined.collect());

	}
 
Author: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 26, Source file: MapSideJoinBroadcast.java
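What this pattern buys you: collectAsMap plus broadcast turns the join into a map-side lookup, so only the small cityIdToCityName table crosses the network. For contrast, a sketch of the shuffle-based alternative the example avoids (same RDDs as above; the variable name joinedByShuffle is ours):

// Shuffle-based equivalent: re-key the user RDD by cityId, then join;
// both sides are repartitioned by key across the cluster.
JavaPairRDD<String, Tuple2<String, String>> joinedByShuffle = userIdToCityId
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()))  // (cityId, userId)
        .join(cityIdToCityName);                       // (cityId, (userId, cityName))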

Example 6: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main( String[] args )
{
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2!=null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
 
Author: jjgao, Project: mutantpdb, Lines of code: 24, Source file: App.java

Example 7: pushUdf

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {

  JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());

  Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);

  spark.udf()
      .register("in_valueset",
          new InValuesetUdf(broadcast),
          DataTypes.BooleanType);

  // Push the broadcast variable
  valueSetStack.push(broadcast);
}
 
Author: cerner, Project: bunsen, Lines of code: 21, Source file: ValueSetUdfs.java
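Once pushUdf has run, the registered function is callable from Spark SQL. A hedged usage sketch — the "observation" view, its code column, and the value set reference name "hypertension" are illustrative assumptions, not taken from this snippet:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Assumes `spark` is the same SparkSession passed to pushUdf, that an
// "observation" view with a codeable-concept column named "code" has been
// registered, and that `valueSets` holds a set under the reference name
// "hypertension" (all assumptions for illustration).
ValueSetUdfs.pushUdf(spark, valueSets);

Dataset<Row> matching = spark.sql(
    "SELECT * FROM observation WHERE in_valueset(code, 'hypertension')");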

Example 8: GetD_IRW

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static IndexedRowMatrix GetD_IRW(IndexedRowMatrix A, boolean inverseValues, JavaSparkContext jsc) {

        JavaRDD<IndexedRow> rows = A.rows().toJavaRDD().cache();

        final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
        JavaRDD<IndexedRow> LURows = rows.map(new Function<IndexedRow, IndexedRow>() {

            @Override
            public IndexedRow call(IndexedRow indexedRow) throws Exception {
                long index = indexedRow.index();
                DenseVector vect = indexedRow.vector().toDense();

                boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();

                double newValues[] = new double[vect.size()];


                for(int i = 0; i< vect.size(); i++) {

                    if( i == index) {
                        if(inverseValuesValue) {
                            newValues[i] = 1.0/vect.apply(i);
                        }
                        else {
                            newValues[i] = vect.apply(i);
                        }
                    }
                    else {
                        newValues[i] = 0.0;
                    }

                }

                DenseVector newVector = new DenseVector(newValues);

                return new IndexedRow(index, newVector);

            }
        });

        IndexedRowMatrix newMatrix = new IndexedRowMatrix(LURows.rdd());

        return newMatrix;
    }
 
Author: jmabuin, Project: BLASpark, Lines of code: 45, Source file: OtherOperations.java

Example 9: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    //Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // Filter, and write new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
 
Author: awslabs, Project: s3-inventory-usage-examples, Lines of code: 54, Source file: ReducedRedundancyLocatorExampleMain.java

Example 10: run

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Returns a map of the topN neighbors per entity, reversed so that each key entity points to the in-neighbors (values) that have it as a top out-neighbor.
 * @param rawTriples the rdf triples of the entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples file
 * @param entityIdsRDD the mapping of entity urls to entity ids, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param N the N for topN neighbors per entity
 * @param positiveIds true for the first entity collection (positive ids), false for the second
 * @param jsc the java spark context used to load files and broadcast variables
 * @return a map of topN in-neighbors per entity
 */
public Map<Integer,IntArrayList> run(JavaRDD<String> rawTriples, String SEPARATOR, JavaRDD<String> entityIdsRDD, float MIN_SUPPORT_THRESHOLD, int N, boolean positiveIds, JavaSparkContext jsc) {
    //rawTriples.persist(StorageLevel.MEMORY_AND_DISK_SER());        
    
    //List<String> subjects = Utils.getEntityUrlsFromEntityRDDInOrder(rawTriples, SEPARATOR); //a list of (distinct) subject URLs, keeping insertion order (from original triples file)        
    //Object2IntOpenHashMap<String> subjects = Utils.getEntityIdsMapping(rawTriples, SEPARATOR);
    Object2IntOpenHashMap<String> entityIds = Utils.readEntityIdsMapping(entityIdsRDD, positiveIds);
    System.out.println("Found "+entityIds.size()+" entities in collection "+ (positiveIds?"1":"2"));
    
    long numEntitiesSquared = (long)entityIds.keySet().size();
    numEntitiesSquared *= numEntitiesSquared;
    
    Broadcast<Object2IntOpenHashMap<String>> entityIds_BV = jsc.broadcast(entityIds);
     
    JavaPairRDD<String,List<Tuple2<Integer, Integer>>> relationIndex = getRelationIndex(rawTriples, SEPARATOR, entityIds_BV); //a list of (s,o) for each predicate      
    
    //rawTriples.unpersist();        
    relationIndex.persist(StorageLevel.MEMORY_AND_DISK_SER());                
                    
    List<String> relationsRank = getRelationsRank(relationIndex, MIN_SUPPORT_THRESHOLD, numEntitiesSquared);      
    System.out.println("Top-5 relations in collection "+(positiveIds?"1: ":"2: ")+Arrays.toString(relationsRank.subList(0, Math.min(5,relationsRank.size())).toArray()));
    
    JavaPairRDD<Integer, IntArrayList> topOutNeighbors = getTopOutNeighborsPerEntity(relationIndex, relationsRank, N, positiveIds); //action
    
    relationIndex.unpersist(); 
    
    //reverse the outNeighbors, to get in neighbors
    Map<Integer, IntArrayList> inNeighbors =
    topOutNeighbors.flatMapToPair(x -> { //reverse the neighbor pairs from (in,[out1,out2,out3]) to (out1,in), (out2,in), (out3,in)
                List<Tuple2<Integer,Integer>> inNeighbs = new ArrayList<>();
                for (int outNeighbor : x._2()) {
                    inNeighbs.add(new Tuple2<>(outNeighbor, x._1()));
                }
                return inNeighbs.iterator();
            })
            .aggregateByKey(new IntOpenHashSet(), 
                    (x,y) -> {x.add(y); return x;}, 
                    (x,y) -> {x.addAll(y); return x;})
            .mapValues(x-> new IntArrayList(x))
            .collectAsMap();
    
    return inNeighbors;
}
 
Author: vefthym, Project: MinoanER, Lines of code: 54, Source file: RelationsRank.java

Example 11: broadcast

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
/**
 * Broadcast mappings stored in the given conceptMaps instance that match the given
 * conceptMapUris.
 *
 * @param conceptMaps the {@link ConceptMaps} instance with the content to broadcast
 * @param conceptMapUriToVersion map of the concept map URIs to broadcast to their versions.
 * @return a broadcast variable containing a mappings object usable in UDFs.
 */
public static Broadcast<BroadcastableMappings> broadcast(ConceptMaps conceptMaps,
    Map<String,String> conceptMapUriToVersion) {

  Map<String,ConceptMap> mapsToLoad = conceptMaps.getMaps()
      .collectAsList()
      .stream()
      .filter(conceptMap ->
          conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
      .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));

  // Expand the concept maps to load and sort them so dependencies are before
  // their dependents in the list.
  List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);

  // Since this is used to map from one system to another, we use only targets
  // that don't introduce inaccurate meanings. (For instance, we can't map
  // general condition code to a more specific type, since that is not
  // representative of the source data.)
  Dataset<Mapping> mappings = conceptMaps.getMappings(conceptMapUriToVersion)
      .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");

  // Group mappings by their concept map URI
  Map<String, List<Mapping>> groupedMappings =  mappings
      .collectAsList()
      .stream()
      .collect(Collectors.groupingBy(Mapping::getConceptMapUri));

  Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();

  for (String conceptMapUri: sortedMapsToLoad) {

    ConceptMap map = mapsToLoad.get(conceptMapUri);

    Set<String> children = getMapChildren(map);

    List<BroadcastableConceptMap> childMaps = children.stream()
        .map(child -> broadcastableMaps.get(child))
        .collect(Collectors.toList());

    BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
        groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()),
        childMaps);

    broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
  }

  JavaSparkContext ctx = new JavaSparkContext(conceptMaps.getMaps()
      .sparkSession()
      .sparkContext());

  return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
 
Author: cerner, Project: bunsen, Lines of code: 61, Source file: BroadcastableMappings.java

Example 12: buildModel

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
 
Author: oncewang, Project: oryx2, Lines of code: 73, Source file: ALSUpdate.java

Example 13: evaluate

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {

  JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
  parsedTestRDD.cache();

  Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
  Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDToIndex.size(), itemIDToIndex.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

  JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
  }
  testRatingData = aggregateScores(testRatingData, epsilon);

  MatrixFactorizationModel mfModel =
      pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

  parsedTestRDD.unpersist();

  double eval;
  if (implicit) {
    double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
    log.info("AUC: {}", auc);
    eval = auc;
  } else {
    double rmse = Evaluation.rmse(mfModel, testRatingData);
    log.info("RMSE: {}", rmse);
    eval = -rmse;
  }
  unpersist(mfModel);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  return eval;
}
 
Author: oncewang, Project: oryx2, Lines of code: 49, Source file: ALSUpdate.java

Example 14: SCAL_IRW

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static IndexedRowMatrix SCAL_IRW(double alpha, IndexedRowMatrix A, IndexedRowMatrix B, JavaSparkContext jsc) {

        JavaRDD<IndexedRow> rows = A.rows().toJavaRDD();

        final Broadcast<Double> alphaBC = jsc.broadcast(alpha);

        JavaRDD<IndexedRow> newRows = rows.map(new Function<IndexedRow, IndexedRow>() {
            @Override
            public IndexedRow call(IndexedRow indexedRow) throws Exception {

                double alphaValue = alphaBC.getValue().doubleValue();

                long index = indexedRow.index();

                double values[] = new double[indexedRow.vector().size()];

                for(int i = 0; i< values.length; i++) {
                    values[i] = indexedRow.vector().apply(i) * alphaValue;
                }

                return new IndexedRow(index, new DenseVector(values));

            }
        });

        B = new IndexedRowMatrix(newRows.rdd());

        return B;

    }
 
Author: jmabuin, Project: BLASpark, Lines of code: 31, Source file: L3.java

Example 15: SCAL_BCK

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static BlockMatrix SCAL_BCK(double alpha, BlockMatrix A, BlockMatrix B, JavaSparkContext jsc) {

        JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> blocks = A.blocks().toJavaRDD();

        final Broadcast<Double> alphaBC = jsc.broadcast(alpha);

        JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> newBlocks = blocks.map(new Function<Tuple2<Tuple2<Object, Object>, Matrix>, Tuple2<Tuple2<Object, Object>, Matrix>>() {
            @Override
            public Tuple2<Tuple2<Object, Object>, Matrix> call(Tuple2<Tuple2<Object, Object>, Matrix> block) throws Exception {

                double alphaBCRec = alphaBC.getValue().doubleValue();

                Integer row = (Integer)block._1._1; //Integer.parseInt(block._1._1.toString());
                Integer col = (Integer)block._1._2;
                Matrix matrixBlock = block._2;

                for(int i = 0; i< matrixBlock.numRows(); i++) {

                    for(int j = 0; j< matrixBlock.numCols(); j++) {
                        matrixBlock.update(i,j, matrixBlock.apply(i,j) * alphaBCRec);
                    }

                }

                return new Tuple2<Tuple2<Object, Object>, Matrix>(new Tuple2<Object, Object>(row, col), matrixBlock);

            }
        });

        B = new BlockMatrix(newBlocks.rdd(), A.rowsPerBlock(), A.colsPerBlock());

        return B;

    }
 
Author: jmabuin, Project: BLASpark, Lines of code: 35, Source file: L3.java


Note: The org.apache.spark.api.java.JavaSparkContext.broadcast examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Refer to the corresponding project's License before redistributing or using the code; do not repost without permission.