当前位置: 首页>>代码示例>>Java>>正文


Java JavaPairRDD.persist方法代码示例

本文整理汇总了Java中org.apache.spark.api.java.JavaPairRDD.persist方法的典型用法代码示例。如果您正苦于以下问题:Java JavaPairRDD.persist方法的具体用法?Java JavaPairRDD.persist怎么用?Java JavaPairRDD.persist使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.spark.api.java.JavaPairRDD的用法示例。


在下文中一共展示了JavaPairRDD.persist方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: run

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * return a map of topN neighbors per entity (reversed to point to in-neighbors (values) having the key entity as their top out-neighbor)
 * @param rawTriples
 * @param SEPARATOR
 * @param entityIdsRDD
 * @param MIN_SUPPORT_THRESHOLD
 * @param N topN neighbors per entity
 * @param positiveIds
 * @param jsc
 * @return 
 */
/**
 * Returns a map of top-N neighbors per entity, reversed so that each key entity
 * maps to the in-neighbors (values) that have the key entity among their top out-neighbors.
 * @param rawTriples raw triple lines, one triple per line, fields split by SEPARATOR
 * @param SEPARATOR field separator used in rawTriples
 * @param entityIdsRDD lines mapping entity URLs to integer ids (read via Utils.readEntityIdsMapping)
 * @param MIN_SUPPORT_THRESHOLD minimum relation support (relative to |E|^2) for a relation to be ranked
 * @param N number of top out-neighbors kept per entity
 * @param positiveIds true for collection 1 (positive ids), false for collection 2
 * @param jsc the Spark context, used to broadcast the entity-id mapping
 * @return map: entityId -> list of in-neighbor entityIds
 */
public Map<Integer,IntArrayList> run(JavaRDD<String> rawTriples, String SEPARATOR, JavaRDD<String> entityIdsRDD, float MIN_SUPPORT_THRESHOLD, int N, boolean positiveIds, JavaSparkContext jsc) {
    // Map each entity URL to its integer id.
    Object2IntOpenHashMap<String> entityIds = Utils.readEntityIdsMapping(entityIdsRDD, positiveIds);
    System.out.println("Found "+entityIds.size()+" entities in collection "+ (positiveIds?"1":"2"));
    
    // |E|^2, used as the denominator for relation support. Widen to long BEFORE
    // squaring to avoid int overflow. (size() == keySet().size(); avoids building a key view.)
    long numEntitiesSquared = (long) entityIds.size();
    numEntitiesSquared *= numEntitiesSquared;
    
    // Broadcast the (potentially large) id mapping once instead of shipping it with every task.
    Broadcast<Object2IntOpenHashMap<String>> entityIds_BV = jsc.broadcast(entityIds);
     
    JavaPairRDD<String,List<Tuple2<Integer, Integer>>> relationIndex = getRelationIndex(rawTriples, SEPARATOR, entityIds_BV); //a list of (s,o) for each predicate      
    
    // relationIndex is consumed by two jobs below (ranking + top neighbors); cache it across both.
    relationIndex.persist(StorageLevel.MEMORY_AND_DISK_SER());                
                    
    List<String> relationsRank = getRelationsRank(relationIndex, MIN_SUPPORT_THRESHOLD, numEntitiesSquared);      
    System.out.println("Top-5 relations in collection "+(positiveIds?"1: ":"2: ")+Arrays.toString(relationsRank.subList(0, Math.min(5,relationsRank.size())).toArray()));
    
    JavaPairRDD<Integer, IntArrayList> topOutNeighbors = getTopOutNeighborsPerEntity(relationIndex, relationsRank, N, positiveIds); //action
    
    relationIndex.unpersist(); // both consumers of relationIndex have run; free the cache
    
    // Reverse the out-neighbor lists to get in-neighbors:
    // (in, [out1,out2,out3]) -> (out1,in), (out2,in), (out3,in), then group by the out id.
    Map<Integer, IntArrayList> inNeighbors =
    topOutNeighbors.flatMapToPair(pair -> {
                List<Tuple2<Integer,Integer>> reversed = new ArrayList<>(pair._2().size());
                for (int outNeighbor : pair._2()) {
                    reversed.add(new Tuple2<>(outNeighbor, pair._1()));
                }
                return reversed.iterator();
            })
            // Aggregate into a set first to deduplicate in-neighbors per entity.
            .aggregateByKey(new IntOpenHashSet(), 
                    (set, id) -> {set.add(id); return set;}, 
                    (set1, set2) -> {set1.addAll(set2); return set1;})
            .mapValues(IntArrayList::new)
            .collectAsMap();
    
    return inNeighbors;
}
 
开发者ID:vefthym,项目名称:MinoanER,代码行数:54,代码来源:RelationsRank.java

示例2: main

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Entry point for value-based CBS meta-blocking: block filtering, block building
 * from the entity index, then CNP pruning, writing the pruned candidates to text files.
 * <p>
 * Usage: {@code args[0]} = input path, {@code args[1]} = output path.
 * With no arguments, runs in local mode with hard-coded Windows development paths.
 */
public static void main(String[] args) {
    String tmpPath;
    String inputPath;        
    String outputPath;
    
    if (args.length == 0) { // local development mode with hard-coded paths
        System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
        
        tmpPath = "/file:C:\\tmp";
        inputPath = "/file:C:\\Users\\VASILIS\\Documents\\OAEI_Datasets\\exportedBlocks\\testInput";            
        outputPath = "/file:C:\\Users\\VASILIS\\Documents\\OAEI_Datasets\\exportedBlocks\\testOutput";            
    } else { // cluster mode: paths come from the command line
        tmpPath = "/file:/tmp/";
        inputPath = args[0];            
        outputPath = args[1];
        // delete existing output directory, otherwise saveAsTextFile below would fail
        try {                                
            Utils.deleteHDFSPath(outputPath);
        } catch (IOException | URISyntaxException ex) {
            Logger.getLogger(MetaBlockingOnlyValuesCBS.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    // App name includes the last path component of the input for easy identification in the Spark UI.
    String appName = "MetaBlocking CBS only values on "+inputPath.substring(inputPath.lastIndexOf("/", inputPath.length()-2)+1);
    SparkSession spark = Utils.setUpSpark(appName, 288, 8, 3, tmpPath);
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); 
    
    
    ////////////////////////
    //start the processing//
    ////////////////////////
    
    //Stage 1: Block Filtering — builds the entity index and counts block assignments
    System.out.println("\n\nStarting BlockFiltering, reading from "+inputPath);
    LongAccumulator BLOCK_ASSIGNMENTS_ACCUM = jsc.sc().longAccumulator();
    BlockFilteringAdvanced bf = new BlockFilteringAdvanced();
    JavaPairRDD<Integer,IntArrayList> entityIndex = bf.run(jsc.textFile(inputPath), BLOCK_ASSIGNMENTS_ACCUM); 
    entityIndex.cache(); // reused below by BlocksFromEntityIndex and the count() for BCin
            
    //Stage 2: Blocks From Entity Index
    System.out.println("\n\nStarting BlocksFromEntityIndex...");
            
    LongAccumulator CLEAN_BLOCK_ACCUM = jsc.sc().longAccumulator();
    LongAccumulator NUM_COMPARISONS_ACCUM = jsc.sc().longAccumulator();
    
    BlocksFromEntityIndex bFromEI = new BlocksFromEntityIndex();
    JavaPairRDD<Integer, IntArrayList> blocksFromEI = bFromEI.run(entityIndex, CLEAN_BLOCK_ACCUM, NUM_COMPARISONS_ACCUM);
    blocksFromEI.persist(StorageLevel.DISK_ONLY()); // reused by the count() below and by CNP
    
    blocksFromEI.count(); //the simplest action just to run blocksFromEI and get the actual value for the counters below
    
    double BCin = (double) BLOCK_ASSIGNMENTS_ACCUM.value() / entityIndex.count(); //BCin = average number of block assignments per entity
    // NOTE(review): the original comment said K = |_BCin - 1_|, but the code computes
    // max(1, floor(BCin)) — kept as-is; confirm which formula is intended.
    final int K = Math.max(1, (int) Math.floor(BCin));
    System.out.println(BLOCK_ASSIGNMENTS_ACCUM.value()+" block assignments");
    System.out.println(CLEAN_BLOCK_ACCUM.value()+" clean blocks");
    System.out.println(NUM_COMPARISONS_ACCUM.value()+" comparisons");
    System.out.println("BCin = "+BCin);
    System.out.println("K = "+K);
    
    entityIndex.unpersist(); // no longer needed after BCin is computed
    
    //Stage 3: CNP pruning
    System.out.println("\n\nStarting CNP...");
    CNPCBSValuesOnly cnp = new CNPCBSValuesOnly();
    JavaPairRDD<Integer,IntArrayList> metablockingResults = cnp.run(blocksFromEI, K);
    
    metablockingResults
            .mapValues(IntArrayList::toString).saveAsTextFile(outputPath); //only to see the output and add an action (saving to file may not be needed)
    blocksFromEI.unpersist(); // final action has run; release the DISK_ONLY cache
    System.out.println("Job finished successfully. Output written in "+outputPath);
}
 
开发者ID:vefthym,项目名称:MinoanER,代码行数:74,代码来源:MetaBlockingOnlyValuesCBS.java


注:本文中的org.apache.spark.api.java.JavaPairRDD.persist方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。