

Java JavaSparkContext Class Code Examples

This article collects typical usage examples of the Java class org.apache.spark.api.java.JavaSparkContext, drawn from open-source projects. If you are wondering what JavaSparkContext is for, or how to use it in practice, the curated examples below should help.


The JavaSparkContext class belongs to the org.apache.spark.api.java package. The following sections present 15 code examples of the class, ordered by popularity.
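
Before the examples, here is a minimal, self-contained sketch of the typical JavaSparkContext lifecycle — configure, create, compute, stop. The app name and master URL are illustrative placeholders, not taken from any of the projects below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaSparkContextLifecycle {
    public static void main(String[] args) {
        // Configure and create the context (local[*] runs in-process, useful for testing).
        SparkConf conf = new SparkConf().setAppName("Lifecycle Demo").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Run a trivial distributed computation.
        long evens = sc.parallelize(Arrays.asList(1, 2, 3, 4))
                .filter(n -> n % 2 == 0)
                .count();
        System.out.println("Even numbers: " + evens);

        // Always release cluster resources when done.
        sc.stop();
    }
}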

Example 1: main

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((a, b) -> a + b);

    System.out.println("Starting task...");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
 
Developer: hazelcast, Project: big-data-benchmark, Lines: 23, Source: SparkWordCount.java
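
Example 1 relies on a project-specific LineIterator class that is not shown here. As a hypothetical sketch (the real class in big-data-benchmark may differ), flatMap(LineIterator::new) type-checks if the class takes one line in its constructor and implements Iterator<String>:

import java.util.Arrays;
import java.util.Iterator;

// Hypothetical stand-in for the project's LineIterator: wraps a
// whitespace split of one input line, so flatMap yields its words.
public class LineIterator implements Iterator<String> {
    private final Iterator<String> words;

    public LineIterator(String line) {
        this.words = Arrays.asList(line.split("\\s+")).iterator();
    }

    @Override
    public boolean hasNext() {
        return words.hasNext();
    }

    @Override
    public String next() {
        return words.next();
    }
}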

Example 2: wordCountJava8

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: map each word to a (word, 1) pair, then sum the counts per word
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 22, Source: WordCount.java

Example 3: main

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<>(x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 19, Source: SparkWordCount.java

Example 4: Spark

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 * 
 * @param appName
 *            the name of the app that will be used with this Spark
 *            connection
 * @param database
 *            name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {

	// TODO: check what happens if an app with the same name is already in use
	this.sparkConfiguration = new SparkConf().setAppName(appName);
	this.javaContext = new JavaSparkContext(sparkConfiguration);
	this.hiveContext = new HiveContext(javaContext);
	// TODO: check which exception is thrown if the Spark connection fails

	this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
	// TODO: check which exception is thrown if the database already exists

	// use the created database
	this.hiveContext.sql(String.format("USE %s", database));
}
 
Developer: aschaetzle, Project: Sempala, Lines: 28, Source: Spark.java
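
HiveContext is deprecated in Spark 2.x in favor of SparkSession. A minimal sketch of an equivalent setup, assuming Hive support is available on the classpath (the IF NOT EXISTS guard and the class/method name connect are additions for illustration):

import org.apache.spark.sql.SparkSession;

public class SparkSessionSetup {
    public static SparkSession connect(String appName, String database) {
        // SparkSession bundles SparkContext, SQLContext and HiveContext.
        SparkSession spark = SparkSession.builder()
                .appName(appName)
                .enableHiveSupport()
                .getOrCreate();

        spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", database));
        spark.sql(String.format("USE %s", database));
        return spark;
    }
}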

Example 5: buildModel

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyperparameter values to use in building the model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                                         numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
 
Developer: oncewang, Project: oryx2, Lines: 23, Source: KMeansUpdate.java
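
KMeans.train above is the Spark MLlib clustering entry point. A minimal, runnable sketch of the same API on toy data (the app name, point values, k = 2 and 20 iterations are illustrative choices):

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class KMeansSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local", "kmeans-sketch");

        // Two obvious clusters around (0, 0) and (9, 9).
        JavaRDD<Vector> points = sc.parallelize(Arrays.asList(
                Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
                Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)));

        // Train with k = 2 clusters and at most 20 iterations.
        KMeansModel model = KMeans.train(points.rdd(), 2, 20);
        System.out.println(Arrays.toString(model.clusterCenters()));

        sc.stop();
    }
}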

Example 6: GetLU

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
    }
    else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    }
    else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
 
Developer: jmabuin, Project: BLASpark, Lines: 25, Source: OtherOperations.java
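
The exact-class comparisons above (A.getClass() == IndexedRowMatrix.class) reject subclasses. A sketch of the same dispatch with instanceof, which also accepts subtypes — slightly different semantics, and the method name is invented for illustration; the matrix types come from org.apache.spark.mllib.linalg.distributed and the helpers are the ones in BLASpark's OtherOperations:

// Hypothetical alternative to GetLU; delegates to the same helpers.
public static DistributedMatrix getLUViaInstanceof(DistributedMatrix A) {
    if (A instanceof IndexedRowMatrix) {
        return OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
    }
    if (A instanceof CoordinateMatrix) {
        return OtherOperations.GetLU_COORD((CoordinateMatrix) A);
    }
    return null; // BlockMatrix support is still a TODO upstream
}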

Example 7: GetD

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
    }
    else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    }
    else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
 
Developer: jmabuin, Project: BLASpark, Lines: 24, Source: OtherOperations.java

Example 8: splitFastq

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  // TODO: handle compressed files as well
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Developer: NGSeq, Project: ViraPipe, Lines: 17, Source: InterleaveMulti.java

Example 9: interleaveSplitFastq

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Developer: NGSeq, Project: ViraPipe, Lines: 18, Source: Decompress.java

Example 10: loadClickStremFromTxt

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * loadClickStremFromTxt: load a click stream from a txt file
 *
 * @param clickthroughFile
 *          txt file
 * @param sc
 *          the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
    private static final long serialVersionUID = 1L;

    @SuppressWarnings("unchecked")
    @Override
    public Iterator<ClickStream> call(String line) throws Exception {
      List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
      return clickthroughs.iterator();
    }
  });
}
 
Developer: apache, Project: incubator-sdap-mudrod, Lines: 25, Source: SessionExtractor.java
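
The anonymous FlatMapFunction above predates Java 8 style; a sketch of the same loader written as a lambda (the method name is invented for illustration, ClickStream is the class assumed in the example):

public JavaRDD<ClickStream> loadClickStreamLambda(String clickthroughFile, JavaSparkContext sc) {
  // The lambda must return an Iterator, matching FlatMapFunction.call.
  return sc.textFile(clickthroughFile)
      .flatMap(line -> ((List<ClickStream>) ClickStream.parseFromTextLine(line)).iterator());
}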

Example 11: SparkDriver

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public SparkDriver(Properties props) {
  SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
      .set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");

  String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
  String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

  if (!"".equals(esHost)) {
    conf.set("es.nodes", esHost);
  }

  if (!"".equals(esPort)) {
    conf.set("es.port", esPort);
  }

  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.batch.size.entries", "1500");

  sc = new JavaSparkContext(conf);
  sqlContext = new SQLContext(sc);
}
 
Developer: apache, Project: incubator-sdap-mudrod, Lines: 22, Source: SparkDriver.java

Example 12: init

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
private void init() throws IOException {
    final ClientConfig config = new ClientConfig();
    client = HazelcastClient.newHazelcastClient(config);

    final SparkConf conf = new SparkConf()
            .set("hazelcast.server.addresses", "127.0.0.1:5701")
            .set("hazelcast.server.groupName", "dev")
            .set("hazelcast.server.groupPass", "dev-pass")
            .set("hazelcast.spark.valueBatchingEnabled", "true")
            .set("hazelcast.spark.readBatchSize", "5000")
            .set("hazelcast.spark.writeBatchSize", "5000");

    sc = new JavaSparkContext("local", "appname", conf);

    loadHistoricalRaces();
    createRandomUsers();
    createFutureEvent();
}
 
Developer: hazelcast, Project: betleopard, Lines: 19, Source: LiveBetMain.java

Example 13: run

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Developer: vefthym, Project: MinoanER, Lines: 35, Source: CNPNeighborsUnnormalized.java
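
Examples 13–15 all follow the same pattern: build an in-neighbors map on the driver, ship it once with jsc.broadcast, and read it inside tasks via value(). A minimal, self-contained sketch of that pattern (names and data are illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local", "broadcast-sketch");

        // Driver-side lookup table, shipped once to every executor.
        Map<Integer, String> lookup = new HashMap<>();
        lookup.put(1, "one");
        lookup.put(2, "two");
        Broadcast<Map<Integer, String>> lookupBV = jsc.broadcast(lookup);

        // Tasks read the broadcast value instead of capturing the map
        // in every serialized task closure.
        JavaRDD<String> named = jsc.parallelize(Arrays.asList(1, 2))
                .map(id -> lookupBV.value().get(id));

        System.out.println(named.collect()); // [one, two]

        jsc.stop();
    }
}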

Example 14: run2

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =  getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Developer: vefthym, Project: MinoanER, Lines: 33, Source: CNPARCS.java

Example 15: run

import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    
    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);        
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsOld(neighborSims, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Developer: vefthym, Project: MinoanER, Lines: 36, Source: CNPNeighbors.java


Note: The org.apache.spark.api.java.JavaSparkContext examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors. For distribution and use, please consult the license of the corresponding project. Do not reproduce without permission.