This article collects typical usage examples of the Java class org.apache.spark.api.java.JavaSparkContext. If you are wondering what exactly the JavaSparkContext class does, how to use it, or where to find usage examples, the curated class code examples below may help.
The JavaSparkContext class belongs to the org.apache.spark.api.java package. Fifteen code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
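Before diving into the examples, here is a minimal, self-contained sketch of the typical JavaSparkContext lifecycle: build a SparkConf, create the context, run an action, and stop the context. The local[*] master, the class name, and the sample data are illustrative assumptions, not taken from any of the examples that follow.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaSparkContextQuickStart {
    public static void main(String[] args) {
        // Configure the application; local[*] runs Spark inside the current JVM
        SparkConf conf = new SparkConf()
                .setAppName("JavaSparkContextQuickStart")
                .setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Turn a local collection into an RDD and run a simple action
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
            long evenCount = numbers.filter(n -> n % 2 == 0).count();
            System.out.println("Even numbers: " + evenCount);
        } finally {
            // Always release the context's resources
            sc.stop();
        }
    }
}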
Example 1: main
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Usage:");
System.err.println(" SparkWordCount <sourceFile> <targetFile>");
System.exit(1);
}
SparkConf conf = new SparkConf()
.setAppName("Word Count");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> textFile = sc.textFile(args[0]);
JavaRDD<String> words = textFile.flatMap(LineIterator::new);
JavaPairRDD<String, Long> pairs =
words.mapToPair(s -> new Tuple2<>(s, 1L));
JavaPairRDD<String, Long> counts =
pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);
System.out.println("Starting task..");
long t = System.currentTimeMillis();
counts.saveAsTextFile(args[1] + "_" + t);
System.out.println("Time=" + (System.currentTimeMillis() - t));
}
Example 2: wordCountJava8
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void wordCountJava8( String filename )
{
// Define a configuration to use to interact with Spark
SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");
// Create a Java version of the Spark Context from the configuration
JavaSparkContext sc = new JavaSparkContext(conf);
// Load the input data, which is a text file read from the command line
JavaRDD<String> input = sc.textFile( filename );
// Java 8 with lambdas: split the input string into words
// TODO here a change has happened
JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );
// Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );
// Save the word count back out to a text file, causing evaluation.
counts.saveAsTextFile( "output" );
}
Example 3: main
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void main(String[] args) throws Exception {
System.out.println(System.getProperty("hadoop.home.dir"));
String inputPath = args[0];
String outputPath = args[1];
FileUtils.deleteQuietly(new File(outputPath));
JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");
JavaRDD<String> rdd = sc.textFile(inputPath);
JavaPairRDD<String, Integer> counts = rdd
.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
.mapToPair(x -> new Tuple2<>(x, 1))
.reduceByKey((x, y) -> x + y);
counts.saveAsTextFile(outputPath);
sc.close();
}
Example 4: Spark
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
* Initializes a Spark connection. Use it afterwards for execution of Spark
* SQL queries.
*
* @param appName
* the name of the app that will be used with this Spark
* connection
* @param database
* name of the database that will be used with this Spark
* connection
*/
public Spark(String appName, String database) {
// TODO check what will happen if the same app name is already in use
this.sparkConfiguration = new SparkConf().setAppName(appName);
this.javaContext = new JavaSparkContext(sparkConfiguration);
this.hiveContext = new HiveContext(javaContext);
// TODO check what kind of exception can be thrown here if there is a
// problem with spark connection
this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
// TODO check what kind of exception is thrown if the database already exists
// use the created database
this.hiveContext.sql((String.format("USE %s", database)));
}
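A hedged usage sketch for the wrapper above; the app and database names are placeholders, and how queries are issued afterwards depends on what the rest of the class exposes:

// Hypothetical usage of the Spark wrapper defined above
Spark spark = new Spark("example-app", "example_db");
// The constructor has already created the database and switched to it via the
// HiveContext; subsequent Spark SQL statements would run against that database,
// e.g. through a query method the class might expose:
// spark.sql("CREATE TABLE IF NOT EXISTS events (id INT, name STRING)");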
Example 5: buildModel
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
* @param sparkContext active Spark Context
* @param trainData training data on which to build a model
* @param hyperParameters ordered list of hyper parameter values to use in building model
* @param candidatePath directory where additional model files can be written
* @return a {@link PMML} representation of a model trained on the given data
*/
@Override
public PMML buildModel(JavaSparkContext sparkContext,
JavaRDD<String> trainData,
List<?> hyperParameters,
Path candidatePath) {
int numClusters = (Integer) hyperParameters.get(0);
Preconditions.checkArgument(numClusters > 1);
log.info("Building KMeans Model with {} clusters", numClusters);
JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
numberOfRuns, initializationStrategy);
return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
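For context, the MLlib call at the core of buildModel looks roughly like this when used on its own; the feature vectors, k = 2, and the 20-iteration cap are arbitrary placeholders, and sc is assumed to be an existing JavaSparkContext:

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

// Two tight groups of points, so k = 2 is a sensible choice
JavaRDD<Vector> trainingData = sc.parallelize(Arrays.asList(
        Vectors.dense(0.0, 0.0),
        Vectors.dense(0.1, 0.1),
        Vectors.dense(9.0, 9.0),
        Vectors.dense(9.1, 9.1)));
// KMeans.train expects the underlying Scala RDD, hence .rdd()
KMeansModel model = KMeans.train(trainingData.rdd(), 2, 20);
System.out.println("Cluster centers: " + Arrays.toString(model.clusterCenters()));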
Example 6: GetLU
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {
DistributedMatrix returnedMatrix;
if( A.getClass() == IndexedRowMatrix.class) {
returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
}
else if (A.getClass() == CoordinateMatrix.class) {
returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
}
else if (A.getClass() == BlockMatrix.class){
// TODO: Implement this operation
//returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
returnedMatrix = null;
}
else {
returnedMatrix = null;
}
return returnedMatrix;
}
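A usage sketch for GetLU: build an IndexedRowMatrix from an RDD of IndexedRow and pass it in, so the call dispatches to OtherOperations.GetLU_IRW. The matrix values are placeholders and jsc is assumed to be an existing JavaSparkContext; only the matrix construction is standard MLlib API, while GetLU and OtherOperations come from the project above.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.linalg.distributed.DistributedMatrix;
import org.apache.spark.mllib.linalg.distributed.IndexedRow;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;

// A small 2x2 matrix stored as indexed rows
JavaRDD<IndexedRow> rows = jsc.parallelize(Arrays.asList(
        new IndexedRow(0, Vectors.dense(4.0, 1.0)),
        new IndexedRow(1, Vectors.dense(2.0, 3.0))));
IndexedRowMatrix a = new IndexedRowMatrix(rows.rdd());
// Dispatches to GetLU_IRW because the argument is an IndexedRowMatrix
DistributedMatrix lu = GetLU(a, jsc);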
Example 7: GetD
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {
DistributedMatrix returnedMatrix;
if( A.getClass() == IndexedRowMatrix.class) {
returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
}
else if (A.getClass() == CoordinateMatrix.class) {
returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
}
else if (A.getClass() == BlockMatrix.class){
// TODO: Implement this operation
//returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
returnedMatrix = null;
}
else {
returnedMatrix = null;
}
return returnedMatrix;
}
Example 8: splitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
Path fqpath = new Path(fqPath);
String fqname = fqpath.getName();
String[] ns = fqname.split("\\.");
//TODO: Handle also compressed files
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
splitRDD.foreach( split -> {
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
});
}
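Calling the helper above requires a Hadoop FileStatus for the FASTQ file; a minimal sketch, where the input path, split directory, and split length are placeholders and sc is an existing JavaSparkContext:

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

String fqPath = "/data/sample.fastq";   // placeholder input path
FileSystem fs = FileSystem.get(sc.hadoopConfiguration());
FileStatus fst = fs.getFileStatus(new Path(fqPath));
// FASTQ records are 4 lines each, so the split length should be a multiple of 4
splitFastq(fst, fqPath, "/data/splits", 400000, sc);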
Example 9: interleaveSplitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
Example 10: loadClickStremFromTxt
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
* loadClickStremFromTxt: load click stream from a txt file
*
* @param clickthroughFile
* txt file
* @param sc
* the spark context
* @return clickstream list in JavaRDD format {@link ClickStream}
*/
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@SuppressWarnings("unchecked")
@Override
public Iterator<ClickStream> call(String line) throws Exception {
List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
return clickthroughs.iterator();
}
});
}
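With Java 8 lambdas the same loader can be written more compactly; a sketch, assuming ClickStream.parseFromTextLine keeps the signature used above and that the Spark 2.x flatMap contract (returning an Iterator) applies. The Lambda suffix on the method name is only there to avoid clashing with the method above.

public JavaRDD<ClickStream> loadClickStremFromTxtLambda(String clickthroughFile, JavaSparkContext sc) {
    // flatMap expects an Iterator per input line, so return the parsed list's iterator
    return sc.textFile(clickthroughFile)
            .flatMap(line -> ((List<ClickStream>) ClickStream.parseFromTextLine(line)).iterator());
}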
Example 11: SparkDriver
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
public SparkDriver(Properties props) {
SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
.set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");
String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);
if (!"".equals(esHost)) {
conf.set("es.nodes", esHost);
}
if (!"".equals(esPort)) {
conf.set("es.port", esPort);
}
conf.set("spark.serializer", KryoSerializer.class.getName());
conf.set("es.batch.size.entries", "1500");
sc = new JavaSparkContext(conf);
sqlContext = new SQLContext(sc);
}
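A hedged construction sketch for the driver above; the property values are placeholders, while the MudrodConstants keys are the same ones referenced in the constructor:

import java.util.Properties;

Properties props = new Properties();
props.setProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp");
props.setProperty(MudrodConstants.SPARK_MASTER, "local[*]");
props.setProperty(MudrodConstants.ES_UNICAST_HOSTS, "localhost");
props.setProperty(MudrodConstants.ES_HTTP_PORT, "9200");
// The constructor wires Kryo serialization and the Elasticsearch settings into the SparkConf
SparkDriver driver = new SparkDriver(props);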
Example 12: init
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
private void init() throws IOException {
final ClientConfig config = new ClientConfig();
client = HazelcastClient.newHazelcastClient(config);
final SparkConf conf = new SparkConf()
.set("hazelcast.server.addresses", "127.0.0.1:5701")
.set("hazelcast.server.groupName", "dev")
.set("hazelcast.server.groupPass", "dev-pass")
.set("hazelcast.spark.valueBatchingEnabled", "true")
.set("hazelcast.spark.readBatchSize", "5000")
.set("hazelcast.spark.writeBatchSize", "5000");
sc = new JavaSparkContext("local", "appname", conf);
loadHistoricalRaces();
createRandomUsers();
createFutureEvent();
}
Example 13: run
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
//JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}
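The broadcast pattern used above is worth isolating: the driver ships a read-only map to the executors once, and each task reads it through value() instead of serializing the map with every closure. A minimal sketch with placeholder contents, assuming jsc is an existing JavaSparkContext:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.broadcast.Broadcast;

Map<Integer, String> lookup = new HashMap<>();
lookup.put(1, "one");
lookup.put(2, "two");
Broadcast<Map<Integer, String>> lookupBV = jsc.broadcast(lookup);

JavaRDD<String> resolved = jsc.parallelize(Arrays.asList(1, 2, 1))
        // Each task reads the broadcast copy of the map
        .map(id -> lookupBV.value().getOrDefault(id, "unknown"));
System.out.println(resolved.collect());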
Example 14: run2
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates = getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}
Example 15: run
import org.apache.spark.api.java.JavaSparkContext; // import the required package/class
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
//JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);
//JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsOld(neighborSims, K);
JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}