This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.parallelize. If you are wondering how JavaSparkContext.parallelize is used in practice, or are looking for concrete examples of calling it, the curated code samples below may help. You can also explore further usage examples of the enclosing class, org.apache.spark.api.java.JavaSparkContext.
The following shows 14 code examples of JavaSparkContext.parallelize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
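Before diving into the examples, here is a minimal sketch of what parallelize does: it turns a local Java collection on the driver into a distributed JavaRDD that Spark can process across partitions. The class name BasicParallelizeExample and the local[*] master are illustrative choices, not taken from any of the examples below.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class BasicParallelizeExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("BasicParallelizeExample").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Distribute a local list across the cluster; the optional second
        // argument to parallelize controls the number of partitions.
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> rdd = sc.parallelize(data, 2);
        int sum = rdd.reduce(Integer::sum);
        System.out.println("sum = " + sum);
        sc.close();
    }
}

The examples that follow use the same pattern, typically parallelizing lists of file splits, labeled points, or domain objects before applying transformations and actions.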
Example 1: interleaveSplitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 2: splitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    // TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 3: interleaveSplitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 4: interleaveSplitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    String[] ns = fst.getPath().getName().split("\\.");
    // TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 5: GetPi
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public String GetPi(int scale) {
    JavaSparkContext jsc = SparkContextProvider.getContext();
    int n = 100000 * scale;
    List<Integer> l = new ArrayList<Integer>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }

    JavaRDD<Integer> dataSet = jsc.parallelize(l, scale);

    int count = dataSet.map(integer -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y < 1) ? 1 : 0;
    }).reduce((integer, integer2) -> integer + integer2);

    String ret = "Pi is roughly " + 4.0 * count / n;
    return ret;
}
Example 6: splitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 7: buildMetadataRDD
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
/**
 * buildMetadataRDD: convert a metadata list to a JavaPairRDD
 *
 * @param es        an Elasticsearch client node instance
 * @param sc        spark context
 * @param index     index name of the log processing application
 * @param metadatas metadata list
 * @return PairRDD in which each key is a metadata short name and each value is the
 *         term list extracted from that metadata's variables.
 */
protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) {
    JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas);
    JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception {
            return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList());
        }
    }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public List<String> call(List<String> v1, List<String> v2) throws Exception {
            List<String> list = new ArrayList<String>();
            list.addAll(v1);
            list.addAll(v2);
            return list;
        }
    });
    return metadataTermsRDD;
}
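The anonymous PairFunction and Function2 classes above can also be written more compactly with Java 8 lambdas. The following is a sketch of the equivalent pipeline, assuming the same PODAACMetadata accessors used in the example:

JavaPairRDD<String, List<String>> metadataTermsRDD = sc.parallelize(metadatas)
        // key each metadata record by its short name, with its term list as the value
        .mapToPair(m -> new Tuple2<>(m.getShortName(), m.getAllTermList()))
        // merge the term lists of records that share a short name
        .reduceByKey((v1, v2) -> {
            List<String> merged = new ArrayList<>(v1);
            merged.addAll(v2);
            return merged;
        });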
Example 8: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void main(String[] args) throws Exception {
    String srcBucketName;
    String srcBucketKey;
    String destBucketName;
    String destPrefix;

    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        srcBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser threw a ParseException", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, srcBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    // Create the Spark context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new CSV file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including the new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);
    sc.close();
}
Developer ID: awslabs, Project: s3-inventory-usage-examples, Lines of code: 54, Source: ReducedRedundancyLocatorExampleMain.java
Example 9: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void main(String[] args) {
    Logger.getLogger("org").setLevel(Level.WARN);
    SparkConf sparkConf = new SparkConf()
            .setAppName("ExampleSpark")
            .setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    //String in = "data/iris2.data";
    //String out = "data/iris2outSVM.data";
    //double[][] inputs = IOUtils.readMatrix(in, ",");
    //double[] outputs = IOUtils.readVector(out);
    IdxManager idx = IOUtils.deserialize("data/idx.ser");
    IdxManager idxTest = IOUtils.deserialize("data/idx-test.ser");
    double[][] inputs = idx.getData();
    double[] outputs = idx.getLabelsVec();
    double[][] inputsTest = idxTest.getData();
    double[] outputsTest = idxTest.getLabelsVec();
    inputs = HogManager.exportDataFeatures(inputs, idx.getNumOfRows(),
            idx.getNumOfCols());
    inputsTest = HogManager.exportDataFeatures(inputsTest, idx.getNumOfRows(),
            idx.getNumOfCols());

    List<LabeledPoint> pointList = new ArrayList<>();
    for (int i = 0; i < outputs.length; i++) {
        pointList.add(new LabeledPoint(outputs[i], Vectors.dense(inputs[i])));
    }
    List<LabeledPoint> pointListTest = new ArrayList<>();
    for (int i = 0; i < outputsTest.length; i++) {
        pointListTest.add(new LabeledPoint(outputsTest[i],
                Vectors.dense(inputsTest[i])));
    }
    JavaRDD<LabeledPoint> trainingData = jsc.parallelize(pointList);
    JavaRDD<LabeledPoint> testData = jsc.parallelize(pointListTest);

    // Split the data into training and test sets (30% held out for testing)
    //JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    //JavaRDD<LabeledPoint> trainingData = splits[0];
    //JavaRDD<LabeledPoint> testData = splits[1];

    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 10;
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    String impurity = "gini";
    Integer maxDepth = 10;
    Integer maxBins = 256;

    // Train a DecisionTree model for classification.
    long startTime = System.currentTimeMillis();
    final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData,
            numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
    long endTime = System.currentTimeMillis();
    long learnTime = endTime - startTime;

    // Evaluate the model on test instances and compute the test error
    JavaPairRDD<Double, Double> predictionAndLabel =
            testData.mapToPair(
                    p -> new Tuple2<>(model.predict(p.features()), p.label()));
    Double testErr = 1.0 * predictionAndLabel.filter(
            pl -> !pl._1().equals(pl._2())).count() / testData.count();

    // Write results
    new File("results").mkdir();
    IOUtils.writeStr("results/dtree_error.data", Double.toString(testErr));
    IOUtils.writeStr("results/dtree_model.data", model.toDebugString());
    double[][] outFinal = new double[outputsTest.length][];
    for (int i = 0; i < outputsTest.length; i++) {
        outFinal[i] = valToVec(model.predict(Vectors.dense(inputsTest[i])));
    }
    ConfusionMatrix cm = new ConfusionMatrix(outFinal, idxTest.getLabels());
    cm.writeClassErrorMatrix("results/confusion_matrix.data");
    IOUtils.writeStr("results/learn_time_ms.data", Long.toString(learnTime));
}
Example 10: interleaveReads
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
private static JavaPairRDD<Text, SequencedFragment> interleaveReads(String fastq, String fastq2, int splitlen, JavaSparkContext sc) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus fst = fs.getFileStatus(new Path(fastq));
    FileStatus fst2 = fs.getFileStatus(new Path(fastq2));

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    return zips.flatMapToPair(splits -> {
        FastqInputFormat.FastqRecordReader fqreader = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._1);
        FastqInputFormat.FastqRecordReader fqreader2 = new FastqInputFormat.FastqRecordReader(new Configuration(), splits._2);

        ArrayList<Tuple2<Text, SequencedFragment>> reads = new ArrayList<Tuple2<Text, SequencedFragment>>();

        while (fqreader.nextKeyValue()) {
            String key = fqreader.getCurrentKey().toString();
            String[] keysplit = key.split(" ");
            key = keysplit[0];

            SequencedFragment sf = new SequencedFragment();
            sf.setQuality(new Text(fqreader.getCurrentValue().getQuality().toString()));
            sf.setSequence(new Text(fqreader.getCurrentValue().getSequence().toString()));

            if (fqreader2.nextKeyValue()) {
                String key2 = fqreader2.getCurrentKey().toString();
                String[] keysplit2 = key2.split(" ");
                key2 = keysplit2[0];
                //key2 = key2.replace(" 2:N:0:1","/2");

                SequencedFragment sf2 = new SequencedFragment();
                sf2.setQuality(new Text(fqreader2.getCurrentValue().getQuality().toString()));
                sf2.setSequence(new Text(fqreader2.getCurrentValue().getSequence().toString()));

                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key), sf));
                reads.add(new Tuple2<Text, SequencedFragment>(new Text(key2), sf2));
            }
        }
        return reads.iterator();
    });
}
Example 11: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void main(String[] args) {
    boolean isLocal = false;
    final String master = isLocal ? "local[4]" : "spark://10.128.184.199:7077";
    final String csv = isLocal ? "Z:/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv" : "/pi_nj_57/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv";
    final String appName = "SpeedCalculator";

    Calculator calculator = new Calculator();

    SparkConf conf = new SparkConf()
            .set("spark.executor.memory", "4G")
            .set("spark.submit.deployMode", "cluster")
            .setMaster("spark://10.128.184.199:7077")
            .setJars(new String[]{"C:\\Users\\i321761\\Desktop\\git\\github.wdf.sap.corp\\i321761\\hadoop-sample\\target\\hadoopsample-1.0-SNAPSHOT.jar"});
    JavaSparkContext sc = new JavaSparkContext(master, appName, conf);

    // JavaRDD<String> rdd = sc.textFile(csv, 2);
    JavaRDD<String> rdd = sc.parallelize(Arrays.asList("abc", "def"));

    long start = System.currentTimeMillis();
    System.out.println("Count Start ....");

    // Convert each CSV line to a taxi point structure and remove invalid records
    JavaRDD<ITaxiMonitor.TaxiPoint> taxiPointRDD = rdd.map(line -> TaxiPointUtil.parseTaxiPoint(line))
            .filter(point -> point != null && !point.receiveTime.isEmpty() && point.receiveTime.contains(" 08:"));

    JavaPairRDD<Long, List<ITaxiMonitor.TaxiPoint>> slotsIn5 = taxiPointRDD
            .keyBy(point -> (DateTimeUtil.parseToMillSecond(point.receiveTime, "UTC+8") / 300000) * 300000)
            .combineByKey(
                    // createCombiner: initialization when the first record for a key arrives
                    v -> {
                        List<ITaxiMonitor.TaxiPoint> points = new ArrayList();
                        points.add(v);
                        return points;
                    },
                    // mergeValue: what to do when another record arrives for an existing key
                    (c, v) -> {
                        c.add(v);
                        return c;
                    },
                    // mergeCombiners: how to merge partial collections of the same key produced by different tasks
                    (c1, c2) -> {
                        c1.addAll(c2);
                        return c1;
                    }
            )
            .sortByKey();

    // Each key represents a 5-minute slice of traffic data; call the calculator on each slice to compute traffic speeds
    slotsIn5.map(slot -> calculator.execute(slot._2(), slot._1(), slot._1()))
            .collect().forEach(speedResult -> {
                speedResult.getTimedEdgeSpeeds().forEach(timedEdgeSpeeds -> {
                    long t = DateTimeUtil.parseToMillSecond(timedEdgeSpeeds.timestamp, "UTC+0");
                    timedEdgeSpeeds.edgeSpeeds.forEach(speed -> System.out.println(" * EDGE_SPEED: " + TaxiPointUtil.formatEdgeSpeed(t, speed, ",")));
                });
            });

    slotsIn5.take(10)
            .forEach(slot -> System.out.println("slot: " + slot._1() + ", " + DateTimeUtil.formatToUTC(slot._1()) + ", count: " + slot._2().size()));
    // .foreach(slot -> System.out.println("slot: " + DateTimeUtil.formatToUTC(slot._1()) + ", count" + slot._2().size()));

    sc.stop();
}
Example 12: splitFastq
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
private static void splitFastq(FileStatus fst, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    // TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, new Configuration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/" + split.getPath().getName() + "_" + split.getStart() + ".fq");
    });
}
Example 13: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("graph");
    JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
    ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

    List<Edge<String>> edges = new ArrayList<>();
    edges.add(new Edge<String>(1, 2, "Friend"));
    edges.add(new Edge<String>(2, 3, "Advisor"));
    edges.add(new Edge<String>(1, 3, "Friend"));
    edges.add(new Edge<String>(4, 3, "colleague"));
    edges.add(new Edge<String>(4, 5, "Relative"));
    edges.add(new Edge<String>(2, 5, "BusinessPartners"));

    JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(edges);
    Graph<String, String> graph = Graph.fromEdges(edgeRDD.rdd(), "", StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), stringTag, stringTag);
    graph.vertices().toJavaRDD().collect().forEach(System.out::println);
    // graph.aggregateMessages(sendMsg, mergeMsg, tripletFields, evidence$11)
}
Developer ID: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines of code: 31, Source: PropertyGraphExampleFromEdges.java
Example 14: testFromRdd
import org.apache.spark.api.java.JavaSparkContext; // import the package/class that this method depends on
@Test
public void testFromRdd() {
    JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Condition> conditionRdd = context.parallelize(ImmutableList.of(condition));

    Dataset<Condition> ds = spark.createDataset(conditionRdd.rdd(),
            encoders.of(Condition.class));

    Condition convertedCondition = ds.head();

    Assert.assertEquals(condition.getId(),
            convertedCondition.getId());
}