本文整理汇总了 Java 中 org.apache.spark.api.java.JavaRDD.count 方法的典型用法代码示例。如果您正苦于以下问题:Java JavaRDD.count 方法的具体用法?Java JavaRDD.count 怎么用?Java JavaRDD.count 使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 org.apache.spark.api.java.JavaRDD 的用法示例。
在下文中一共展示了JavaRDD.count方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: partition
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Redistributes instances across partitions when repartitioning by type is enabled.
 *
 * <p>Each instance is keyed by its type hash plus a split increment derived from its id,
 * the approximate count for its type hash and the per-partition quota; the keyed RDD is
 * then hash-partitioned into the same number of partitions as the input.
 *
 * @param instances RDD of instances to partition
 * @return the repartitioned RDD, or {@code instances} itself when
 *         {@code config.isRepartitionByType()} is false
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (config.isRepartitionByType()) {
        log.info("Getting counts by type hash");
        Map<Integer, Long> countsByTypeHash = getApproximateTypeHashCounts(instances);
        int partitionCount = instances.getNumPartitions();
        long instanceTotal = instances.count();
        // The "+ 1" keeps the quota at least 1 even when the division truncates to zero.
        long quotaPerPartition = instanceTotal / partitionCount + 1;

        JavaPairRDD<Integer, Instance> keyedInstances = instances.mapToPair(current -> {
            int hashOfType = getTypeHash(current);
            int increment = getSplitIncrement(current.getId(), countsByTypeHash.get(hashOfType), quotaPerPartition);
            return new Tuple2<>(hashOfType + increment, current);
        });

        log.info("Partitioning instances by type");
        JavaPairRDD<Integer, Instance> redistributed =
                keyedInstances.partitionBy(new HashPartitioner(partitionCount));
        return redistributed.values();
    }
    return instances;
}
示例2: fetchSampleData
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Returns the evaluation data, down-sampled when it holds more than
 * {@code MAX_SAMPLE_SIZE} elements.
 *
 * @param evalData data to sample from
 * @return {@code evalData} itself when its count does not exceed {@code MAX_SAMPLE_SIZE};
 *         otherwise a sample taken with fraction {@code MAX_SAMPLE_SIZE / count}
 */
static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
    long elementCount = evalData.count();
    if (elementCount <= MAX_SAMPLE_SIZE) {
        return evalData;
    }
    double keepFraction = (double) MAX_SAMPLE_SIZE / elementCount;
    return evalData.sample(false, keepFraction);
}
示例3: accuracy
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Computes the fraction of examples whose predicted most-probable category encoding
 * equals the encoding of the example's target feature.
 *
 * @param forest model used to predict each example
 * @param examples examples to score
 * @return number of matching examples divided by the total, or {@code 0.0} when
 *         {@code examples} is empty
 */
static double accuracy(DecisionForest forest, JavaRDD<Example> examples) {
    long exampleCount = examples.count();
    if (exampleCount != 0) {
        JavaRDD<Example> correctlyPredicted = examples.filter(candidate -> {
            CategoricalPrediction predicted = (CategoricalPrediction) forest.predict(candidate);
            CategoricalFeature actual = (CategoricalFeature) candidate.getTarget();
            return predicted.getMostProbableCategoryEncoding() == actual.getEncoding();
        });
        long hits = correctlyPredicted.count();
        return (double) hits / exampleCount;
    }
    // Nothing to score: report zero rather than dividing by zero.
    return 0.0;
}
示例4: evaluate
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Records the size of the test data and returns it as the evaluation value.
 *
 * <p>Only {@code testData} is read; the other arguments are not used by this
 * implementation.
 *
 * @return the number of elements in {@code testData}, widened to {@code double}
 */
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
    final long numTestRecords = testData.count();
    // Stored as an int, so the count is narrowed before being recorded.
    testCounts.add((int) numTestRecords);
    log.info("Returning eval {}", numTestRecords);
    return (double) numTestRecords;
}
示例5: main
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Trains a random-forest classifier on the LibSVM file {@code data/rf-data.txt} using a
 * {@code local[4]} Spark context, then prints the test error and the learned model.
 *
 * <p>The Spark context is stopped in a {@code finally} block so it is released even when
 * loading, training or evaluation throws.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any");
    JavaSparkContext sc = new JavaSparkContext(configuration);
    try {
        // Load and parse the data file.
        String input = "data/rf-data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD();

        // Split the data into training and test sets (30% held out for testing).
        JavaRDD<LabeledPoint>[] dataSplits = data.randomSplit(new double[]{0.7, 0.3});
        JavaRDD<LabeledPoint> trainingData = dataSplits[0];
        JavaRDD<LabeledPoint> testData = dataSplits[1];

        // Train a RandomForest model.
        Integer numClasses = 2;
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        Integer numTrees = 3; // Use more in practice.
        String featureSubsetStrategy = "auto"; // Let the algorithm choose.
        String impurity = "gini";
        Integer maxDepth = 5;
        Integer maxBins = 32;
        Integer seed = 12345;
        final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses,
                categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
                seed);

        // Pair each test point's prediction with its true label.
        JavaPairRDD<Double, Double> label =
                testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                    public Tuple2<Double, Double> call(LabeledPoint p) {
                        return new Tuple2<Double, Double>(rfModel.predict(p.features()), p.label());
                    }
                });

        // Test error = mismatching predictions / number of test points.
        long mismatches = label.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            public Boolean call(Tuple2<Double, Double> pl) {
                return !pl._1().equals(pl._2());
            }
        }).count();
        double testError = 1.0 * mismatches / testData.count();

        System.out.println("Test Error: " + testError);
        System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
    } finally {
        // Always release the Spark context, including on failure paths.
        sc.stop();
    }
}
示例6: main
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Trains a decision-tree classifier on features exported from serialized training data,
 * evaluates it on a separate serialized test set, and writes the test error, model dump,
 * confusion matrix and training time under {@code results/}.
 *
 * <p>The Spark context is stopped in a {@code finally} block so it is released even when
 * any step throws.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    Logger.getLogger("org").setLevel(Level.WARN);
    SparkConf sparkConf = new SparkConf()
            .setAppName("ExampleSpark")
            .setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    try {
        IdxManager idx = IOUtils.deserialize("data/idx.ser");
        IdxManager idxTest = IOUtils.deserialize("data/idx-test.ser");
        double[][] inputs = idx.getData();
        double[] outputs = idx.getLabelsVec();
        double[][] inputsTest = idxTest.getData();
        double[] outputsTest = idxTest.getLabelsVec();

        inputs = HogManager.exportDataFeatures(inputs, idx.getNumOfRows(),
                idx.getNumOfCols());
        // NOTE(review): the test features are exported with the dimensions of idx, not
        // idxTest; presumably both sets share the same row/column size - confirm.
        inputsTest = HogManager.exportDataFeatures(inputsTest, idx.getNumOfRows(),
                idx.getNumOfCols());

        List<LabeledPoint> pointList = new ArrayList<>();
        for (int i = 0; i < outputs.length; i++) {
            pointList.add(new LabeledPoint(outputs[i], Vectors.dense(inputs[i])));
        }
        List<LabeledPoint> pointListTest = new ArrayList<>();
        for (int i = 0; i < outputsTest.length; i++) {
            pointListTest.add(new LabeledPoint(outputsTest[i],
                    Vectors.dense(inputsTest[i])));
        }

        // Training and test sets come from separate files; no random split is performed.
        JavaRDD<LabeledPoint> trainingData = jsc.parallelize(pointList);
        JavaRDD<LabeledPoint> testData = jsc.parallelize(pointListTest);

        // Set parameters.
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        Integer numClasses = 10;
        Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
        String impurity = "gini";
        Integer maxDepth = 10;
        Integer maxBins = 256;

        // Train a DecisionTree model for classification, timing only the training call.
        long startTime = System.currentTimeMillis();
        final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData,
                numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
        long endTime = System.currentTimeMillis();
        long learnTime = endTime - startTime;

        // Evaluate model on test instances and compute test error.
        JavaPairRDD<Double, Double> predictionAndLabel =
                testData.mapToPair(
                        p -> new Tuple2<>(model.predict(p.features()), p.label()));
        long mismatches = predictionAndLabel.filter(
                pl -> !pl._1().equals(pl._2())).count();
        double testErr = 1.0 * mismatches / testData.count();

        // Write results.
        new File("results").mkdir();
        IOUtils.writeStr("results/dtree_error.data", Double.toString(testErr));
        IOUtils.writeStr("results/dtree_model.data", model.toDebugString());

        // Predict each test row locally to build the confusion matrix input.
        double[][] outFinal = new double[outputsTest.length][];
        for (int i = 0; i < outputsTest.length; i++) {
            outFinal[i] = valToVec(model.predict(Vectors.dense(inputsTest[i])));
        }
        ConfusionMatrix cm = new ConfusionMatrix(outFinal, idxTest.getLabels());
        cm.writeClassErrorMatrix("results/confusion_matrix.data");
        IOUtils.writeStr("results/learn_time_ms.data", Long.toString(learnTime));
    } finally {
        // Always release the Spark context, including on failure paths.
        jsc.stop();
    }
}
示例7: main
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Generates test log files in an HDFS directory, reads them back through
 * {@code LogfileInputFormat} (once from the {@code *.log} files and once from the gzipped
 * files), and prints a table comparing the record counts per log level with the counts
 * reported by the generator.
 *
 * <p>Exits with status 1 when the arguments cannot be parsed, or when the target exists in
 * HDFS but is not a directory or is not empty. The Spark context is managed with
 * try-with-resources so it is closed even when a Spark job throws.
 *
 * @param args command-line arguments, interpreted by {@code parseArguments}
 * @throws IOException if an HDFS operation fails
 */
public static void main(String[] args) throws IOException {
    final Configuration hadoopConfig = new Configuration(true);
    hdfs = FileSystem.get(hadoopConfig);
    if (!parseArguments(args)) {
        printHelp();
        System.exit(1);
    }
    // Refuse to write into anything other than a missing or empty directory.
    if (hdfs.exists(directory)) {
        if (!hdfs.isDirectory(directory)) {
            System.out.printf("'%s' exists in HDFS, but is not a directory!%n", directory);
            System.exit(1);
        }
        FileStatus[] fileStatus = hdfs.listStatus(directory);
        if (fileStatus.length > 0) {
            System.out.printf("'%s' exists in HDFS, but is not empty!%n", directory);
            System.exit(1);
        }
    }
    createDirectories();
    System.out.printf("Creating test data in '%s'. This may take a while...%n", directory.toString());
    Map<String, LogfileType> logfileTypeByPath = new HashMap<>();
    LogfileSummary summary = writeLogFiles(logfileTypeByPath);

    SparkConf sparkConfig = new SparkConf().setAppName("Testing LogfileInputFormat.");
    try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig)) {
        // Register the first-line pattern of every generated file by its path.
        logfileTypeByPath.forEach((path, type) -> {
            LogfileInputFormat.setPattern(hadoopConfig, path, type.getFirstlinePattern());
        });
        // NOTE(review): presumably the pattern used for paths without their own entry - confirm.
        LogfileInputFormat.setPattern(hadoopConfig, LogfileType.A.getFirstlinePattern());

        JavaPairRDD<Tuple2<Path, Long>, Text> rdd;
        JavaRDD<Tuple2<LocalDateTime, LogLevel>> logRecords;
        Function<Tuple2<Tuple2<Path, Long>, Text>, Tuple2<LocalDateTime, LogLevel>> mappingFunction = mappingFunction(logfileTypeByPath);

        // Counts from the plain *.log files; the mapped RDD is cached because four counts run on it.
        rdd = sparkContext.newAPIHadoopFile(logDir + "/*" + FILE_EXT_LOG, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);
        logRecords = rdd.map(mappingFunction).cache();
        long totalCountLog = logRecords.count();
        long infoCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
        long warnCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
        long errorCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

        // Same counts from the gzipped files.
        rdd = sparkContext.newAPIHadoopFile(logDirGz + "/*" + FILE_EXT_GZ, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);
        logRecords = rdd.map(mappingFunction).cache();
        long totalCountGz = logRecords.count();
        long infoCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
        long warnCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
        long errorCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

        // Expected counts as reported by the generator.
        long totalCountExpected = summary.getRecordCount();
        long infoCountExpected = summary.getRecordCount(LogLevel.INFO);
        long warnCountExpected = summary.getRecordCount(LogLevel.WARN);
        long errorCountExpected = summary.getRecordCount(LogLevel.ERROR);

        // A row is a SUCCESS only when expected, *.log and *.log.gz counts all agree.
        System.out.printf("%n%n%n%30s %15s %15s %15s %15s%n%n", "", "expected", "from *.log", "from *.log.gz", "test result");
        System.out.printf("%30s %15d %15d %15d %15s%n", "total # of log records",
                totalCountExpected, totalCountLog, totalCountGz,
                ((totalCountExpected == totalCountLog && totalCountLog == totalCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n", "# of INFO level records",
                infoCountExpected, infoCountLog, infoCountGz,
                ((infoCountExpected == infoCountLog && infoCountLog == infoCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n", "# of WARN level records",
                warnCountExpected, warnCountLog, warnCountGz,
                ((warnCountExpected == warnCountLog && warnCountLog == warnCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n%n%n", "# of ERROR level records",
                errorCountExpected, errorCountLog, errorCountGz,
                ((errorCountExpected == errorCountLog && errorCountLog == errorCountGz) ? "SUCCESS" : "FAILURE"));
    }
}