当前位置: 首页>>代码示例>>Java>>正文


Java JavaRDD.count方法代码示例

本文整理汇总了Java中org.apache.spark.api.java.JavaRDD.count方法的典型用法代码示例。如果您正苦于以下问题：Java JavaRDD.count方法的具体用法？Java JavaRDD.count怎么用？Java JavaRDD.count使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.spark.api.java.JavaRDD的用法示例。


在下文中一共展示了JavaRDD.count方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: partition

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Redistributes instances across partitions according to the configured partitioning
 * (e.g. by instance type).
 *
 * @param instances RDD of instances to partition
 * @return the repartitioned RDD when repartitioning by type is enabled in the config,
 *         otherwise the input RDD unchanged
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (config.isRepartitionByType()) {
        log.info("Getting counts by type hash");
        Map<Integer, Long> countsByTypeHash = getApproximateTypeHashCounts(instances);
        int partitionCount = instances.getNumPartitions();
        // count() runs a Spark job over the whole RDD; the +1 keeps the quota above zero.
        long perPartitionQuota = instances.count() / partitionCount + 1;

        // Key every instance by its type hash offset by the increment from getSplitIncrement.
        JavaPairRDD<Integer, Instance> keyedInstances = instances.mapToPair(instance -> {
            int typeHash = getTypeHash(instance);
            int increment = getSplitIncrement(instance.getId(), countsByTypeHash.get(typeHash), perPartitionQuota);
            return new Tuple2<>(typeHash + increment, instance);
        });

        log.info("Partitioning instances by type");
        HashPartitioner partitioner = new HashPartitioner(partitionCount);
        return keyedInstances.partitionBy(partitioner).values();
    }
    return instances;
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:28,代码来源:InstancePartitioner.java

示例2: fetchSampleData

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Returns the evaluation data, sampled down when it holds more than
 * {@code MAX_SAMPLE_SIZE} elements.
 *
 * @param evalData data to (possibly) sample
 * @return {@code evalData} itself when its count does not exceed {@code MAX_SAMPLE_SIZE};
 *         otherwise a sample drawn without replacement using the fraction
 *         {@code MAX_SAMPLE_SIZE / count}
 */
static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
  long total = evalData.count();
  if (total <= MAX_SAMPLE_SIZE) {
    return evalData;
  }
  double fraction = (double) MAX_SAMPLE_SIZE / total;
  return evalData.sample(false, fraction);
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:9,代码来源:SilhouetteCoefficient.java

示例3: accuracy

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Computes the fraction of examples whose most probable predicted category encoding
 * equals the encoding of the example's target.
 *
 * @param forest forest used to predict each example
 * @param examples examples to score
 * @return correct predictions divided by the number of examples, or 0.0 when there are
 *         no examples
 */
static double accuracy(DecisionForest forest, JavaRDD<Example> examples) {
  long exampleCount = examples.count();
  if (exampleCount != 0) {
    long hitCount = examples.filter(candidate -> {
        CategoricalPrediction predicted = (CategoricalPrediction) forest.predict(candidate);
        CategoricalFeature actual = (CategoricalFeature) candidate.getTarget();
        boolean matches = predicted.getMostProbableCategoryEncoding() == actual.getEncoding();
        return matches;
      }).count();
    return (double) hitCount / exampleCount;
  }
  // Nothing to score: avoid dividing by zero.
  return 0.0;
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:13,代码来源:Evaluation.java

示例4: evaluate

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Mock evaluation: counts the test data, records that count (narrowed to {@code int})
 * in {@code testCounts}, logs it, and returns it as the evaluation score.
 *
 * @param sparkContext not used by this implementation
 * @param model not used by this implementation
 * @param modelParentPath not used by this implementation
 * @param testData data whose element count becomes the evaluation result
 * @param trainData not used by this implementation
 * @return the number of elements in {@code testData}
 */
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
  final long numTestRecords = testData.count();
  testCounts.add((int) numTestRecords);
  log.info("Returning eval {}", numTestRecords);
  // Implicit long -> double widening on return.
  return numTestRecords;
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:12,代码来源:MockMLUpdate.java

示例5: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Trains a random forest classifier on a LibSVM-format file with a local Spark context,
 * evaluates it on a held-out split and prints the test error and the learned model.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {

		SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any");

		// JavaSparkContext is Closeable: try-with-resources guarantees the context is
		// stopped even when loading, training or evaluation throws.
		try (JavaSparkContext sc = new JavaSparkContext(configuration)) {

			// Load and parse the data file.
			String input = "data/rf-data.txt";
			JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD();
			// Split the data into training and test sets (30% held out for testing)
			JavaRDD<LabeledPoint>[] dataSplits = data.randomSplit(new double[]{0.7, 0.3});
			JavaRDD<LabeledPoint> trainingData = dataSplits[0];
			JavaRDD<LabeledPoint> testData = dataSplits[1];

			// Train a RandomForest model.
			int numClasses = 2;
			// Empty categoricalFeaturesInfo indicates all features are continuous.
			HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
			int numTrees = 3; // Use more in practice.
			String featureSubsetStrategy = "auto"; // Let the algorithm choose.
			String impurity = "gini";
			int maxDepth = 5;
			int maxBins = 32;
			int seed = 12345;

			final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses,
					categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
					seed);

			// Evaluate model on test instances: pair each prediction with its true label.
			JavaPairRDD<Double, Double> label =
					testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
						@Override
						public Tuple2<Double, Double> call(LabeledPoint p) {
							return new Tuple2<Double, Double>(rfModel.predict(p.features()), p.label());
						}
					});

			// Test error = mispredicted / total; the 1.0 factor forces floating-point division.
			double testError =
					1.0 * label.filter(new Function<Tuple2<Double, Double>, Boolean>() {
						@Override
						public Boolean call(Tuple2<Double, Double> pl) {
							return !pl._1().equals(pl._2());
						}
					}).count() / testData.count();

			System.out.println("Test Error: " + testError);
			System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
		}
	}
 
开发者ID:PacktPublishing,项目名称:Java-Data-Science-Cookbook,代码行数:46,代码来源:RandomForestMlib.java

示例6: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Trains a decision tree classifier on serialized IDX data with a local Spark context,
 * then writes the test error, the model dump, a confusion matrix and the training time
 * to files under {@code results/}.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {

        Logger.getLogger("org").setLevel(Level.WARN);

        SparkConf sparkConf = new SparkConf()
                .setAppName("ExampleSpark")
                .setMaster("local");

        // JavaSparkContext is Closeable: try-with-resources guarantees the context is
        // stopped even when data loading, training or result writing throws.
        try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {

            IdxManager idx = IOUtils.deserialize("data/idx.ser");
            IdxManager idxTest = IOUtils.deserialize("data/idx-test.ser");
            double[][] inputs = idx.getData();
            double[] outputs = idx.getLabelsVec();
            double[][] inputsTest = idxTest.getData();
            double[] outputsTest = idxTest.getLabelsVec();
            inputs = HogManager.exportDataFeatures(inputs, idx.getNumOfRows(),
                    idx.getNumOfCols());
            // NOTE(review): the test features are exported with the TRAINING set's row/column
            // counts (idx, not idxTest) - presumably both sets share dimensions; confirm.
            inputsTest = HogManager.exportDataFeatures(inputsTest, idx.getNumOfRows(),
                    idx.getNumOfCols());

            List<LabeledPoint> pointList = new ArrayList<>();
            for (int i = 0; i < outputs.length; i++) {
                pointList.add(new LabeledPoint(outputs[i], Vectors.dense(inputs[i])));
            }

            List<LabeledPoint> pointListTest = new ArrayList<>();
            for (int i = 0; i < outputsTest.length; i++) {
                pointListTest.add(new LabeledPoint(outputsTest[i],
                        Vectors.dense(inputsTest[i])));
            }

            JavaRDD<LabeledPoint> trainingData = jsc.parallelize(pointList);
            JavaRDD<LabeledPoint> testData = jsc.parallelize(pointListTest);

            // Set parameters.
            // Empty categoricalFeaturesInfo indicates all features are continuous.
            int numClasses = 10;
            Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
            String impurity = "gini";
            int maxDepth = 10;
            int maxBins = 256;

            // Train a DecisionTree model for classification, timing the training call.
            long startTime = System.currentTimeMillis();
            final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData,
                    numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
            long endTime = System.currentTimeMillis();
            long learnTime = endTime - startTime;

            // Evaluate model on test instances and compute test error
            JavaPairRDD<Double, Double> predictionAndLabel =
                    testData.mapToPair(
                            p -> new Tuple2<>(model.predict(p.features()), p.label()));
            // The 1.0 factor forces floating-point division of the two counts.
            double testErr = 1.0 * predictionAndLabel.filter(
                    pl -> !pl._1().equals(pl._2())).count() / testData.count();

            // results
            new File("results").mkdir();
            IOUtils.writeStr("results/dtree_error.data", Double.toString(testErr));
            IOUtils.writeStr("results/dtree_model.data", model.toDebugString());

            // Predict each test row and convert the prediction with valToVec for the
            // confusion matrix.
            double[][] outFinal = new double[outputsTest.length][];
            for (int i = 0; i < outputsTest.length; i++) {
                outFinal[i] = valToVec(model.predict(Vectors.dense(inputsTest[i])));
            }

            ConfusionMatrix cm = new ConfusionMatrix(outFinal, idxTest.getLabels());
            cm.writeClassErrorMatrix("results/confusion_matrix.data");
            IOUtils.writeStr("results/learn_time_ms.data", Long.toString(learnTime));
        }
    }
 
开发者ID:lukago,项目名称:neural-algorithms,代码行数:82,代码来源:ExampleSpark.java

示例7: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Generates test log files in HDFS, reads them back through {@code LogfileInputFormat}
 * (both {@code *.log} and gzipped variants) and prints a table comparing the record
 * counts per log level against the expected counts from the generated summary.
 *
 * <p>Exits with status 1 when the arguments cannot be parsed, or when the target
 * directory exists in HDFS but is not a directory or is not empty.
 *
 * @param args command-line arguments, interpreted by {@code parseArguments}
 * @throws IOException if an HDFS operation fails
 */
public static void main(String[] args) throws IOException {

        final Configuration hadoopConfig = new Configuration(true);
        hdfs = FileSystem.get(hadoopConfig);

        if (!parseArguments(args)) {
            printHelp();
            System.exit(1);
        }

        // Refuse to write into anything but a missing or empty directory.
        if (hdfs.exists(directory)) {
            if (!hdfs.isDirectory(directory)) {
                System.out.printf("'%s' exists in HDFS, but is not a directory!%n", directory);
                System.exit(1);
            }
            FileStatus[] fileStatus = hdfs.listStatus(directory);
            if (fileStatus.length > 0) {
                System.out.printf("'%s' exists in HDFS, but is not empty!%n", directory);
                System.exit(1);
            }
        }

        createDirectories();

        System.out.printf("Creating test data in '%s'. This may take a while...%n", directory.toString());

        Map<String, LogfileType> logfileTypeByPath = new HashMap<>();

        LogfileSummary summary = writeLogFiles(logfileTypeByPath);

        SparkConf sparkConfig = new SparkConf().setAppName("Testing LogfileInputFormat.");

        // try-with-resources closes the Spark context even when one of the jobs below throws;
        // previously close() was only reached on the success path.
        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig)) {

            // Register a first-line pattern per generated file, plus a pattern without a path.
            logfileTypeByPath.forEach((path, type) -> {
                LogfileInputFormat.setPattern(hadoopConfig, path, type.getFirstlinePattern());
            });
            LogfileInputFormat.setPattern(hadoopConfig, LogfileType.A.getFirstlinePattern());

            JavaPairRDD<Tuple2<Path, Long>, Text> rdd;
            JavaRDD<Tuple2<LocalDateTime, LogLevel>> logRecords;

            rdd = sparkContext.newAPIHadoopFile(logDir + "/*" + FILE_EXT_LOG, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

            Function<Tuple2<Tuple2<Path, Long>, Text>, Tuple2<LocalDateTime, LogLevel>> mappingFunction = mappingFunction(logfileTypeByPath);

            // Cached because the same records are counted four times below.
            logRecords = rdd.map(mappingFunction).cache();
            long totalCountLog = logRecords.count();
            long infoCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
            long warnCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
            long errorCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

            rdd = sparkContext.newAPIHadoopFile(logDirGz + "/*" + FILE_EXT_GZ, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

            logRecords = rdd.map(mappingFunction).cache();
            long totalCountGz = logRecords.count();
            long infoCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
            long warnCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
            long errorCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

            long totalCountExpected = summary.getRecordCount();
            long infoCountExpected = summary.getRecordCount(LogLevel.INFO);
            long warnCountExpected = summary.getRecordCount(LogLevel.WARN);
            long errorCountExpected = summary.getRecordCount(LogLevel.ERROR);

            // A row is a SUCCESS only when expected, *.log and *.log.gz counts all agree.
            System.out.printf("%n%n%n%30s %15s %15s %15s %15s%n%n", "", "expected", "from *.log", "from *.log.gz", "test result");
            System.out.printf("%30s %15d %15d %15d %15s%n", "total # of log records",
                    totalCountExpected, totalCountLog, totalCountGz,
                    ((totalCountExpected == totalCountLog && totalCountLog == totalCountGz) ? "SUCCESS" : "FAILURE"));
            System.out.printf("%30s %15d %15d %15d %15s%n", "# of INFO level records",
                    infoCountExpected, infoCountLog, infoCountGz,
                    ((infoCountExpected == infoCountLog && infoCountLog == infoCountGz) ? "SUCCESS" : "FAILURE"));
            System.out.printf("%30s %15d %15d %15d %15s%n", "# of WARN level records",
                    warnCountExpected, warnCountLog, warnCountGz,
                    ((warnCountExpected == warnCountLog && warnCountLog == warnCountGz) ? "SUCCESS" : "FAILURE"));
            System.out.printf("%30s %15d %15d %15d %15s%n%n%n", "# of ERROR level records",
                    errorCountExpected, errorCountLog, errorCountGz,
                    ((errorCountExpected == errorCountLog && errorCountLog == errorCountGz) ? "SUCCESS" : "FAILURE"));
        }
    }
 
开发者ID:comdirect,项目名称:hadoop-logfile-inputformat,代码行数:81,代码来源:Test.java


注:本文中的org.apache.spark.api.java.JavaRDD.count方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。