当前位置: 首页>>代码示例>>Java>>正文


Java Vector类代码示例

本文整理汇总了Java中org.apache.spark.mllib.linalg.Vector的典型用法代码示例。如果您正苦于以下问题：Java Vector类的具体用法？Java Vector怎么用？Java Vector使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


Vector类属于org.apache.spark.mllib.linalg包,在下文中一共展示了Vector类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: fetchClusterMetrics

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Assigns every evaluation point to its nearest cluster center and aggregates the
 * per-point metrics by cluster.
 *
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys; each value is the {@code ClusterMetric::add} combination of
 *  one metric per point built from a count of 1, the distance to the closest center,
 *  and that distance squared
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  JavaPairRDD<Integer,ClusterMetric> metricPerPoint = evalData.mapToPair(point -> {
    double[] coords = point.toArray();
    int nearestID = Integer.MIN_VALUE;
    double nearestDist = Double.POSITIVE_INFINITY;
    for (ClusterInfo candidate : clusters.values()) {
      double dist = distanceFn.applyAsDouble(candidate.getCenter(), coords);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearestID = candidate.getID();
      }
    }
    // Fails when no candidate produced a finite distance (for instance, no clusters at all)
    Preconditions.checkState(!(Double.isInfinite(nearestDist) || Double.isNaN(nearestDist)));
    return new Tuple2<>(nearestID, new ClusterMetric(1L, nearestDist, nearestDist * nearestDist));
  });
  return metricPerPoint.reduceByKey(ClusterMetric::add);
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:22,代码来源:AbstractKMeansEvaluation.java

示例2: buildModel

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Trains a k-means model on the supplied data and returns it as PMML.
 *
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model;
 *                        element 0 is read as the number of clusters and must be greater than 1
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int k = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(k > 1);
  log.info("Building KMeans Model with {} clusters", k);

  // Parse the raw lines and turn them into feature vectors
  JavaRDD<Vector> vectors = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));

  KMeansModel model =
      KMeans.train(vectors.rdd(), k, maxIterations, numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(model, fetchClusterCountsFromModel(vectors, model));
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:23,代码来源:KMeansUpdate.java

示例3: hellingerDistance

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Calculate the Hellinger distance between two vectors.
 *
 * <p>The result is {@code sqrt(sum_i (sqrt(a_i) - sqrt(b_i))^2) / sqrt(2)}, i.e. the
 * Euclidean norm of the difference of the element-wise square roots, scaled by
 * {@code 1/sqrt(2)}. Smaller values mean the vectors are closer.
 *
 * <p>The loop runs over the length of {@code vecA}, so {@code vecB} must have at least as
 * many elements. A negative component makes {@link Math#sqrt(double)} return NaN, which
 * propagates to the result.
 *
 * @param vecA initial vector from which to calculate the distance
 * @param vecB second vector involved in the distance calculation
 * @return Hellinger distance between the two vectors
 */
public static double hellingerDistance(Vector vecA, Vector vecB) {
  double[] arrA = vecA.toArray();
  double[] arrB = vecB.toArray();

  double sumOfSquares = 0.0;

  int arrsize = arrA.length;
  for (int i = 0; i < arrsize; i++) {
    double sqrtDiff = Math.sqrt(arrA[i]) - Math.sqrt(arrB[i]);
    sumOfSquares += sqrtDiff * sqrtDiff;
  }

  // The square root of the accumulated sum was previously missing, so the value returned
  // was neither the Hellinger distance nor its square.
  return Math.sqrt(sumOfSquares) / Math.sqrt(2);
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:26,代码来源:SimilarityUtil.java

示例4: pearsonDistance

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Calculate a co-occurrence similarity between two vectors based on which components
 * are positive.
 *
 * <p>With {@code viewA} and {@code viewB} the number of positive components in each vector
 * and {@code viewAB} the number of positions where both are positive, the result is
 * {@code viewAB / (sqrt(viewA) * sqrt(viewB))}. Only the sign of each component matters;
 * magnitudes are ignored.
 *
 * <p>The loop runs over the length of {@code vecA}, so {@code vecB} must have at least as
 * many elements.
 *
 * @param vecA initial vector from which to calculate a similarity
 * @param vecB second vector involved in similarity calculation
 * @return similarity between two vectors; {@code 0.0} when either vector has no positive
 *         component
 */
public static double pearsonDistance(Vector vecA, Vector vecB) {
  double[] arrA = vecA.toArray();
  double[] arrB = vecB.toArray();

  int viewA = 0;
  int viewB = 0;
  int viewAB = 0;

  int arrsize = arrA.length;
  for (int i = 0; i < arrsize; i++) {
    boolean inA = arrA[i] > 0;
    boolean inB = arrB[i] > 0;
    if (inA) {
      viewA++;
    }
    if (inB) {
      viewB++;
    }
    if (inA && inB) {
      viewAB++;
    }
  }

  // Without this guard the expression below evaluates 0 / 0.0 and yields NaN.
  if (viewA == 0 || viewB == 0) {
    return 0.0;
  }
  return viewAB / (Math.sqrt(viewA) * Math.sqrt(viewB));
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:32,代码来源:SimilarityUtil.java

示例5: getSVDMatrix

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * GetSVDMatrix: loads a keyed matrix from a CSV file, applies TF-IDF weighting and SVD
 * to it, and exports the result to another CSV file.
 *
 * @param csvFileName       each row is a term, and each column is a document.
 * @param svdDimention      Dimension of SVD matrix; also the number of generated column
 *                          labels ("dimension0", "dimension1", ...)
 * @param svdMatrixFileName CSV file name of SVD matrix
 */
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {

  JavaPairRDD<String, Vector> keyedRows = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);

  // raw matrix -> TF-IDF weighted matrix -> SVD matrix
  RowMatrix rawMatrix = new RowMatrix(keyedRows.values().rdd());
  RowMatrix weightedMatrix = MatrixUtil.createTFIDFMatrix(rawMatrix);
  RowMatrix reducedMatrix = MatrixUtil.buildSVDMatrix(weightedMatrix, svdDimention);

  List<String> rowLabels = keyedRows.keys().collect();

  List<String> columnLabels = new ArrayList<>();
  int dim = 0;
  while (dim < svdDimention) {
    columnLabels.add("dimension" + dim);
    dim++;
  }

  MatrixUtil.exportToCSV(reducedMatrix, rowLabels, columnLabels, svdMatrixFileName);
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:23,代码来源:SVDAnalyzer.java

示例6: generateGaussianMixtureModel

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Trains a Gaussian mixture model on the given data, applying only the settings that
 * were explicitly configured.
 *
 * <p>K, max iterations, convergence tolerance and seed are applied unless their getter
 * returns -1; the initial model is applied unless its getter returns {@code null}.
 * Settings that are skipped keep whatever value the new {@code GaussianMixture} starts with.
 * After training, the algorithm settings are recorded on the summary object.
 *
 * @param parsedData                        vectors to fit the mixture on
 * @param gaussianMixtureDetectionAlgorithm source of the optional settings
 * @param gaussianMixtureModelSummary       summary that receives the settings used
 * @return the trained model
 */
public GaussianMixtureModel generateGaussianMixtureModel(JavaRDD<Vector> parsedData,
                                                         GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm,
                                                         GaussianMixtureModelSummary gaussianMixtureModelSummary) {
    final GaussianMixtureDetectionAlgorithm settings = gaussianMixtureDetectionAlgorithm;
    final GaussianMixture trainer = new GaussianMixture();

    if (settings.getK() != -1) {
        trainer.setK(settings.getK());
    }
    if (settings.getMaxIterations() != -1) {
        trainer.setMaxIterations(settings.getMaxIterations());
    }
    if (settings.getConvergenceTol() != -1) {
        trainer.setConvergenceTol(settings.getConvergenceTol());
    }
    if (settings.getInitializedModel() != null) {
        trainer.setInitialModel(settings.getInitializedModel());
    }
    if (settings.getSeed() != -1) {
        trainer.setSeed(settings.getSeed());
    }

    final GaussianMixtureModel trained = trainer.run(parsedData);
    gaussianMixtureModelSummary.setGaussianMixtureDetectionAlgorithm(settings);
    return trained;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:25,代码来源:GaussianMixtureDistJob.java

示例7: evaluate

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Computes the ratio of the smallest centroid-to-centroid distance to the largest
 * per-cluster mean distance to centroid.
 *
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Denominator: the largest mean distance from a cluster's points to its centroid
  double largestMeanDist =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();

  // Numerator: the smallest distance between any two distinct centroids
  List<ClusterInfo> centroids = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> metric = getDistanceFn();
  int count = centroids.size();
  double smallestGap = Double.POSITIVE_INFINITY;
  for (int first = 0; first < count; first++) {
    double[] firstCenter = centroids.get(first).getCenter();
    // d(i,j) == d(j,i), so only pairs after the current centroid need to be visited
    for (ClusterInfo other : centroids.subList(first + 1, count)) {
      smallestGap = Math.min(smallestGap, metric.applyAsDouble(firstCenter, other.getCenter()));
    }
  }

  return smallestGap / largestMeanDist;
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:27,代码来源:DunnIndex.java

示例8: fetchClusteredPoints

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Groups the evaluation points by the ID of their nearest cluster.
 *
 * @param evalData points to assign to clusters
 * @return cluster IDs as keys, each with the points (as arrays) whose closest center
 *  belongs to that cluster
 */
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<Vector> evalData) {
  JavaPairRDD<Integer, double[]> pointsByCluster = evalData.mapToPair(point -> {
    double[] coords = point.toArray();
    DistanceFn<double[]> metric = getDistanceFn();
    Map<Integer,ClusterInfo> byID = getClustersByID();
    int nearestID = Integer.MIN_VALUE;
    double nearestDist = Double.POSITIVE_INFINITY;
    for (ClusterInfo candidate : byID.values()) {
      double dist = metric.applyAsDouble(candidate.getCenter(), coords);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearestID = candidate.getID();
      }
    }
    // Fails when no candidate produced a finite distance (for instance, no clusters at all)
    Preconditions.checkState(!(Double.isInfinite(nearestDist) || Double.isNaN(nearestDist)));
    return new Tuple2<>(nearestID, coords);
  });
  return pointsByCluster.groupByKey();
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:19,代码来源:SilhouetteCoefficient.java

示例9: createDF

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Builds a two-column DataFrame from (vector, title) pairs.
 *
 * @param rdd pairs whose first element fills the "vectorized_count" column and whose
 *            second element fills the "product_title" column
 * @return a DataFrame over the given pairs with both columns declared nullable
 */
public DataFrame createDF(JavaRDD<Tuple2<Vector, String>> rdd) {

        // Schema: a vector column followed by a string column
        StructField countColumn = DataTypes.createStructField("vectorized_count", new VectorUDT(), true);
        StructField titleColumn = DataTypes.createStructField("product_title", DataTypes.StringType, true);
        List<StructField> columns = new ArrayList<StructField>();
        columns.add(countColumn);
        columns.add(titleColumn);
        StructType layout = DataTypes.createStructType(columns);

        // Turns each (vector, title) pair into a Row with the same field order as the schema
        Function<Tuple2<Vector, String>, Row> pairToRow =
                new Function<Tuple2<Vector, String>, Row>() {
                    public Row call(Tuple2<Vector, String> pair) {
                        return RowFactory.create(pair._1(), pair._2());
                    }
                };
        JavaRDD<Row> rows = rdd.map(pairToRow);

        return sqlContext.createDataFrame(rows, layout);
    }
 
开发者ID:flipkart-incubator,项目名称:spark-transformers,代码行数:19,代码来源:IfZeroVectorBridgeTest.java

示例10: transform

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Converts a labelled bag of words into a labelled sparse TF-IDF feature vector.
 *
 * <p>Each word found in {@code idf} contributes one entry: the first element of the
 * looked-up tuple is used as the feature index and the second as the document frequency.
 * Words missing from {@code idf} are skipped.
 *
 * @param doc label paired with the word counts of one document
 * @return the label together with a sparse vector of size {@code featuresCount}
 */
public LabeledPoint transform(Tuple2<Double, Multiset<String>> doc) {
	double label = doc._1();
	List<Tuple2<Integer, Double>> vector = new ArrayList<>();
	for (Multiset.Entry<String> entry : doc._2().entrySet()) {
		String word = entry.getElement();
		int tf = entry.getCount();

		Tuple2<Integer, Long> wordInfo = idf.get(word);
		if (wordInfo != null) {
			// The feature index is the Integer half of the tuple. Reading _2() here as well
			// made the index equal to the document frequency, so distinct words with the
			// same frequency landed on the same slot.
			int index = wordInfo._1();
			int numDocs = (int) this.newsCount;
			int df = wordInfo._2().intValue();

			double tfidf = this.calculate(tf, df, numDocs);

			vector.add(new Tuple2<>(index, tfidf));
		}
	}
	Vector features = Vectors.sparse((int) featuresCount, vector);

	return new LabeledPoint(label, features);
}
 
开发者ID:mhardalov,项目名称:news-credibility,代码行数:23,代码来源:TFIDFTransform.java

示例11: main

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Runs k-means with two clusters on "data/kmeans_data.txt" using a local Spark context,
 * then prints the cluster centers and the clustering cost.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {

    String inputFile = "data/kmeans_data.txt";
    int k = 2; // two clusters
    int iterations = 10;
    int runs = 1;

    JavaSparkContext sc = new JavaSparkContext("local", "JavaKMeans");
    try {
      JavaRDD<String> lines = sc.textFile(inputFile);

      JavaRDD<Vector> points = lines.map(new ParsePoint());

      KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.K_MEANS_PARALLEL());

      System.out.println("Cluster centers:");
      for (Vector center : model.clusterCenters()) {
        System.out.println(" " + center);
      }
      double cost = model.computeCost(points.rdd());
      System.out.println("Cost: " + cost);
    } finally {
      // Stop the context even when reading, parsing or training throws.
      sc.stop();
    }
  }
 
开发者ID:mark-watson,项目名称:power-java,代码行数:24,代码来源:JavaKMeans.java

示例12: printClusterIndex

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Prints the title of every document in the input file that the model assigns to the
 * given cluster.
 *
 * <p>Each line is split on a tab; the first field is printed as the title and the second
 * is tokenized and vectorized for prediction.
 *
 * @param clusterIndex cluster whose documents should be listed
 * @param model        trained model used to predict the cluster of each document
 * @throws IOException if the input file cannot be opened
 */
static private void printClusterIndex(int clusterIndex, KMeansModel model) throws IOException {
  System.out.println("\nDOCUMENTS IN CLUSTER INDEX " + clusterIndex + "\n");
  // re-read each "document" (single line in input file)
  // and predict which cluster it belongs to.
  // try-with-resources closes the file even if processing a line throws.
  try (Stream<String> lines2 = Files.lines(Paths.get("data", input_file))) {
    lines2.forEach(s ->
        {
          String[] parts = s.split("\t");
          String[] tok = tokenizeAndRemoveNoiseWords(parts[1]);
          Vector v = sparseVectorGenerator.tokensToSparseVector(tok);
          int best_cluster_index = model.predict(v);
          if (best_cluster_index == clusterIndex) {
            System.out.println("   Article title: " + parts[0]);
          }
        }
    );
  }
}
 
开发者ID:mark-watson,项目名称:power-java,代码行数:18,代码来源:WikipediaKMeans.java

示例13: tokensToSparseVector

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Converts tokens into a sparse presence vector of size {@code MAX_WORDS}.
 *
 * <p>Each token is stemmed; stems that are noise words or fail {@code validWord} are
 * dropped. A stem seen for the first time is given the next free index in
 * {@code wordMap}. The resulting vector holds 1.0 at the index of every distinct
 * accepted stem.
 *
 * @param tokens tokens of one document
 * @return sparse vector with one entry per distinct accepted stem
 */
public Vector tokensToSparseVector(String[] tokens) {
  // Vectors.sparse expects strictly increasing indices, so collect them sorted and
  // without duplicates. (Fully qualified to avoid depending on extra imports.)
  java.util.SortedSet<Integer> indices = new java.util.TreeSet<Integer>();
  for (String token : tokens) {
    String stem = Stemmer.stemWord(token);
    if (!noiseWords.contains(stem) && validWord(stem)) {
      if (!wordMap.containsKey(stem)) {
        wordMap.put(stem, startingWordIndex++);
      }
      indices.add(wordMap.get(stem));
    }
  }

  // Size the arrays to the entries actually present instead of padding them to
  // MAX_WORDS with zero indices, which repeated index 0 and overflowed on long inputs.
  int[] ind = new int[indices.size()];
  double[] vals = new double[indices.size()];
  int pos = 0;
  for (int index : indices) {
    ind[pos] = index;
    vals[pos] = 1d;
    pos++;
  }
  return Vectors.sparse(MAX_WORDS, ind, vals);
}
 
开发者ID:mark-watson,项目名称:power-java,代码行数:23,代码来源:TextToSparseVector.java

示例14: convertRealMatrixToSparkRowMatrix

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * <p>Every row of the input becomes one dense vector of the resulting row matrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @param numSlices value handed to {@code sc.parallelize} when distributing the rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final LinkedList<Vector> denseRows = new LinkedList<>();
    for (final double[] rowValues : realMat.getData()) {
        denseRows.add(Vectors.dense(rowValues));
    }

    // The slice count comes from the caller. Deriving it from the matrix size
    // (rows * columns * Double.BYTES, aiming at roughly 100KB per partition) was
    // considered, but seemed to slow the conversion down.
    final JavaRDD<Vector> distributedRows = sc.parallelize(denseRows, numSlices);

    final RowMatrix distributed = new RowMatrix(distributedRows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return distributed;
}
 
开发者ID:broadinstitute,项目名称:gatk-protected,代码行数:28,代码来源:SparkConverter.java

示例15: convertSparkRowMatrixToRealMatrix

import org.apache.spark.mllib.linalg.Vector; //导入依赖的package包/类
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * <p>All rows are collected and copied, in collection order, into a new matrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming.  Provide the value here, if it is already known.
 *                      Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {

    Utils.nonNull(r, "Input row matrix cannot be null");

    // Asking Spark for the row count takes a while, so do it only when the caller has no cached value
    final int rowCount = (cachedNumRows == -1) ? (int) r.numRows() : cachedNumRows;
    final int colCount = (int) r.numCols();

    // This cast is required, even though it would not seem necessary, at first.  Exact reason why is unknown.
    //   Will fail compilation if the cast is removed.
    final Vector [] collected = (Vector []) r.rows().collect();

    final RealMatrix copy = new Array2DRowRealMatrix(rowCount, colCount);
    int rowIndex = 0;
    while (rowIndex < rowCount) {
        copy.setRow(rowIndex, collected[rowIndex].toArray());
        rowIndex++;
    }
    return copy;
}
 
开发者ID:broadinstitute,项目名称:gatk-protected,代码行数:33,代码来源:SparkConverter.java


注:本文中的org.apache.spark.mllib.linalg.Vector类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。