This article collects typical usage examples of the Java class org.apache.spark.mllib.linalg.Vector. If you are wondering what the Vector class does, how to use it, or what example code looks like, the curated code samples below may help.
The Vector class belongs to the org.apache.spark.mllib.linalg package. The 15 code examples below are sorted by popularity by default.
Example 1: fetchClusterMetrics
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances to centroid,
 *  and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
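The ClusterMetric value class is not included in this listing. A minimal sketch of what the reduceByKey combiner above assumes (field and accessor names are assumptions, chosen to match the usage in Examples 1 and 7) could look like this:

// Hypothetical sketch of the ClusterMetric value object; the real class in the
// source project may differ.
public final class ClusterMetric implements java.io.Serializable {
  private final long count;            // number of points assigned to the cluster
  private final double sumDist;        // sum of distances to the centroid
  private final double sumSquaredDist; // sum of squared distances to the centroid

  public ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }

  // Associative combiner used above as the method reference in reduceByKey(ClusterMetric::add)
  public ClusterMetric add(ClusterMetric other) {
    return new ClusterMetric(count + other.count,
        sumDist + other.sumDist,
        sumSquaredDist + other.sumSquaredDist);
  }

  // Mean distance to centroid, as used by the Dunn Index evaluation in Example 7
  public double getMeanDist() {
    return sumDist / count;
  }
}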
Example 2: buildModel
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * @param sparkContext active Spark Context
 * @param trainData training data on which to build a model
 * @param hyperParameters ordered list of hyperparameter values to use in building the model
 * @param candidatePath directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);
  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
      numberOfRuns, initializationStrategy);
  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
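The helpers parsedToVectorRDD and fetchClusterCountsFromModel are not shown in this listing. A plausible sketch of the latter, assuming it simply assigns every training vector to its nearest cluster and counts per cluster ID, might be:

// Hypothetical sketch; the project's actual helper may differ.
private static Map<Integer, Long> fetchClusterCountsFromModel(JavaRDD<Vector> trainingData,
                                                              KMeansModel model) {
  return trainingData
      .map(model::predict)   // nearest cluster ID for each vector
      .countByValue();       // cluster ID -> number of assigned points
}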
Example 3: hellingerDistance
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * Calculate similarity (Hellinger Distance) between vectors
 *
 * @param vecA initial vector from which to calculate a similarity
 * @param vecB second vector involved in similarity calculation
 * @return similarity between two vectors
 */
public static double hellingerDistance(Vector vecA, Vector vecB) {
  double[] arrA = vecA.toArray();
  double[] arrB = vecB.toArray();
  double sim = 0.0;
  int arrsize = arrA.length;
  for (int i = 0; i < arrsize; i++) {
    double a = arrA[i];
    double b = arrB[i];
    double sqrtDiff = Math.sqrt(a) - Math.sqrt(b);
    sim += sqrtDiff * sqrtDiff;
  }
  // Note: this returns the sum of squared square-root differences scaled by 1/sqrt(2);
  // no final square root is applied.
  sim = sim / Math.sqrt(2);
  return sim;
}
Example 4: pearsonDistance
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * Calculate similarity (Pearson Distance) between vectors
 *
 * @param vecA initial vector from which to calculate a similarity
 * @param vecB second vector involved in similarity calculation
 * @return similarity between two vectors
 */
public static double pearsonDistance(Vector vecA, Vector vecB) {
  double[] arrA = vecA.toArray();
  double[] arrB = vecB.toArray();
  // Counts of entries greater than zero in vecA, in vecB, and in both.
  int viewA = 0;
  int viewB = 0;
  int viewAB = 0;
  int arrsize = arrA.length;
  for (int i = 0; i < arrsize; i++) {
    if (arrA[i] > 0) {
      viewA++;
    }
    if (arrB[i] > 0) {
      viewB++;
    }
    if (arrB[i] > 0 && arrA[i] > 0) {
      viewAB++;
    }
  }
  return viewAB / (Math.sqrt(viewA) * Math.sqrt(viewB));
}
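A small call-site sketch for the two helpers from Examples 3 and 4 (the vector values are arbitrary illustrative numbers, and the snippet assumes the two static methods are in scope):

// Illustrative call site; assumes the two static helpers above are imported or in scope.
Vector a = Vectors.dense(0.1, 0.4, 0.5); // e.g. topic weights for document A
Vector b = Vectors.dense(0.2, 0.3, 0.5); // e.g. topic weights for document B
double hd = hellingerDistance(a, b);
double pd = pearsonDistance(a, b);
System.out.println("Hellinger-style distance: " + hd + ", Pearson-style similarity: " + pd);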
Example 5: getSVDMatrix
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * GetSVDMatrix: create an SVD matrix CSV file from the original CSV file.
 *
 * @param csvFileName CSV file in which each row is a term and each column is a document
 * @param svdDimention dimension of the SVD matrix
 * @param svdMatrixFileName CSV file name of the SVD matrix
 */
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {
  JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);
  JavaRDD<Vector> vectorRDD = importRDD.values();
  RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd());
  RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix);
  RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention);
  List<String> rowKeys = importRDD.keys().collect();
  List<String> colKeys = new ArrayList<>();
  for (int i = 0; i < svdDimention; i++) {
    colKeys.add("dimension" + i);
  }
  MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName);
}
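MatrixUtil.buildSVDMatrix is not part of this listing. A plausible sketch, assuming it projects the TF-IDF matrix onto its top singular vectors with RowMatrix.computeSVD, could be:

import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.SingularValueDecomposition;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;

// Hypothetical sketch; the project's real MatrixUtil.buildSVDMatrix may differ.
public static RowMatrix buildSVDMatrix(RowMatrix tfidfMatrix, int dimension) {
  // Top-k singular value decomposition A ~ U * S * V^T, keeping U.
  SingularValueDecomposition<RowMatrix, Matrix> svd =
      tfidfMatrix.computeSVD(dimension, true, 1.0e-9);
  // Each row expressed in the reduced k-dimensional space as U * S.
  return svd.U().multiply(Matrices.diag(svd.s()));
}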
Example 6: generateGaussianMixtureModel
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
public GaussianMixtureModel generateGaussianMixtureModel(JavaRDD<Vector> parsedData,
    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm,
    GaussianMixtureModelSummary gaussianMixtureModelSummary) {
  GaussianMixture gaussianMixture = new GaussianMixture();
  if (gaussianMixtureDetectionAlgorithm.getK() != -1) {
    gaussianMixture.setK(gaussianMixtureDetectionAlgorithm.getK());
  }
  if (gaussianMixtureDetectionAlgorithm.getMaxIterations() != -1) {
    gaussianMixture.setMaxIterations(gaussianMixtureDetectionAlgorithm.getMaxIterations());
  }
  if (gaussianMixtureDetectionAlgorithm.getConvergenceTol() != -1) {
    gaussianMixture.setConvergenceTol(gaussianMixtureDetectionAlgorithm.getConvergenceTol());
  }
  if (gaussianMixtureDetectionAlgorithm.getInitializedModel() != null) {
    gaussianMixture.setInitialModel(gaussianMixtureDetectionAlgorithm.getInitializedModel());
  }
  if (gaussianMixtureDetectionAlgorithm.getSeed() != -1) {
    gaussianMixture.setSeed(gaussianMixtureDetectionAlgorithm.getSeed());
  }
  GaussianMixtureModel gaussianMixtureModel = gaussianMixture.run(parsedData);
  gaussianMixtureModelSummary.setGaussianMixtureDetectionAlgorithm(gaussianMixtureDetectionAlgorithm);
  return gaussianMixtureModel;
}
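A short sketch of how the returned GaussianMixtureModel might then be used (this follow-up is an illustrative assumption, not part of the project shown):

// Hypothetical follow-up use of the trained mixture model.
GaussianMixtureModel model =
    generateGaussianMixtureModel(parsedData, gaussianMixtureDetectionAlgorithm, gaussianMixtureModelSummary);
// Hard assignment: index of the most likely Gaussian component for each point.
JavaRDD<Integer> memberships = parsedData.map(model::predict);
// Inspect the learned mixture components.
for (int i = 0; i < model.k(); i++) {
  System.out.println("weight=" + model.weights()[i] + ", mean=" + model.gaussians()[i].mu());
}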
Example 7: evaluate
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Intra-cluster distance is mean distance to centroid
  double maxIntraClusterDistance =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();
  // Inter-cluster distance is distance between centroids
  double minInterClusterDistance = Double.POSITIVE_INFINITY;
  List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> distanceFn = getDistanceFn();
  for (int i = 0; i < clusters.size(); i++) {
    double[] centerI = clusters.get(i).getCenter();
    // Distances are symmetric, hence d(i,j) == d(j,i)
    for (int j = i + 1; j < clusters.size(); j++) {
      double[] centerJ = clusters.get(j).getCenter();
      minInterClusterDistance = Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
    }
  }
  return minInterClusterDistance / maxIntraClusterDistance;
}
Example 8: fetchClusteredPoints
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
Example 9: createDF
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
public DataFrame createDF(JavaRDD<Tuple2<Vector, String>> rdd) {
  // Build the schema for the two columns: a vector of counts and the product title.
  List<StructField> fields = new ArrayList<StructField>();
  fields.add(DataTypes.createStructField("vectorized_count", new VectorUDT(), true));
  fields.add(DataTypes.createStructField("product_title", DataTypes.StringType, true));
  StructType schema = DataTypes.createStructType(fields);
  // Convert records of the RDD to Rows.
  JavaRDD<Row> rowRDD = rdd.map(
      new Function<Tuple2<Vector, String>, Row>() {
        public Row call(Tuple2<Vector, String> record) {
          return RowFactory.create(record._1(), record._2());
        }
      });
  return sqlContext.createDataFrame(rowRDD, schema);
}
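A minimal call-site sketch for createDF (the sample records and the javaSparkContext variable are illustrative assumptions):

// Hypothetical input; real records would come from an upstream vectorization step.
List<Tuple2<Vector, String>> records = Arrays.asList(
    new Tuple2<>(Vectors.dense(1.0, 0.0, 2.0), "Product A"),
    new Tuple2<>(Vectors.dense(0.0, 3.0, 1.0), "Product B"));
JavaRDD<Tuple2<Vector, String>> rdd = javaSparkContext.parallelize(records);
DataFrame df = createDF(rdd);
df.printSchema(); // vectorized_count: vector, product_title: string
df.show();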
Example 10: transform
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
public LabeledPoint transform(Tuple2<Double, Multiset<String>> doc) {
  double label = doc._1();
  List<Tuple2<Integer, Double>> vector = new ArrayList<>();
  for (Multiset.Entry<String> entry : doc._2().entrySet()) {
    String word = entry.getElement();
    int tf = entry.getCount();
    Tuple2<Integer, Long> wordInfo = idf.get(word);
    if (wordInfo != null) {
      // Assumed layout of the idf map value: (vocabulary index, document frequency).
      int index = wordInfo._1();
      int numDocs = (int) this.newsCount;
      int df = wordInfo._2().intValue();
      double tfidf = this.calculate(tf, df, numDocs);
      vector.add(new Tuple2<>(index, tfidf));
    }
  }
  Vector features = Vectors.sparse((int) featuresCount, vector);
  return new LabeledPoint(label, features);
}
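The calculate method referenced above is not shown. A standard smoothed TF-IDF weighting it might implement (an assumption, not the project's confirmed formula) is:

// Hypothetical TF-IDF weighting; the project's actual calculate() may differ.
private double calculate(int tf, int df, int numDocs) {
  // Smoothed inverse document frequency avoids division by zero for unseen terms.
  double idf = Math.log((numDocs + 1.0) / (df + 1.0));
  return tf * idf;
}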
Example 11: main
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
public static void main(String[] args) {
  String inputFile = "data/kmeans_data.txt";
  int k = 2; // two clusters
  int iterations = 10;
  int runs = 1;
  JavaSparkContext sc = new JavaSparkContext("local", "JavaKMeans");
  JavaRDD<String> lines = sc.textFile(inputFile);
  JavaRDD<Vector> points = lines.map(new ParsePoint());
  KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.K_MEANS_PARALLEL());
  System.out.println("Cluster centers:");
  for (Vector center : model.clusterCenters()) {
    System.out.println(" " + center);
  }
  double cost = model.computeCost(points.rdd());
  System.out.println("Cost: " + cost);
  sc.stop();
}
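ParsePoint is not included above. A plausible implementation, assuming each input line is a space-separated list of numeric features (as in data/kmeans_data.txt), would be:

// Hypothetical ParsePoint; assumes space-separated numeric features per line.
// Uses org.apache.spark.api.java.function.Function and java.util.regex.Pattern.
static class ParsePoint implements Function<String, Vector> {
  private static final Pattern SPACE = Pattern.compile(" ");

  @Override
  public Vector call(String line) {
    String[] tok = SPACE.split(line);
    double[] point = new double[tok.length];
    for (int i = 0; i < tok.length; i++) {
      point[i] = Double.parseDouble(tok[i]);
    }
    return Vectors.dense(point);
  }
}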
Example 12: printClusterIndex
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
static private void printClusterIndex(int clusterIndex, KMeansModel model) throws IOException {
  System.out.println("\nDOCUMENTS IN CLUSTER INDEX " + clusterIndex + "\n");
  // Re-read each "document" (a single line in the input file)
  // and predict which cluster it belongs to:
  Stream<String> lines2 = Files.lines(Paths.get("data", input_file));
  lines2.forEach(s -> {
    String[] parts = s.split("\t");
    String[] tok = tokenizeAndRemoveNoiseWords(parts[1]);
    Vector v = sparseVectorGenerator.tokensToSparseVector(tok);
    int best_cluster_index = model.predict(v);
    if (best_cluster_index == clusterIndex) {
      System.out.println(" Article title: " + parts[0]);
    }
  });
  lines2.close();
}
Example 13: tokensToSparseVector
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
public Vector tokensToSparseVector(String[] tokens) {
  // Collect unique, sorted word indices: sparse vectors expect strictly
  // increasing indices without duplicates.
  SortedSet<Integer> indices = new TreeSet<>();
  for (String token : tokens) {
    String stem = Stemmer.stemWord(token);
    if (!noiseWords.contains(stem) && validWord(stem)) {
      if (!wordMap.containsKey(stem)) {
        wordMap.put(stem, startingWordIndex++);
      }
      indices.add(wordMap.get(stem));
    }
  }
  // Size the index/value arrays to the number of distinct terms actually seen,
  // not to MAX_WORDS, so no spurious zero entries are created.
  int[] ind = new int[indices.size()];
  double[] vals = new double[indices.size()];
  int i = 0;
  for (int index : indices) {
    ind[i] = index;
    vals[i] = 1d; // binary presence weight
    i++;
  }
  return Vectors.sparse(MAX_WORDS, ind, vals);
}
Example 14: convertRealMatrixToSparkRowMatrix
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix. Never {@code null}
 * @param numSlices number of partitions to use for the resulting RDD of rows
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
  logger.info("Converting matrix to distributed Spark matrix...");
  final double[][] dataArray = realMat.getData();
  final LinkedList<Vector> rowsList = new LinkedList<>();
  for (final double[] i : dataArray) {
    final Vector currentRow = Vectors.dense(i);
    rowsList.add(currentRow);
  }
  // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
  // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
  // // Want the partitions to be ~100KB of space
  // final int slices = totalSpace/100000;
  final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);
  // Create a RowMatrix from JavaRDD<Vector>.
  final RowMatrix mat = new RowMatrix(rows.rdd());
  logger.info("Done converting matrix to distributed Spark matrix...");
  return mat;
}
Example 15: convertSparkRowMatrixToRealMatrix
import org.apache.spark.mllib.linalg.Vector; // import the required package/class
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming. Provide the value here, if it is already known.
 *  Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {
  Utils.nonNull(r, "Input row matrix cannot be null");
  int numRows;
  if (cachedNumRows == -1) {
    // This takes a while in Spark
    numRows = (int) r.numRows();
  } else {
    numRows = cachedNumRows;
  }
  final int numCols = (int) r.numCols();
  // This cast is required, even though it would not seem necessary, at first. Exact reason why is unknown.
  // Will fail compilation if the cast is removed.
  final Vector[] rowVectors = (Vector[]) r.rows().collect();
  final RealMatrix result = new Array2DRowRealMatrix(numRows, numCols);
  for (int i = 0; i < numRows; i++) {
    result.setRow(i, rowVectors[i].toArray());
  }
  return result;
}
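A small round-trip sketch tying Examples 14 and 15 together (the 2x3 matrix values and the sc variable are illustrative assumptions):

// Hypothetical round trip: Commons Math matrix -> Spark RowMatrix -> Commons Math matrix.
RealMatrix original = new Array2DRowRealMatrix(new double[][] {
    {1.0, 2.0, 3.0},
    {4.0, 5.0, 6.0}});
RowMatrix distributed = convertRealMatrixToSparkRowMatrix(sc, original, 2); // 2 partitions
// The row count (2) is already known here, so pass it to skip the costly numRows() call.
RealMatrix roundTripped = convertSparkRowMatrixToRealMatrix(distributed, 2);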