本文整理汇总了Java中org.apache.mahout.common.iterator.sequencefile.PathType类的典型用法代码示例。如果您正苦于以下问题:Java PathType类的具体用法?Java PathType怎么用?Java PathType使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
PathType类属于org.apache.mahout.common.iterator.sequencefile包,在下文中一共展示了PathType类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: runSequential
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/***
* PPCA: sequential PPCA based on the paper from Tipping and Bishop
*
* @param conf
* the configuration
* @param input
* the path to the input matrix Y
* @param output
* the output path (not used currently)
* @param nRows
* number or rows in Y
* @param nCols
* number of columns in Y
* @param nPCs
* number of desired principal components
* @return the error
* @throws Exception
*/
/**
 * PPCA: sequential PPCA based on the paper from Tipping and Bishop.
 *
 * @param conf the configuration
 * @param input the path to the input matrix Y
 * @param output the output path (not used currently)
 * @param nRows number or rows in Y
 * @param nCols number of columns in Y
 * @param nPCs number of desired principal components
 * @return the error
 * @throws Exception
 */
double runSequential(Configuration conf, Path input, Path output,
    final int nRows, final int nCols, final int nPCs) throws Exception {
  FileSystem fs = FileSystem.get(input.toUri(), conf);
  if (fs.listStatus(input).length == 0) {
    System.err.println("No file under " + input);
    return 0;
  }
  // Materialize the whole input matrix Y in memory, one sequence-file vector per row.
  Matrix centralY = new DenseMatrix(nRows, nCols);
  int rowIndex = 0;
  for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
      input, PathType.LIST, null, conf)) {
    centralY.assignRow(rowIndex++, vw.get());
  }
  // Random initial guess for the components matrix C and the noise variance ss.
  Matrix centralC = PCACommon.randomMatrix(nCols, nPCs);
  InitialValues initVal = new InitialValues(centralC, PCACommon.randSS());
  return runSequential(conf, centralY, initVal, 100);
}
示例2: runSequential_JacobVersion
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* PPCA: sequential PPCA based on the matlab implementation of Jacob Verbeek
*
* @param conf
* the configuration
* @param input
* the path to the input matrix Y
* @param output
* the output path (not used currently)
* @param nRows
* number or rows in Y
* @param nCols
* number of columns in Y
* @param nPCs
* number of desired principal components
* @return the error
* @throws Exception
*/
/**
 * PPCA: sequential PPCA based on the matlab implementation of Jacob Verbeek.
 *
 * @param conf the configuration
 * @param input the path to the input matrix Y
 * @param output the output path (not used currently)
 * @param nRows number or rows in Y
 * @param nCols number of columns in Y
 * @param nPCs number of desired principal components
 * @return the error
 * @throws Exception
 */
double runSequential_JacobVersion(Configuration conf, Path input,
    Path output, final int nRows, final int nCols, final int nPCs) throws Exception {
  FileSystem fs = FileSystem.get(input.toUri(), conf);
  if (fs.listStatus(input).length == 0) {
    System.err.println("No file under " + input);
    return 0;
  }
  // Load Y row by row from the sequence files under the input path.
  Matrix centralY = new DenseMatrix(nRows, nCols);
  int rowIndex = 0;
  for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
      input, PathType.LIST, null, conf)) {
    centralY.assignRow(rowIndex++, vw.get());
  }
  // Random initialization of C and the noise variance before iterating.
  InitialValues initVal = new InitialValues(PCACommon.randomMatrix(nCols, nPCs),
      PCACommon.randSS());
  return runSequential_JacobVersion(conf, centralY, initVal, 100);
}
示例3: process
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* This method takes the clustered points output by the clustering algorithms as input and writes them into
* their respective clusters.
*/
/**
 * Takes the clustered points output by the clustering algorithms as input and writes
 * each record into the writer of its respective cluster, then closes all writers.
 */
public void process() throws IOException {
  createPostProcessDirectory();
  SequenceFileDirIterable<Writable,WeightedVectorWritable> records =
      new SequenceFileDirIterable<Writable,WeightedVectorWritable>(clusteredPoints,
          PathType.GLOB, PathFilters.partFilter(), null, false, conf);
  for (Pair<?,WeightedVectorWritable> record : records) {
    // The record key is the cluster id; route the vector to that cluster's writer.
    putVectorInRespectiveCluster(record.getFirst().toString().trim(), record.getSecond());
  }
  IOUtils.close(writersForClusters.values());
  writersForClusters.clear();
}
示例4: getNumberOfClusters
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Reads the number of clusters present by reading the clusters-*-final file.
*
* @param clusterOutputPath
* The output path provided to the clustering algorithm.
* @param conf
* The hadoop configuration.
* @return the number of final clusters.
*/
/**
 * Reads the number of clusters present by reading the clusters-*-final file.
 *
 * @param clusterOutputPath
 *          The output path provided to the clustering algorithm.
 * @param conf
 *          The hadoop configuration.
 * @return the number of final clusters, or 0 if no clusters-*-final file exists.
 */
public static int getNumberOfClusters(Path clusterOutputPath, Configuration conf) throws IOException {
  FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
  FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
  // Fix: the original indexed clusterFiles[0] unconditionally and threw
  // ArrayIndexOutOfBoundsException when no clusters-*-final file was present.
  if (clusterFiles.length == 0) {
    return 0;
  }
  int numberOfClusters = 0;
  Iterator<?> it = new SequenceFileDirValueIterator<Writable>(clusterFiles[0].getPath(),
      PathType.LIST,
      PathFilters.partFilter(),
      null,
      true,
      conf);
  // Count the values; the iterator closes its underlying readers once exhausted.
  while (it.hasNext()) {
    it.next();
    numberOfClusters++;
  }
  return numberOfClusters;
}
示例5: configureWithClusterInfo
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Create a list of SoftClusters from whatever type is passed in as the prior
*
* @param conf
* the Configuration
* @param clusterPath
* the path to the prior Clusters
* @param clusters
* a List<Cluster> to put values into
*/
/**
 * Create a list of SoftClusters from whatever type is passed in as the prior.
 *
 * @param conf
 *          the Configuration
 * @param clusterPath
 *          the path to the prior Clusters
 * @param clusters
 *          a List&lt;Cluster&gt; to put values into
 */
public static void configureWithClusterInfo(Configuration conf, Path clusterPath, List<Cluster> clusters) {
  Iterable<Writable> values = new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST,
      PathFilters.partFilter(), conf);
  for (Writable value : values) {
    // Unwrap ClusterWritable before dispatching on the concrete cluster type.
    if (value.getClass().equals(ClusterWritable.class)) {
      value = ((ClusterWritable) value).getValue();
    }
    // Exact class comparison is intentional: SoftCluster extends Kluster, and an
    // instanceof dispatch would send SoftCluster values down the Kluster branch.
    Class<? extends Writable> valueClass = value.getClass();
    if (valueClass.equals(SoftCluster.class)) {
      clusters.add((SoftCluster) value);
    } else if (valueClass.equals(Kluster.class)) {
      Kluster kluster = (Kluster) value;
      clusters.add(new SoftCluster(kluster.getCenter(), kluster.getId(), kluster.getMeasure()));
    } else if (valueClass.equals(Canopy.class)) {
      Canopy canopy = (Canopy) value;
      clusters.add(new SoftCluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
    } else {
      throw new IllegalStateException("Bad value class: " + valueClass);
    }
  }
}
示例6: configureWithClusterInfo
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Create a list of Klusters from whatever Cluster type is passed in as the prior
*
* @param conf
* the Configuration
* @param clusterPath
* the path to the prior Clusters
* @param clusters
* a List<Cluster> to put values into
*/
/**
 * Create a list of Klusters from whatever Cluster type is passed in as the prior.
 *
 * @param conf
 *          the Configuration
 * @param clusterPath
 *          the path to the prior Clusters
 * @param clusters
 *          a List&lt;Cluster&gt; to put values into
 */
public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) {
  Iterable<Writable> values = new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST,
      PathFilters.partFilter(), conf);
  for (Writable value : values) {
    // Unwrap the ClusterWritable container, if present, to get the actual cluster.
    if (value.getClass().equals(ClusterWritable.class)) {
      value = ((ClusterWritable) value).getValue();
    }
    Class<? extends Writable> valueClass = value.getClass();
    log.debug("Read 1 Cluster from {}", clusterPath);
    // Exact class checks (not instanceof) preserve the original dispatch semantics.
    if (valueClass.equals(Kluster.class)) {
      clusters.add((Kluster) value);
    } else if (valueClass.equals(Canopy.class)) {
      Canopy canopy = (Canopy) value;
      clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
    } else {
      throw new IllegalStateException("Bad value class: " + valueClass);
    }
  }
}
示例7: readPerplexity
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* @param topicModelStateTemp
* @param iteration
* @return {@code double[2]} where first value is perplexity and second is model weight of those
* documents sampled during perplexity computation, or {@code null} if no perplexity data
* exists for the given iteration.
* @throws IOException
*/
/**
 * Reads the perplexity data written for one training iteration and folds it into a
 * single weighted-average perplexity value.
 *
 * @param conf the Hadoop configuration used to access the file system
 * @param topicModelStateTemp root path of the per-iteration model state
 * @param iteration the iteration whose perplexity data should be read
 * @return the total perplexity divided by the total model weight of those documents
 *         sampled during perplexity computation, or {@code Double.NaN} if no
 *         perplexity data exists for the given iteration. (The original javadoc
 *         claimed {@code null} was returned, but the method returns a primitive
 *         double and actually returns NaN. Note the result is also NaN when the
 *         accumulated model weight is zero.)
 * @throws IOException if the file system cannot be accessed
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
throws IOException {
Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
if (!fs.exists(perplexityPath)) {
log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
return Double.NaN;
}
// Each pair holds (model weight, perplexity contribution) for one sampled chunk.
double perplexity = 0;
double modelWeight = 0;
long n = 0;
for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
modelWeight += pair.getFirst().get();
perplexity += pair.getSecond().get();
n++;
}
log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n,
perplexity, modelWeight });
return perplexity / modelWeight;
}
示例8: populateClusterModels
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Populates a list with clusters present in clusters-*-final directory.
*
* @param clusterOutputPath
* The output path of the clustering.
* @param conf
* The Hadoop Configuration
* @return The list of clusters found by the clustering.
* @throws IOException
*/
/**
 * Populates a list with clusters present in the clusters-*-final directory.
 *
 * @param clusterOutputPath
 *          The output path of the clustering.
 * @param conf
 *          The Hadoop Configuration
 * @return The list of clusters found by the clustering.
 * @throws IOException
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath,
    Configuration conf) throws IOException {
  Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> iterator = new SequenceFileDirValueIterator<Writable>(
      finalClustersPath, PathType.LIST, PathFilters.partFilter(),
      null, false, conf);
  List<Cluster> clusterModels = Lists.newArrayList();
  while (iterator.hasNext()) {
    // Unwrap each ClusterWritable and wire the cluster to the configuration.
    Cluster cluster = ((ClusterWritable) iterator.next()).getValue();
    cluster.configure(conf);
    clusterModels.add(cluster);
  }
  return clusterModels;
}
示例9: selectCluster
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Classifies the vector into its respective cluster.
*
* @param input
* the path containing the input vector.
* @param clusterModels
* the clusters
* @param clusterClassifier
* used to classify the vectors into different clusters
* @param output
* the path to store classified data
* @param clusterClassificationThreshold
* @param emitMostLikely
* TODO
* @throws IOException
*/
/**
 * Classifies the vector into its respective cluster.
 *
 * @param input
 *          the path containing the input vectors.
 * @param clusterModels
 *          the clusters
 * @param clusterClassifier
 *          used to classify the vectors into different clusters
 * @param output
 *          the path to store classified data
 * @param clusterClassificationThreshold
 *          minimum pdf value required before a vector is written
 * @param emitMostLikely
 *          whether to emit only the most likely cluster
 * @throws IOException
 */
private static void selectCluster(Path input, List<Cluster> clusterModels,
    ClusterClassifier clusterClassifier, Path output,
    Double clusterClassificationThreshold, boolean emitMostLikely)
    throws IOException {
  Configuration conf = new Configuration();
  SequenceFile.Writer writer = new SequenceFile.Writer(
      input.getFileSystem(conf), conf,
      new Path(output, "part-m-" + 0), IntWritable.class, Text.class);
  // Fix: the original left the writer open (resource leak) if classify() or
  // classifyAndWrite() threw; close it in a finally block.
  try {
    for (Pair<Text, VectorWritable> entry : new SequenceFileDirIterable<Text, VectorWritable>(
        input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
      Vector pdfPerCluster = clusterClassifier.classify(entry.getSecond().get());
      if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) {
        classifyAndWrite(clusterModels, clusterClassificationThreshold,
            emitMostLikely, writer, entry.getFirst(), pdfPerCluster);
      }
    }
  } finally {
    writer.close();
  }
}
示例10: readClusters
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
 * Reads every iteration's cluster set from the clustering output directory.
 *
 * @param conf the Hadoop configuration used to access the file system
 * @param output the clustering output path; each clusters-* subdirectory holds one iteration
 * @return one inner list of clusters per clusters-* directory found under output
 * @throws IOException if the file system cannot be read
 */
public static List<List<Cluster>> readClusters(Configuration conf, Path output)
    throws IOException {
  // Fix: the local was named "Clusters", violating Java lowerCamelCase naming
  // for variables and shadowing the type-name style.
  List<List<Cluster>> allClusters = Lists.newArrayList();
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
    List<Cluster> clusters = Lists.newArrayList();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(
        s.getPath(), PathType.LIST, PathFilters.logsCRCFilter(),
        conf)) {
      clusters.add(value.getValue());
    }
    allClusters.add(clusters);
  }
  return allClusters;
}
示例11: iterateAll
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
@Override
public Iterator<MatrixSlice> iterateAll() {
try {
Path pathPattern = rowPath;
if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
pathPattern = new Path(rowPath, "*");
}
return Iterators.transform(
new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern,
PathType.GLOB, PathFilters.logsCRCFilter(), null, true, conf),
new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
@Override
public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
return new MatrixSlice(from.getSecond().get(), from.getFirst()
.get());
}
});
} catch (IOException ioe) {
throw new IllegalStateException(ioe);
}
}
示例12: crossTestIterationOfMapReducePPCASequentialPPCA
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
@Test
public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception {
Matrix C_central = PCACommon.randomMatrix(D, d);
double ss = PCACommon.randSS();
InitialValues initValSeq = new InitialValues(C_central, ss);
InitialValues initValMR = new InitialValues(C_central.clone(), ss);
//1. run sequential
Matrix Ye_central = new DenseMatrix(N, D);
int row = 0;
for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
input, PathType.LIST, null, conf)) {
Ye_central.assignRow(row, vw.get());
row++;
}
double bishopSeqErr = ppcaDriver.runSequential(conf, Ye_central, initValSeq, 1);
//2. run mapreduce
DistributedRowMatrix Ye = new DistributedRowMatrix(input, tmp, N, D);
Ye.setConf(conf);
double bishopMRErr = ppcaDriver.runMapReduce(conf, Ye, initValMR, output, N, D, d, 1, 1, 1, 1);
Assert.assertEquals(
"ss value is different in sequential and mapreduce PCA", initValSeq.ss,
initValMR.ss, EPSILON);
double seqCTrace = PCACommon.trace(initValSeq.C);
double mrCTrace = PCACommon.trace(initValMR.C);
Assert.assertEquals(
"C value is different in sequential and mapreduce PCA", seqCTrace,
mrCTrace, EPSILON);
Assert.assertEquals(
"The PPCA error between sequntial and mapreduce methods is too different: "
+ bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, EPSILON);
}
示例13: buildClustersSeq
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Build a directory of Canopy clusters from the input vectors and other
* arguments. Run sequential execution
*
* @param input
* the Path to the directory containing input vectors
* @param output
* the Path for all output directories
* @param measure
* the DistanceMeasure
* @param t1
* the double T1 distance metric
* @param t2
* the double T2 distance metric
* @param clusterFilter
* the int minimum size of canopies produced
* @return the canopy output directory Path
*/
/**
 * Build a directory of Canopy clusters from the input vectors and other
 * arguments. Run sequential execution
 *
 * @param input
 *          the Path to the directory containing input vectors
 * @param output
 *          the Path for all output directories
 * @param measure
 *          the DistanceMeasure
 * @param t1
 *          the double T1 distance metric
 * @param t2
 *          the double T2 distance metric
 * @param clusterFilter
 *          the int minimum size of canopies produced
 * @return the canopy output directory Path
 * @throws IOException if the input cannot be read or the output cannot be written
 */
private static Path buildClustersSeq(Path input, Path output,
    DistanceMeasure measure, double t1, double t2, int clusterFilter)
    throws IOException {
  CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
  Collection<Canopy> canopies = Lists.newArrayList();
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(input.toUri(), conf);
  for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
      input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
    clusterer.addPointToCanopies(vw.get(), canopies);
  }
  Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0'+ Cluster.FINAL_ITERATION_SUFFIX);
  Path path = new Path(canopyOutputDir, "part-r-00000");
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
      Text.class, ClusterWritable.class);
  ClusterWritable clusterWritable = new ClusterWritable();
  try {
    for (Canopy canopy : canopies) {
      canopy.computeParameters();
      if (log.isDebugEnabled()) {
        log.debug("Writing Canopy:{} center:{} numPoints:{} radius:{}",
            new Object[] { canopy.getIdentifier(),
                AbstractCluster.formatVector(canopy.getCenter(), null),
                canopy.getNumObservations(),
                AbstractCluster.formatVector(canopy.getRadius(), null) });
      }
      // Only canopies above the minimum-size filter are persisted.
      if (canopy.getNumObservations() > clusterFilter) {
        clusterWritable.setValue(canopy);
        writer.append(new Text(canopy.getIdentifier()), clusterWritable);
      }
    }
  } finally {
    // Fix: closeQuietly swallowed close() failures on a WRITER, silently losing
    // buffered data; close(writer, false) propagates the IOException instead.
    Closeables.close(writer, false);
  }
  return canopyOutputDir;
}
示例14: populateClusterModels
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
* Populates a list with clusters present in clusters-*-final directory.
*
* @param clusterOutputPath
* The output path of the clustering.
* @param conf
* The Hadoop Configuration
* @return The list of clusters found by the clustering.
* @throws IOException
*/
/**
 * Populates a list with clusters present in the clusters-*-final directory.
 *
 * @param clusterOutputPath
 *          The output path of the clustering.
 * @param conf
 *          The Hadoop Configuration
 * @return The list of clusters found by the clustering.
 * @throws IOException
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> iterator = new SequenceFileDirValueIterator<Writable>(finalClustersPath, PathType.LIST,
      PathFilters.partFilter(), null, false, conf);
  List<Cluster> clusterModels = new ArrayList<Cluster>();
  while (iterator.hasNext()) {
    // Unwrap the ClusterWritable and bind the cluster to the configuration.
    Cluster cluster = ((ClusterWritable) iterator.next()).getValue();
    cluster.configure(conf);
    clusterModels.add(cluster);
  }
  return clusterModels;
}
示例15: readFromSeqFiles
import org.apache.mahout.common.iterator.sequencefile.PathType; //导入依赖的package包/类
/**
 * Loads this classifier's cluster models and policy back from sequence files under path.
 *
 * @param conf the configuration each loaded cluster is configured with
 * @param path the directory containing the serialized ClusterWritable files
 * @throws IOException if the sequence files cannot be read
 */
public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
// NOTE(review): a fresh Configuration is used for READING the files, while the
// passed-in conf is only applied via cluster.configure(conf) below. It is unclear
// whether ignoring conf here is intentional — confirm before changing.
Configuration config = new Configuration();
List<Cluster> clusters = Lists.newArrayList();
for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST,
PathFilters.logsCRCFilter(), config)) {
Cluster cluster = cw.getValue();
cluster.configure(conf);
clusters.add(cluster);
}
this.models = clusters;
// NOTE(review): models.get(0) throws IndexOutOfBoundsException if no cluster
// files were found under path — presumably callers guarantee at least one.
modelClass = models.get(0).getClass().getName();
this.policy = readPolicy(path);
}