This article collects typical usage examples of the Java method org.apache.mahout.common.Pair.getSecond. If you have been wondering what Pair.getSecond does, how to call it, or where to find real examples of it, the hand-picked code samples below should help. You can also explore further usage examples of the enclosing class, org.apache.mahout.common.Pair.
The following presents 10 code examples of the Pair.getSecond method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Java code examples.
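Before diving into the examples, here is a minimal standalone sketch (not taken from any of the projects below; the class name and values are made up) showing what getFirst() and getSecond() return on a Mahout Pair:

import org.apache.mahout.common.Pair;

public class PairGetSecondDemo {
    public static void main(String[] args) {
        // Pair is a simple 2-tuple holder; getFirst()/getSecond() return its two components
        Pair<String, Integer> termAndId = new Pair<String, Integer>("mahout", 7);
        String term = termAndId.getFirst();   // "mahout"
        int id = termAndId.getSecond();       // 7, auto-unboxed from Integer
        System.out.println(term + " -> " + id);
    }
}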
Example 1: processOutput
import org.apache.mahout.common.Pair; // import the package/class the method depends on
protected RuleBase processOutput(JobContext job, Path outputPath) throws IOException {
Configuration conf = job.getConfiguration();
FileSystem fs = outputPath.getFileSystem(conf);
Path[] outfiles = Chi_RWCSUtils.listOutputFiles(fs, outputPath);
RuleBase ruleBase = null;
// read all the outputs
for (Path path : outfiles) {
for (Pair<LongWritable,RuleBase> record : new SequenceFileIterable<LongWritable, RuleBase>(path, conf)) {
if (ruleBase == null) {
ruleBase = record.getSecond();
}
}
}
return ruleBase;
}
Example 2: nextVector
import org.apache.mahout.common.Pair; // import the package/class the method depends on
@Override
public MySparseVector nextVector() throws IOException {
    if (!vecIterator.hasNext()) return null;
    Pair<Text, VectorWritable> entry = vecIterator.next();
    String name = entry.getFirst().toString();
    VectorWritable mahoutVector = entry.getSecond();
    ArrayList<Integer> indices = new ArrayList<Integer>();
    ArrayList<Double> values = new ArrayList<Double>();
    // copy only the non-zero elements of the Mahout vector
    for (Element e : mahoutVector.get().all()) {
        double value = e.get();
        if (value == 0) continue;
        values.add(value);
        int index = e.index();
        indices.add(index);
    }
    return new MySparseVector(indices, values);
}
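One note on the loop above: it iterates every cell via all() and skips zeros by hand. If the Mahout version in use also exposes Vector.nonZeroes() (it does in the releases where all() returns an Iterable, but treat this as an assumption), the same extraction can be written without the explicit zero check; a sketch:

import java.util.List;
import org.apache.mahout.math.Vector;

class NonZeroCollector {
    // assumed alternative: nonZeroes() visits only the non-zero cells of a sparse vector
    static void collect(Vector v, List<Integer> indices, List<Double> values) {
        for (Vector.Element e : v.nonZeroes()) {
            indices.add(e.index());
            values.add(e.get());
        }
    }
}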
Example 3: loadDictionary
import org.apache.mahout.common.Pair; // import the package/class the method depends on
private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
if (dictionaryPath == null) {
return null;
}
Path dictionaryFile = new Path(dictionaryPath);
List<Pair<Integer, String>> termList = Lists.newArrayList();
int maxTermId = 0;
// in the dictionary file, the key is the term and the value is its integer id
for (Pair<Writable, IntWritable> record
: new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
termList.add(new Pair<Integer, String>(record.getSecond().get(),
record.getFirst().toString()));
maxTermId = Math.max(maxTermId, record.getSecond().get());
}
String[] terms = new String[maxTermId + 1];
for (Pair<Integer, String> pair : termList) {
terms[pair.getFirst()] = pair.getSecond();
}
return terms;
}
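As a hypothetical caller (the path below is made up), the returned array is indexed by term id, so resolving the text of a term is a plain array access:

// hypothetical usage of loadDictionary from within the same class
Configuration conf = new Configuration();
String[] dictionary = loadDictionary("/tmp/vectors/dictionary.file-0", conf);
if (dictionary != null) {
    System.out.println("term with id 0: " + dictionary[0]);
}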
Example 4: processOutput
import org.apache.mahout.common.Pair; // import the package/class the method depends on
protected RuleBase processOutput(JobContext job, Path outputPath) throws IOException {
Configuration conf = job.getConfiguration();
FileSystem fs = outputPath.getFileSystem(conf);
Path[] outfiles = Chi_RWUtils.listOutputFiles(fs, outputPath);
RuleBase ruleBase = null;
// read all the outputs
for (Path path : outfiles) {
for (Pair<LongWritable,RuleBase> record : new SequenceFileIterable<LongWritable, RuleBase>(path, conf)) {
if (ruleBase == null) {
ruleBase = record.getSecond();
}
}
}
return ruleBase;
}
Example 5: pruneVectors
import org.apache.mahout.common.Pair; // import the package/class the method depends on
public static void pruneVectors(Path tfDir, Path prunedTFDir, Path prunedPartialTFDir, long maxDF,
Configuration baseConf,
Pair<Long[], List<Path>> docFrequenciesFeatures,
float normPower,
boolean logNormalize,
int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
int partialVectorIndex = 0;
List<Path> partialVectorPaths = new ArrayList<Path>();
for (Path path : docFrequenciesFeatures.getSecond()) {
Path partialVectorOutputPath = new Path(prunedPartialTFDir, "partial-" + partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
pruneVectorsPartial(tfDir, partialVectorOutputPath, path, maxDF, baseConf);
}
mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize, numReducers);
HadoopUtil.delete(new Configuration(baseConf), prunedPartialTFDir);
}
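The docFrequenciesFeatures argument here has the same shape as the Pair that Example 8 builds: getFirst() holds the {featureCount, vectorCount} array and getSecond() the list of frequency-chunk paths that the loop above iterates. A hand-built illustration (the counts and path are made up):

// illustrative only; in practice this Pair comes from the document-frequency counting step
Long[] counts = {10000L /* featureCount */, 500L /* vectorCount */};
List<Path> frequencyChunks = Lists.newArrayList(new Path("/tmp/df-count/frequency.file-0"));
Pair<Long[], List<Path>> docFrequenciesFeatures =
    new Pair<Long[], List<Path>>(counts, frequencyChunks);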
Example 6: loadVectors
import org.apache.mahout.common.Pair; // import the package/class the method depends on
private static Matrix loadVectors(String vectorPathString, Configuration conf)
throws IOException {
Path vectorPath = new Path(vectorPathString);
FileSystem fs = vectorPath.getFileSystem(conf);
List<Path> subPaths = Lists.newArrayList();
if (fs.isFile(vectorPath)) {
subPaths.add(vectorPath);
} else {
for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
subPaths.add(fileStatus.getPath());
}
}
List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
int numRows = Integer.MIN_VALUE;
int numCols = -1;
boolean sequentialAccess = false;
for (Path subPath : subPaths) {
for (Pair<IntWritable, VectorWritable> record
: new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
int id = record.getFirst().get();
Vector vector = record.getSecond().get();
if (vector instanceof NamedVector) {
vector = ((NamedVector)vector).getDelegate();
}
if (numCols < 0) {
numCols = vector.size();
sequentialAccess = vector.isSequentialAccess();
}
rowList.add(Pair.of(id, vector));
numRows = Math.max(numRows, id);
}
}
numRows++;
Vector[] rowVectors = new Vector[numRows];
for (Pair<Integer, Vector> pair : rowList) {
rowVectors[pair.getFirst()] = pair.getSecond();
}
return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
}
Example 7: TopicModel
import org.apache.mahout.common.Pair; // import the package/class the method depends on
private TopicModel(Pair<Matrix, Vector> model, double eta, double alpha, String[] dict,
int numThreads, double modelWeight) {
this(model.getFirst(), model.getSecond(), eta, alpha, dict, numThreads, modelWeight);
}
Example 8: createDictionaryChunks
import org.apache.mahout.common.Pair; // import the package/class the method depends on
/**
 * Read the document frequency list that is built at the end of the DF count job. This will use constant
 * memory and run at the speed of your disk reads.
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath,
Path dictionaryPathBase,
Configuration baseConf,
int chunkSizeInMegabytes) throws IOException {
List<Path> chunkPaths = Lists.newArrayList();
Configuration conf = new Configuration(baseConf);
FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
int chunkIndex = 0;
Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
chunkPaths.add(chunkPath);
SequenceFile.Writer freqWriter =
new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
try {
long currentChunkSize = 0;
long featureCount = 0;
long vectorCount = Long.MAX_VALUE;
Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
for (Pair<IntWritable,LongWritable> record
: new SequenceFileDirIterable<IntWritable,LongWritable>(filesPattern,
PathType.GLOB,
null,
null,
true,
conf)) {
if (currentChunkSize > chunkSizeLimit) {
Closeables.closeQuietly(freqWriter);
chunkIndex++;
chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
chunkPaths.add(chunkPath);
freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
currentChunkSize = 0;
}
int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
currentChunkSize += fieldSize;
IntWritable key = record.getFirst();
LongWritable value = record.getSecond();
if (key.get() >= 0) {
freqWriter.append(key, value);
} else if (key.get() == -1) {
vectorCount = value.get();
}
featureCount = Math.max(key.get(), featureCount);
}
featureCount++;
Long[] counts = {featureCount, vectorCount};
return new Pair<Long[], List<Path>>(counts, chunkPaths);
} finally {
Closeables.closeQuietly(freqWriter);
}
}
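The Pair returned above is exactly the datasetFeatures / docFrequenciesFeatures value consumed in Examples 5 and 9: getFirst() is the {featureCount, vectorCount} array and getSecond() the list of frequency-chunk paths. A hypothetical caller from within the same class (the paths and chunk size are made up):

// hypothetical usage of createDictionaryChunks
Configuration conf = new Configuration();
Pair<Long[], List<Path>> dfData =
    createDictionaryChunks(new Path("/tmp/df-count"), new Path("/tmp/vectors"), conf, 64);
long featureCount = dfData.getFirst()[0];          // highest feature id seen, plus one
long vectorCount = dfData.getFirst()[1];           // document count, stored under key -1 in the DF output
List<Path> frequencyChunkPaths = dfData.getSecond();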
Example 9: processTfIdf
import org.apache.mahout.common.Pair; // import the package/class the method depends on
/**
 * Create Term Frequency-Inverse Document Frequency (tf-idf) vectors from the input set of vectors in
 * {@link SequenceFile} format. This job places a fixed limit on the memory used by the feature chunks
 * on each node, splitting the process across multiple map/reduce passes.
 * calculateDF should be called before using this method.
*
* @param input
* input directory of the vectors in {@link SequenceFile} format
* @param output
* output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
* are generated
* @param datasetFeatures
* Document frequencies information calculated by calculateDF
* @param minDf
* The minimum document frequency. Default 1
* @param maxDF
* The max percentage of vectors for the DF. Can be used to remove really high frequency features.
* Expressed as an integer between 0 and 100. Default 99
* @param numReducers
* The number of reducers to spawn. This also affects the possible parallelism since each reducer
* will typically produce a single output file containing tf-idf vectors for a subset of the
* documents in the corpus.
*/
public static void processTfIdf(Path input,
Path output,
Configuration baseConf,
Pair<Long[], List<Path>> datasetFeatures,
int minDf,
long maxDF,
float normPower,
boolean logNormalize,
boolean sequentialAccessOutput,
boolean namedVector,
int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
"If specified normPower must be nonnegative", normPower);
Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING
|| (normPower > 1 && !Double.isInfinite(normPower))
|| !logNormalize,
"normPower must be > 1 and not infinite if log normalization is chosen", normPower);
int partialVectorIndex = 0;
List<Path> partialVectorPaths = Lists.newArrayList();
List<Path> dictionaryChunks = datasetFeatures.getSecond();
for (Path dictionaryChunk : dictionaryChunks) {
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
makePartialVectors(input,
baseConf,
datasetFeatures.getFirst()[0],
datasetFeatures.getFirst()[1],
minDf,
maxDF,
dictionaryChunk,
partialVectorOutputPath,
sequentialAccessOutput,
namedVector);
}
Configuration conf = new Configuration(baseConf);
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
PartialVectorMerger.mergePartialVectors(partialVectorPaths,
outputDir,
baseConf,
normPower,
logNormalize,
datasetFeatures.getFirst()[0].intValue(),
sequentialAccessOutput,
namedVector,
numReducers);
HadoopUtil.delete(conf, partialVectorPaths);
}
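A hedged sketch of invoking it (all paths and parameter values are illustrative; as the javadoc says, the datasetFeatures Pair must come from a prior calculateDF run):

// illustrative invocation; datasetFeatures is the Pair<Long[], List<Path>> returned by calculateDF
Configuration conf = new Configuration();
Path tfVectors = new Path("/tmp/vectors/tf-vectors");
Path outputBase = new Path("/tmp/vectors");
processTfIdf(tfVectors, outputBase, conf, datasetFeatures,
    1,                                    // minDf
    99,                                   // maxDF, as a percentage of documents
    PartialVectorMerger.NO_NORMALIZING,   // normPower: no normalization
    false,                                // logNormalize
    true,                                 // sequentialAccessOutput
    false,                                // namedVector
    1);                                   // numReducers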
Example 10: writeDictEntry
import org.apache.mahout.common.Pair; // import the package/class the method depends on
@Override
public void writeDictEntry(Pair<String, Integer> entry) throws IOException {
Text key = new Text(entry.getFirst());
IntWritable value = new IntWritable(entry.getSecond());
dictWriter.append(key, value);
}
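A tiny hypothetical caller (the term and id are made up; writer stands for an instance of whatever class declares writeDictEntry):

// the Pair carries (term, term id); getFirst() becomes the Text key, getSecond() the IntWritable value
writer.writeDictEntry(new Pair<String, Integer>("clustering", 42));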