This article collects typical usage examples of the Java class org.apache.mahout.common.Pair. If you are wondering what the Pair class is for, how to use it, or what it looks like in real code, the curated examples below may help.
The Pair class belongs to the org.apache.mahout.common package. The sections below show 15 code examples of the Pair class, sorted by popularity by default.
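Before diving into the examples, here is a minimal, self-contained sketch of the Pair API itself. Pair is an immutable two-element tuple constructed directly and read back through getFirst() and getSecond(); the class name and values below are purely illustrative.

import org.apache.mahout.common.Pair;

public class PairQuickStart {
  public static void main(String[] args) {
    // Pair is an immutable 2-tuple; the element types are fixed by the generic parameters.
    Pair<String, Integer> termWithId = new Pair<String, Integer>("mahout", 42); // illustrative values
    System.out.println(termWithId.getFirst());   // prints: mahout
    System.out.println(termWithId.getSecond());  // prints: 42
  }
}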
Example 1: loadResult
import org.apache.mahout.common.Pair; // import the required package/class

public double loadResult(Path outputDirPath, Configuration conf) throws IOException {
  Path finalNumberFile = new Path(outputDirPath, "part-r-00000");
  SequenceFileIterator<NullWritable, DoubleWritable> iterator =
      new SequenceFileIterator<NullWritable, DoubleWritable>(finalNumberFile, true, conf);
  double norm2;
  try {
    // the norm2 job is expected to have written exactly one (NullWritable, DoubleWritable) record
    Pair<NullWritable, DoubleWritable> next = iterator.next();
    norm2 = next.getSecond().get();
    if (iterator.hasNext()) {
      throw new IOException("More than one value after norm2Job!");
    }
  } finally {
    Closeables.close(iterator, false);
  }
  return norm2;
}
Example 2: loadDictionary
import org.apache.mahout.common.Pair; // import the required package/class

private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
  if (dictionaryPath == null) {
    return null;
  }
  Path dictionaryFile = new Path(dictionaryPath);
  List<Pair<Integer, String>> termList = Lists.newArrayList();
  int maxTermId = 0;
  // key is the word, value is its id
  for (Pair<Writable, IntWritable> record
      : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
    termList.add(new Pair<Integer, String>(record.getSecond().get(),
        record.getFirst().toString()));
    maxTermId = Math.max(maxTermId, record.getSecond().get());
  }
  // index the terms by id so that terms[id] yields the word
  String[] terms = new String[maxTermId + 1];
  for (Pair<Integer, String> pair : termList) {
    terms[pair.getFirst()] = pair.getSecond();
  }
  return terms;
}
Example 3: readPerplexity
import org.apache.mahout.common.Pair; // import the required package/class

/**
 * @param conf the Hadoop configuration used to access the file system
 * @param topicModelStateTemp the directory holding per-iteration topic-model state
 * @param iteration the iteration whose perplexity data should be read
 * @return the perplexity of the documents sampled during perplexity computation (total perplexity
 *         divided by total model weight), or {@code Double.NaN} if no perplexity data exists for
 *         the given iteration
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
    throws IOException {
  Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
  FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
  if (!fs.exists(perplexityPath)) {
    log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
    return Double.NaN;
  }
  double perplexity = 0;
  double modelWeight = 0;
  long n = 0;
  // each record holds (model weight, perplexity) for one batch of sampled documents
  for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
      perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
    modelWeight += pair.getFirst().get();
    perplexity += pair.getSecond().get();
    n++;
  }
  log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n,
      perplexity, modelWeight });
  return perplexity / modelWeight;
}
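Because readPerplexity returns Double.NaN when no data exists for an iteration, a caller should check for that case explicitly. The sketch below is an assumption-based illustration: the path and iteration number are made up, and readPerplexity is assumed to be visible from the calling scope (for example, the same class as above).

// Hypothetical caller; path and iteration number are illustrative.
Configuration conf = new Configuration();
Path topicModelStateTemp = new Path("/tmp/lda-model-state");      // made-up path
double perplexity = readPerplexity(conf, topicModelStateTemp, 5); // iteration 5 is arbitrary
if (Double.isNaN(perplexity)) {
  // no perplexity data was written for this iteration
} else {
  // e.g. compare with the previous iteration's value to decide on convergence
}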
Example 4: pruneVectors
import org.apache.mahout.common.Pair; // import the required package/class

public static void pruneVectors(Path tfDir, Path prunedTFDir, Path prunedPartialTFDir, long maxDF,
                                Configuration baseConf,
                                Pair<Long[], List<Path>> docFrequenciesFeatures,
                                float normPower,
                                boolean logNormalize,
                                int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
  int partialVectorIndex = 0;
  List<Path> partialVectorPaths = new ArrayList<Path>();
  // prune once per document-frequency chunk, writing each result to its own partial output path
  for (Path path : docFrequenciesFeatures.getSecond()) {
    Path partialVectorOutputPath = new Path(prunedPartialTFDir, "partial-" + partialVectorIndex++);
    partialVectorPaths.add(partialVectorOutputPath);
    pruneVectorsPartial(tfDir, partialVectorOutputPath, path, maxDF, baseConf);
  }
  mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize, numReducers);
  HadoopUtil.delete(new Configuration(baseConf), prunedPartialTFDir);
}
Example 5: setup
import org.apache.mahout.common.Pair; // import the required package/class

@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Preconditions.checkArgument(localFiles != null && localFiles.length >= 1,
      "missing paths from the DistributedCache");
  dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
  maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
  // load the dictionary from the distributed cache: key is the word, value is its id
  Path dictionaryFile = new Path(localFiles[0].getPath());
  for (Pair<Writable, IntWritable> record
      : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().toString(), record.getSecond().get());
  }
}
Example 6: calculateDF
import org.apache.mahout.common.Pair; // import the required package/class

/**
 * Calculates the document frequencies of all terms from the input set of vectors in
 * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature
 * chunk per node, thereby splitting the process across multiple map/reduce tasks.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param output
 *          output directory where document frequencies will be stored
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during the
 *          Map/Reduce stage. It is recommended that you calculate this based on the number of cores
 *          and the free memory available per node. Say you have 2 cores and around 1 GB of extra
 *          memory to spare; a split size of around 400-500 MB then lets two simultaneous reducers
 *          create partial vectors without thrashing the system due to increased swapping.
 */
public static Pair<Long[], List<Path>> calculateDF(Path input,
                                                   Path output,
                                                   Configuration baseConf,
                                                   int chunkSizeInMegabytes)
    throws IOException, InterruptedException, ClassNotFoundException {
  // clamp the chunk size to the supported range
  if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
    chunkSizeInMegabytes = MIN_CHUNKSIZE;
  } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
    chunkSizeInMegabytes = MAX_CHUNKSIZE;
  }
  Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);
  startDFCounting(input, wordCountPath, baseConf);
  return createDictionaryChunks(wordCountPath, output, baseConf, chunkSizeInMegabytes);
}
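The Pair<Long[], List<Path>> returned here is the same structure that Example 4's pruneVectors consumes as docFrequenciesFeatures (its getSecond() is the list of frequency-chunk paths that pruneVectors iterates over). Below is a hedged sketch of wiring the two methods together; all paths, the chunk size, the maxDF threshold, the norm settings and the reducer count are illustrative assumptions, not values from the original code.

// Illustrative wiring of Example 6's output into Example 4's pruneVectors.
Configuration baseConf = new Configuration();
Path tfVectors = new Path("/output/tf-vectors");   // hypothetical input directory
Path dfCounts = new Path("/output/df-count");      // hypothetical output directory
Pair<Long[], List<Path>> docFrequenciesFeatures =
    calculateDF(tfVectors, dfCounts, baseConf, 64); // 64 MB chunks, illustrative
pruneVectors(tfVectors,
    new Path("/output/tf-vectors-pruned"),
    new Path("/output/tf-vectors-pruned-partial"),
    50L,    // maxDF threshold, illustrative
    baseConf,
    docFrequenciesFeatures,
    2.0f,   // normPower, illustrative
    false,  // logNormalize
    1);     // numReducers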
Example 7: setup
import org.apache.mahout.common.Pair; // import the required package/class

@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Preconditions.checkArgument(localFiles != null && localFiles.length >= 1,
      "missing paths from the DistributedCache");
  vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
  featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
  minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
  maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
  Path dictionaryFile = new Path(localFiles[0].getPath());
  // key is the feature id, value is its document frequency
  for (Pair<IntWritable, LongWritable> record
      : new SequenceFileIterable<IntWritable, LongWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().get(), record.getSecond().get());
  }
}
Example 8: setup
import org.apache.mahout.common.Pair; // import the required package/class

@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Preconditions.checkArgument(localFiles != null && localFiles.length >= 1,
      "missing paths from the DistributedCache");
  maxDf = conf.getLong(HighDFWordsPruner.MAX_DF, -1);
  Path dictionaryFile = new Path(localFiles[0].getPath());
  // key is the feature id, value is its document frequency
  for (Pair<IntWritable, LongWritable> record :
      new SequenceFileIterable<IntWritable, LongWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().get(), record.getSecond().get());
  }
}
Example 9: processOutput
import org.apache.mahout.common.Pair; // import the required package/class

protected RuleBase processOutput(JobContext job, Path outputPath) throws IOException {
  Configuration conf = job.getConfiguration();
  FileSystem fs = outputPath.getFileSystem(conf);
  Path[] outfiles = Chi_RWCSUtils.listOutputFiles(fs, outputPath);
  RuleBase ruleBase = null;
  // read all the outputs, keeping the first RuleBase encountered
  for (Path path : outfiles) {
    for (Pair<LongWritable, RuleBase> record : new SequenceFileIterable<LongWritable, RuleBase>(path, conf)) {
      if (ruleBase == null) {
        ruleBase = record.getSecond();
      }
    }
  }
  return ruleBase;
}
Example 10: computeNext
import org.apache.mahout.common.Pair; // import the required package/class

@Override
protected Pair<K, V> computeNext() {
  // reuse the key/value instances when requested, otherwise create fresh ones for each record
  if (!reuseKeyValueInstances || value == null) {
    key = ReflectionUtils.newInstance(keyClass, conf);
    if (!noValue) {
      value = ReflectionUtils.newInstance(valueClass, conf);
    }
  }
  try {
    boolean available;
    if (noValue) {
      available = reader.next(key);
    } else {
      available = reader.next(key, value);
    }
    if (!available) {
      close();
      return null;
    }
    return new Pair<K, V>(key, value);
  } catch (IOException ioe) {
    close();
    throw new IllegalStateException(ioe);
  }
}
Example 11: StableFixedSizeSamplingIterator
import org.apache.mahout.common.Pair; // import the required package/class

public StableFixedSizeSamplingIterator(int size, Iterator<T> source) {
  List<Pair<Integer, T>> buf = Lists.newArrayListWithCapacity(size);
  int sofar = 0;
  Random random = RandomUtils.getRandom();
  // reservoir sampling: each element is paired with its original position so order can be restored
  while (source.hasNext()) {
    T v = source.next();
    sofar++;
    if (buf.size() < size) {
      buf.add(new Pair<Integer, T>(sofar, v));
    } else {
      int position = random.nextInt(sofar);
      if (position < buf.size()) {
        buf.set(position, new Pair<Integer, T>(sofar, v));
      }
    }
  }
  // sort by original position, then strip the positions off when iterating
  Collections.sort(buf);
  delegate = Iterators.transform(buf.iterator(),
      new Function<Pair<Integer, T>, T>() {
        @Override
        public T apply(Pair<Integer, T> from) {
          return from.getSecond();
        }
      });
}
Example 12: move
import org.apache.mahout.common.Pair; // import the required package/class

public static void move(MyVectorInput input, MyVectorOutput output) throws IOException {
  // move the dictionary
  Pair<String, Integer> dictEntry;
  int terms = 0, vectors = 0;
  while ((dictEntry = input.nextDictEntry()) != null) {
    output.writeDictEntry(dictEntry);
    terms++;
  }
  System.out.println("Moved " + terms + " dict terms");
  // move the vectors
  MySparseVector vector;
  while ((vector = input.nextVector()) != null) {
    output.writeVector(vector);
    vectors++;
  }
  output.close();
  System.out.println("Moved " + vectors + " document vectors");
}
Example 13: popMin
import org.apache.mahout.common.Pair; // import the required package/class

private static Pair<Integer, Double> popMin(MySparseVector v) {
  // find the entry with the smallest index
  int minIndexIndex = -1, minIndexValue = Integer.MAX_VALUE;
  double minValue = -1;
  for (int i = 0; i < v.size(); i++) {
    if (minIndexValue > v.indices.get(i)) {
      minIndexIndex = i;
      minIndexValue = v.indices.get(i);
      minValue = v.values.get(i);
    }
  }
  // remove it from the vector and return it as an (index, value) pair
  v.indices.remove(minIndexIndex);
  v.values.remove(minIndexIndex);
  return new Pair<Integer, Double>(minIndexValue, minValue);
}
Example 14: nextDictEntry
import org.apache.mahout.common.Pair; // import the required package/class

@Override
public Pair<String, Integer> nextDictEntry() throws IOException {
  String line, term;
  while ((line = reader.readLine()) != null) {
    if (!line.startsWith("@attribute")) {
      if (line.startsWith("@data")) {
        break; // end of dictionary
      } else {
        continue;
      }
    }
    if (line.contains("@@[email protected]@")) {
      continue;
    }
    // handle the new term
    termCount++; // new term
    // strip the term: "@attribute" is 10 chars long, so skip it and the following whitespace
    int beginningOfTerm = 11;
    int endOfTerm = line.lastIndexOf("numeric") - 1;
    term = line.substring(beginningOfTerm, endOfTerm);
    // return the new term with its id
    return new Pair<String, Integer>(term, termCount);
  }
  return null;
}
Example 15: nextVector
import org.apache.mahout.common.Pair; // import the required package/class

@Override
public MySparseVector nextVector() throws IOException {
  if (!vecIterator.hasNext()) {
    return null;
  }
  Pair<Text, VectorWritable> entry = vecIterator.next();
  String name = entry.getFirst().toString();
  VectorWritable mahoutVector = entry.getSecond();
  // copy the non-zero elements of the Mahout vector into parallel index/value lists
  ArrayList<Integer> indices = new ArrayList<Integer>();
  ArrayList<Double> values = new ArrayList<Double>();
  for (Element e : mahoutVector.get().all()) {
    double value = e.get();
    if (value == 0) {
      continue;
    }
    values.add(value);
    int index = e.index();
    indices.add(index);
  }
  return new MySparseVector(indices, values);
}