

Java Pair.getSecond Method Code Examples

This article collects and summarizes typical usage examples of the Java method org.apache.mahout.common.Pair.getSecond. If you are asking how Pair.getSecond works, how to call it, or where to find examples of it in use, the curated code examples below should help. You can also explore further usage examples of its enclosing class, org.apache.mahout.common.Pair.


Ten code examples of the Pair.getSecond method are shown below, ordered by popularity.
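Before the examples, here is a minimal, self-contained sketch of the Pair API as it is used throughout this page: construction with new Pair<...>(...) or the Pair.of(...) factory (seen in Example 6), and access through getFirst()/getSecond(). It is an illustrative sketch, not code from any of the projects below.

import org.apache.mahout.common.Pair;

public class PairGetSecondDemo {
  public static void main(String[] args) {
    // Construct a pair explicitly, as the dictionary-loading examples below do.
    Pair<Integer, String> term = new Pair<Integer, String>(42, "mahout");

    // Pair.of(...) is a convenience factory, used in the loadVectors example.
    Pair<Integer, String> same = Pair.of(42, "mahout");

    // getFirst() returns the left element, getSecond() the right element.
    System.out.println(term.getFirst() + " -> " + term.getSecond()); // prints: 42 -> mahout
    System.out.println(same.getSecond());                            // prints: mahout
  }
}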

Example 1: processOutput

import org.apache.mahout.common.Pair; // import the package/class this method depends on
protected RuleBase processOutput(JobContext job, Path outputPath) throws IOException {

  Configuration conf = job.getConfiguration();

  FileSystem fs = outputPath.getFileSystem(conf);

  Path[] outfiles = Chi_RWCSUtils.listOutputFiles(fs, outputPath);

  RuleBase ruleBase = null;

  // read all the outputs
  for (Path path : outfiles) {
    for (Pair<LongWritable, RuleBase> record : new SequenceFileIterable<LongWritable, RuleBase>(path, conf)) {
      if (ruleBase == null) {
        ruleBase = record.getSecond();
      }
    }
  }

  return ruleBase;
}
 
Author: saradelrio, Project: Chi-FRBCS-BigData-Max, Lines: 22, Source: PartialBuilder.java

Example 2: nextVector

import org.apache.mahout.common.Pair; // import the package/class this method depends on
@Override
public MySparseVector nextVector() throws IOException {

    if (!vecIterator.hasNext()) return null;

    Pair<Text, VectorWritable> entry = vecIterator.next();

    String name = entry.getFirst().toString();
    VectorWritable mahoutVector = entry.getSecond();

    ArrayList<Integer> indices = new ArrayList<Integer>();
    ArrayList<Double> values = new ArrayList<Double>();

    // collect only the non-zero cells of the Mahout vector
    for (Element e : mahoutVector.get().all()) {
        double value = e.get();
        if (value == 0) continue;
        values.add(value);
        indices.add(e.index());
    }

    return new MySparseVector(indices, values);
}
 
Author: project-asap, Project: IReS-Platform, Lines: 25, Source: MahoutInput.java
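The Element loop above is the standard way to pull non-zero (index, value) pairs out of a Mahout vector. The following standalone sketch, which assumes only Mahout's math module on the classpath, reproduces that pattern on a hand-built RandomAccessSparseVector; it is illustrative and not part of the IReS-Platform code.

import java.util.ArrayList;
import java.util.List;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;

public class NonZeroExtractionSketch {
  public static void main(String[] args) {
    Vector v = new RandomAccessSparseVector(10); // cardinality 10, all cells start at zero
    v.set(2, 1.5);
    v.set(7, -0.25);

    List<Integer> indices = new ArrayList<Integer>();
    List<Double> values = new ArrayList<Double>();

    // Iterate all cells and skip zeros, mirroring the nextVector() example above.
    for (Element e : v.all()) {
      if (e.get() == 0) continue;
      indices.add(e.index());
      values.add(e.get());
    }

    System.out.println(indices + " " + values); // the non-zero indices and their values
  }
}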

Example 3: loadDictionary

import org.apache.mahout.common.Pair; // import the package/class this method depends on
private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
  if (dictionaryPath == null) {
    return null;
  }
  Path dictionaryFile = new Path(dictionaryPath);
  List<Pair<Integer, String>> termList = Lists.newArrayList();
  int maxTermId = 0;
  // key is the word, value is its id
  for (Pair<Writable, IntWritable> record
          : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
    termList.add(new Pair<Integer, String>(record.getSecond().get(),
        record.getFirst().toString()));
    maxTermId = Math.max(maxTermId, record.getSecond().get());
  }
  String[] terms = new String[maxTermId + 1];
  for (Pair<Integer, String> pair : termList) {
    terms[pair.getFirst()] = pair.getSecond();
  }
  return terms;
}
 
Author: saradelrio, Project: Chi-FRBCS-BigData-Ave, Lines: 21, Source: InMemoryCollapsedVariationalBayes0.java

Example 4: processOutput

import org.apache.mahout.common.Pair; // import the package/class this method depends on
protected RuleBase processOutput(JobContext job, Path outputPath) throws IOException {

  Configuration conf = job.getConfiguration();

  FileSystem fs = outputPath.getFileSystem(conf);

  Path[] outfiles = Chi_RWUtils.listOutputFiles(fs, outputPath);

  RuleBase ruleBase = null;

  // read all the outputs
  for (Path path : outfiles) {
    for (Pair<LongWritable, RuleBase> record : new SequenceFileIterable<LongWritable, RuleBase>(path, conf)) {
      if (ruleBase == null) {
        ruleBase = record.getSecond();
      }
    }
  }

  return ruleBase;
}
 
Author: saradelrio, Project: Chi-FRBCS-BigData-Max, Lines: 22, Source: PartialBuilder.java

Example 5: pruneVectors

import org.apache.mahout.common.Pair; // import the package/class this method depends on
public static void pruneVectors(Path tfDir, Path prunedTFDir, Path prunedPartialTFDir, long maxDF,
                                Configuration baseConf,
                                Pair<Long[], List<Path>> docFrequenciesFeatures,
                                float normPower,
                                boolean logNormalize,
                                int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

  int partialVectorIndex = 0;
  List<Path> partialVectorPaths = new ArrayList<Path>();
  for (Path path : docFrequenciesFeatures.getSecond()) {
    Path partialVectorOutputPath = new Path(prunedPartialTFDir, "partial-" + partialVectorIndex++);
    partialVectorPaths.add(partialVectorOutputPath);
    pruneVectorsPartial(tfDir, partialVectorOutputPath, path, maxDF, baseConf);
  }

  mergePartialVectors(partialVectorPaths, prunedTFDir, baseConf, normPower, logNormalize, numReducers);
  HadoopUtil.delete(new Configuration(baseConf), prunedPartialTFDir);
}
 
Author: saradelrio, Project: Chi-FRBCS-BigData-Max, Lines: 19, Source: HighDFWordsPruner.java

Example 6: loadVectors

import org.apache.mahout.common.Pair; // import the package/class this method depends on
private static Matrix loadVectors(String vectorPathString, Configuration conf)
  throws IOException {
  Path vectorPath = new Path(vectorPathString);
  FileSystem fs = vectorPath.getFileSystem(conf);
  List<Path> subPaths = Lists.newArrayList();
  if (fs.isFile(vectorPath)) {
    subPaths.add(vectorPath);
  } else {
    for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
      subPaths.add(fileStatus.getPath());
    }
  }
  List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
  int numRows = Integer.MIN_VALUE;
  int numCols = -1;
  boolean sequentialAccess = false;
  for (Path subPath : subPaths) {
    for (Pair<IntWritable, VectorWritable> record
        : new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
      int id = record.getFirst().get();
      Vector vector = record.getSecond().get();
      if (vector instanceof NamedVector) {
        vector = ((NamedVector)vector).getDelegate();
      }
      if (numCols < 0) {
        numCols = vector.size();
        sequentialAccess = vector.isSequentialAccess();
      }
      rowList.add(Pair.of(id, vector));
      numRows = Math.max(numRows, id);
    }
  }
  numRows++;
  Vector[] rowVectors = new Vector[numRows];
  for (Pair<Integer, Vector> pair : rowList) {
    rowVectors[pair.getFirst()] = pair.getSecond();
  }
  return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);

}
 
Author: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 41, Source: InMemoryCollapsedVariationalBayes0.java

Example 7: TopicModel

import org.apache.mahout.common.Pair; // import the package/class this method depends on
private TopicModel(Pair<Matrix, Vector> model, double eta, double alpha, String[] dict,
    int numThreads, double modelWeight) {
  this(model.getFirst(), model.getSecond(), eta, alpha, dict, numThreads, modelWeight);
}
 
Author: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 5, Source: TopicModel.java

Example 8: createDictionaryChunks

import org.apache.mahout.common.Pair; // import the package/class this method depends on
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath,
                                                               Path dictionaryPathBase,
                                                               Configuration baseConf,
                                                               int chunkSizeInMegabytes) throws IOException {
  List<Path> chunkPaths = Lists.newArrayList();
  Configuration conf = new Configuration(baseConf);

  FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

  long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
  int chunkIndex = 0;
  Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
  chunkPaths.add(chunkPath);
  SequenceFile.Writer freqWriter =
    new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

  try {
    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
    for (Pair<IntWritable,LongWritable> record
         : new SequenceFileDirIterable<IntWritable,LongWritable>(filesPattern,
                                                                 PathType.GLOB,
                                                                 null,
                                                                 null,
                                                                 true,
                                                                 conf)) {

      if (currentChunkSize > chunkSizeLimit) {
        Closeables.closeQuietly(freqWriter);
        chunkIndex++;

        chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);

        freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
        currentChunkSize = 0;
      }

      int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
      currentChunkSize += fieldSize;
      IntWritable key = record.getFirst();
      LongWritable value = record.getSecond();
      if (key.get() >= 0) {
        freqWriter.append(key, value);
      } else if (key.get() == -1) {
        vectorCount = value.get();
      }
      featureCount = Math.max(key.get(), featureCount);

    }
    featureCount++;
    Long[] counts = {featureCount, vectorCount};
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
  } finally {
    Closeables.closeQuietly(freqWriter);
  }
}
 
Author: saradelrio, Project: Chi-FRBCS-BigData-Max, Lines: 64, Source: TFIDFConverter.java
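As the javadoc above notes, the chunk files written here are plain SequenceFiles of IntWritable/LongWritable pairs, so they can be read back with the same iterator pattern used elsewhere on this page. Below is a minimal, hypothetical reader sketch; the chunk path and the counting logic are assumptions for illustration, not code from the Chi-FRBCS-BigData-Max project.

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;

public class FrequencyChunkReaderSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical path to one chunk produced by createDictionaryChunks().
    Path chunkPath = new Path("/tmp/tfidf/frequency.file-0");

    Map<Integer, Long> docFrequencies = new HashMap<Integer, Long>();
    for (Pair<IntWritable, LongWritable> record
        : new SequenceFileIterable<IntWritable, LongWritable>(chunkPath, true, conf)) {
      // getFirst() is the feature id, getSecond() its document frequency.
      docFrequencies.put(record.getFirst().get(), record.getSecond().get());
    }
    System.out.println("loaded " + docFrequencies.size() + " document frequencies");
  }
}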

Example 9: processTfIdf

import org.apache.mahout.common.Pair; // import the package/class this method depends on
/**
 * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
 * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
 * per node, thereby splitting the process across multiple map/reduce passes.
 * calculateDF should be called before using this method.
 * 
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param datasetFeatures
 *          Document frequencies information calculated by calculateDF
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDF
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param numReducers 
 *          The number of reducers to spawn. This also affects the possible parallelism since each reducer
 *          will typically produce a single output file containing tf-idf vectors for a subset of the
 *          documents in the corpus.
 */
public static void processTfIdf(Path input,
                                Path output,
                                Configuration baseConf,
                                Pair<Long[], List<Path>> datasetFeatures,
                                int minDf,
                                long maxDF,
                                float normPower,
                                boolean logNormalize,
                                boolean sequentialAccessOutput,
                                boolean namedVector,
                                int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
  Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
      "If specified normPower must be nonnegative", normPower);
  Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING
                              || (normPower > 1 && !Double.isInfinite(normPower))
                              || !logNormalize,
      "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

  int partialVectorIndex = 0;
  List<Path> partialVectorPaths = Lists.newArrayList();
  List<Path> dictionaryChunks = datasetFeatures.getSecond();
  for (Path dictionaryChunk : dictionaryChunks) {
    Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
    partialVectorPaths.add(partialVectorOutputPath);
    makePartialVectors(input,
                       baseConf,
                       datasetFeatures.getFirst()[0],
                       datasetFeatures.getFirst()[1],
                       minDf,
                       maxDF,
                       dictionaryChunk,
                       partialVectorOutputPath,
                       sequentialAccessOutput,
                       namedVector);
  }

  Configuration conf = new Configuration(baseConf);

  Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
  
  PartialVectorMerger.mergePartialVectors(partialVectorPaths,
                                          outputDir,
                                          baseConf,
                                          normPower,
                                          logNormalize,
                                          datasetFeatures.getFirst()[0].intValue(),
                                          sequentialAccessOutput,
                                          namedVector,
                                          numReducers);
  HadoopUtil.delete(conf, partialVectorPaths);

}
 
Author: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 76, Source: TFIDFConverter.java
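To show how the Pair<Long[], List<Path>> produced by calculateDF flows into processTfIdf (the javadoc above requires calculateDF to run first), here is a minimal, hypothetical driver sketch. The paths, chunk size, and parameter values are assumptions; the calculateDF signature and the org.apache.mahout.vectorizer.tfidf package are taken from the stock Mahout TFIDFConverter these projects derive from, so verify them against the version actually on your classpath.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.Pair;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class TfIdfDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path tfVectors = new Path("/tmp/vectors/tf-vectors"); // hypothetical TF vector input
    Path dfCounts = new Path("/tmp/vectors/df-count");    // hypothetical DF count output
    Path tfidfOut = new Path("/tmp/vectors/tfidf");       // hypothetical TF-IDF output

    // Step 1: count document frequencies; returns {featureCount, vectorCount} plus the chunk paths.
    Pair<Long[], List<Path>> datasetFeatures =
        TFIDFConverter.calculateDF(tfVectors, dfCounts, conf, 64 /* chunk size in MB */);

    // Step 2: build the tf-idf vectors from the TF vectors and the DF information,
    // matching the parameter order shown in the example above.
    TFIDFConverter.processTfIdf(tfVectors, tfidfOut, conf, datasetFeatures,
        1,      // minDf
        99,     // maxDF (percentage)
        2.0f,   // normPower (L2 norm)
        false,  // logNormalize
        true,   // sequentialAccessOutput
        false,  // namedVector
        1);     // numReducers
  }
}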

Example 10: writeDictEntry

import org.apache.mahout.common.Pair; // import the package/class this method depends on
@Override
public void writeDictEntry(Pair<String, Integer> entry) throws IOException {
    Text key = new Text(entry.getFirst());
    IntWritable value = new IntWritable(entry.getSecond());
    dictWriter.append(key, value);
}
 
Author: project-asap, Project: IReS-Platform, Lines: 7, Source: SparkOutput.java


Note: the org.apache.mahout.common.Pair.getSecond examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.