

Java DistributedCache.setCacheFiles Method Code Examples

This article collects typical usage examples of the Java method org.apache.hadoop.filecache.DistributedCache.setCacheFiles. If you have been asking yourself how DistributedCache.setCacheFiles works, what it is for, or where to find usage examples, the curated snippets below should help. You can also explore further usage examples of the enclosing class, org.apache.hadoop.filecache.DistributedCache.


The following presents 7 code examples of DistributedCache.setCacheFiles, ordered by popularity.
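Before the examples, here is a minimal end-to-end sketch of the pattern they all share: the driver registers file URIs with DistributedCache.setCacheFiles, and each task reads the same URIs back with DistributedCache.getCacheFiles. This sketch is illustrative only; the class CacheFileSketch and its mapper are invented for this article, not taken from the projects below.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public final class CacheFileSketch {

  private CacheFileSketch() { }

  // Driver side: register the files; setCacheFiles replaces any list set earlier.
  public static void registerCacheFiles(Path[] files, Configuration conf) {
    URI[] uris = new URI[files.length];
    for (int i = 0; i < files.length; i++) {
      uris[i] = files[i].toUri();
    }
    DistributedCache.setCacheFiles(uris, conf);
  }

  // Task side: read the registered URIs back out of the job configuration.
  public static class CacheAwareMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void setup(Context context) throws IOException {
      URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
      if (cacheFiles != null) {
        // open each URI with FileSystem.get(uri, conf) and load whatever is needed
      }
    }
  }
}

Every example below is a variation on the driver half of this pattern.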

Example 1: writeDocTopicInference

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
  throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setMapperClass(CVB0DocInferenceMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  FileSystem fs = FileSystem.get(corpus.toUri(), conf);
  if (modelInput != null && fs.exists(modelInput)) {
    FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
    URI[] modelUris = new URI[statuses.length];
    for (int i = 0; i < statuses.length; i++) {
      modelUris[i] = statuses[i].getPath().toUri();
    }
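    // ship all model part files to the tasks through the distributed cache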
    DistributedCache.setCacheFiles(modelUris, conf);
  }
  FileInputFormat.addInputPath(job, corpus);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(CVB0Driver.class);
  job.submit();
  return job;
}
 
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Line count: 27, Source: CVB0Driver.java

Example 2: save

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
/**
 * Saves the given Vector to a SequenceFile at {@code output} and registers that
 * file in the distributed cache carried by {@code conf}.
 *
 * @param key SequenceFile key
 * @param vector Vector to save, to be wrapped as VectorWritable
 * @param output path of the SequenceFile to write
 * @param conf configuration in which the cache file is registered
 * @param overwritePath whether to delete any existing data at {@code output} first
 * @param deleteOnExit whether to delete the file when the FileSystem is closed
 */
public static void save(Writable key,
                        Vector vector,
                        Path output,
                        Configuration conf,
                        boolean overwritePath,
                        boolean deleteOnExit) throws IOException {
  
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  output = fs.makeQualified(output);
  if (overwritePath) {
    HadoopUtil.delete(conf, output);
  }

  // set the cache
  DistributedCache.setCacheFiles(new URI[] {output.toUri()}, conf);
  
  // set up the writer
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, 
      IntWritable.class, VectorWritable.class);
  try {
    writer.append(key, new VectorWritable(vector));
  } finally {
    Closeables.closeQuietly(writer);
  }

  if (deleteOnExit) {
    fs.deleteOnExit(output);
  }
}
 
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Line count: 35, Source: VectorCache.java
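As a companion to Example 2, once save() has run, a task can read the cached vector back along the following lines. This helper is a sketch written for this article (the class VectorCacheReader is not part of the original project):

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public final class VectorCacheReader {

  private VectorCacheReader() { }

  // Returns the first vector found in the first registered cache file, or null.
  public static Vector load(Configuration conf) throws IOException {
    URI[] files = DistributedCache.getCacheFiles(conf);
    if (files == null || files.length == 0) {
      return null;
    }
    FileSystem fs = FileSystem.get(files[0], conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(files[0]), conf);
    try {
      IntWritable key = new IntWritable();
      VectorWritable value = new VectorWritable();
      return reader.next(key, value) ? value.get() : null;
    } finally {
      reader.close();
    }
  }
}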

Example 3: pruneVectorsPartial

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
                                        Configuration baseConf) throws IOException, InterruptedException,
        ClassNotFoundException {

  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,"
                  + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(MAX_DF, maxDF);
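  // make the dictionary file available to every task through the distributed cache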
  DistributedCache.setCacheFiles(
          new URI[]{dictionaryFilePath.toUri()}, conf);

  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
          Mapper.class, null, null, WordsPrunerReducer.class,
          Text.class, VectorWritable.class, SequenceFileOutputFormat.class,
          conf);
  job.setJobName(": Prune Vectors: input-folder: " + input
          + ", dictionary-file: " + dictionaryFilePath.toString());

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
 
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Line count: 29, Source: HighDFWordsPruner.java

Example 4: cacheFiles

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
public static void cacheFiles(Path fileToCache, Configuration conf) {
  DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
}
 
Developer ID: huyang1, Project: LDA, Line count: 4, Source: HadoopUtil.java
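Note that setCacheFiles replaces whatever list of cache files was registered before, so a helper like this is only safe when it is the sole writer. Example 5 below shows the append pattern (copy the existing URIs into a larger array first) for when earlier entries must be preserved.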

Example 5: setupPipesJob

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat", 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf,"mapred.map.task.debug.script",defScript);
    setIfUnset(conf,"mapred.reduce.task.debug.script",defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
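    // shift the existing entries right by one; slot 0 is reserved for the executable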
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing executable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
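  // re-register the combined list so the executable ships with the job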
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Developer ID: Nextzero, Project: hadoop-2.6.0-cdh5.4.3, Line count: 59, Source: Submitter.java

Example 6: makePartialVectors

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and their IDs
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          cardinality of the generated vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input,
                                       Configuration baseConf,
                                       int maxNGramSize,
                                       Path dictionaryFilePath,
                                       Path output,
                                       int dimension,
                                       boolean sequentialAccess, 
                                       boolean namedVectors,
                                       int numReducers)
  throws IOException, InterruptedException, ClassNotFoundException {
  
  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setInt(PartialVectorMerger.DIMENSION, dimension);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
  conf.setInt(MAX_NGRAMS, maxNGramSize);   
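  // distribute the dictionary chunk to the tasks via the cache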
  DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
  
  Job job = new Job(conf);
  job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
                  + ", dictionary-file: " + dictionaryFilePath);
  job.setJarByClass(DictionaryVectorizer.class);
  
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(StringTuple.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  
  FileOutputFormat.setOutputPath(job, output);
  
  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(numReducers);

  HadoopUtil.delete(conf, output);
  
  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
 
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Line count: 70, Source: DictionaryVectorizer.java

Example 7: makePartialVectors

import org.apache.hadoop.filecache.DistributedCache; // import the package/class the method depends on
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 * 
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDF
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and their IDs
 * @param output
 *          output directory where the partial vectors have to be created
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVector
 *          output vectors should be named, retaining key (doc id) as a label
 */
private static void makePartialVectors(Path input,
                                       Configuration baseConf,
                                       Long featureCount,
                                       Long vectorCount,
                                       int minDf,
                                       long maxDF,
                                       Path dictionaryFilePath,
                                       Path output,
                                       boolean sequentialAccess,
                                       boolean namedVector)
  throws IOException, InterruptedException, ClassNotFoundException {

  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(FEATURE_COUNT, featureCount);
  conf.setLong(VECTOR_COUNT, vectorCount);
  conf.setInt(MIN_DF, minDf);
  conf.setLong(MAX_DF, maxDF);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
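  // hand the dictionary chunk to each task through the distributed cache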
  DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);

  Job job = new Job(conf);
  job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
      + dictionaryFilePath.toString());
  job.setJarByClass(TFIDFConverter.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);

  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFIDFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
 
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Line count: 71, Source: TFIDFConverter.java


Note: The org.apache.hadoop.filecache.DistributedCache.setCacheFiles examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Before distributing or reusing the code, consult the license of the corresponding project; do not reproduce without permission.