本文整理汇总了Java中org.apache.hadoop.filecache.DistributedCache.setCacheFiles方法的典型用法代码示例。如果您正苦于以下问题：Java DistributedCache.setCacheFiles方法的具体用法？Java DistributedCache.setCacheFiles怎么用？Java DistributedCache.setCacheFiles使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.hadoop.filecache.DistributedCache的用法示例。
在下文中一共展示了DistributedCache.setCacheFiles方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: writeDocTopicInference
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Configures and submits the map-only job that writes the final document/topic inference
 * for {@code corpus} into {@code output}.
 *
 * <p>The job is started with {@link Job#submit()}, so this method returns without waiting
 * for it to finish.
 *
 * @param conf base configuration; the job is created from it and works on its own copy
 * @param corpus input path of the job, read as a SequenceFile
 * @param modelInput directory containing the model part files; when {@code null} or
 *     non-existent no cache files are registered
 * @param output output path of the job, written as a SequenceFile
 * @return the submitted job
 */
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
  log.info("About to run: " + jobName);
  Job job = new Job(conf, jobName);
  job.setMapperClass(CVB0DocInferenceMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);
  FileSystem fs = FileSystem.get(corpus.toUri(), conf);
  if (modelInput != null && fs.exists(modelInput)) {
    FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
    URI[] modelUris = new URI[statuses.length];
    for (int i = 0; i < statuses.length; i++) {
      modelUris[i] = statuses[i].getPath().toUri();
    }
    // Job copies the Configuration it is constructed with, so the cache files have to be
    // registered on the job's own configuration; setting them on conf at this point would
    // never reach the submitted job.
    DistributedCache.setCacheFiles(modelUris, job.getConfiguration());
  }
  FileInputFormat.addInputPath(job, corpus);
  FileOutputFormat.setOutputPath(job, output);
  job.setJarByClass(CVB0Driver.class);
  job.submit();
  return job;
}
示例2: save
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Writes a single key/vector pair to a new SequenceFile at {@code output} and sets that
 * file as the DistributedCache file list of {@code conf}.
 *
 * @param key SequenceFile key; the writer is created with {@code IntWritable} as key class
 * @param vector Vector to save, to be wrapped as VectorWritable
 * @param output location of the file; it is qualified against its file system before use
 * @param conf configuration used to resolve the file system and to record the cache file
 * @param overwritePath when {@code true}, {@code output} is deleted before it is written
 * @param deleteOnExit when {@code true}, {@code output} is registered with
 *     {@code FileSystem.deleteOnExit}
 * @throws IOException if the file cannot be written or closed
 */
public static void save(Writable key,
                        Vector vector,
                        Path output,
                        Configuration conf,
                        boolean overwritePath,
                        boolean deleteOnExit) throws IOException {
  FileSystem fs = FileSystem.get(output.toUri(), conf);
  output = fs.makeQualified(output);
  if (overwritePath) {
    HadoopUtil.delete(conf, output);
  }
  // set the cache
  DistributedCache.setCacheFiles(new URI[] {output.toUri()}, conf);
  // set up the writer
  SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
      IntWritable.class, VectorWritable.class);
  boolean threw = true;
  try {
    writer.append(key, new VectorWritable(vector));
    threw = false;
  } finally {
    // A failure while closing a writer can mean the data never reached the file, so it is
    // only swallowed when append() already failed and its exception is the one to report.
    Closeables.close(writer, threw);
  }
  if (deleteOnExit) {
    fs.deleteOnExit(output);
  }
}
示例3: pruneVectorsPartial
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Runs the "Prune Vectors" job over {@code input} and waits for it to complete.
 *
 * <p>The job is built from a copy of {@code baseConf} that carries {@code maxDF} under
 * {@code MAX_DF} and lists {@code dictionaryFilePath} as its DistributedCache file.
 * {@code output} is deleted before the job is started.
 *
 * @param input input path of the job
 * @param output output path of the job
 * @param dictionaryFilePath dictionary file to register in the DistributedCache
 * @param maxDF value stored under {@code MAX_DF}
 * @param baseConf configuration to copy; the copy is what gets modified
 * @throws IllegalStateException if the job does not succeed
 */
private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
                                        Configuration baseConf)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration jobConf = new Configuration(baseConf);
  jobConf.setLong(MAX_DF, maxDF);

  // Java serialization has to be enabled so that conf values can be serialised.
  String serializations = "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization";
  jobConf.set("io.serializations", serializations);

  URI[] cacheEntries = {dictionaryFilePath.toUri()};
  DistributedCache.setCacheFiles(cacheEntries, jobConf);

  Job pruneJob = HadoopUtil.prepareJob(
      input,
      output,
      SequenceFileInputFormat.class,
      Mapper.class,
      null,
      null,
      WordsPrunerReducer.class,
      Text.class,
      VectorWritable.class,
      SequenceFileOutputFormat.class,
      jobConf);
  String jobName = ": Prune Vectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath;
  pruneJob.setJobName(jobName);

  // Clear any previous output before running.
  HadoopUtil.delete(jobConf, output);
  if (!pruneJob.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}
示例4: cacheFiles
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Sets the DistributedCache file list of {@code conf} to the single entry {@code fileToCache}.
 *
 * @param fileToCache file whose URI is recorded as the cache file
 * @param conf configuration that receives the cache-file entry
 */
public static void cacheFiles(Path fileToCache, Configuration conf) {
  URI cachedUri = fileToCache.toUri();
  URI[] cacheEntries = {cachedUri};
  DistributedCache.setCacheFiles(cacheEntries, conf);
}
示例5: setupPipesJob
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Adjusts {@code conf} in place for a pipes job: installs the pipes map runner, partitioner,
 * reducer and input/output formats where the corresponding part is not implemented in Java,
 * defaults the key/value classes to {@code Text}, and puts the executable at the front of the
 * DistributedCache file list.
 *
 * @param conf job configuration to modify
 * @throws IllegalArgumentException if no executable is configured
 * @throws IOException if the executable location cannot be parsed as a URI
 */
private static void setupPipesJob(JobConf conf) throws IOException {
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Remember the user's partitioner before replacing it with ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }

  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }

  // Map output and job output types default to Text when the user has not chosen any.
  final String textType = Text.class.getName();
  final String[] typeKeys = {
      "mapred.mapoutput.key.class",
      "mapred.mapoutput.value.class",
      "mapred.output.key.class",
      "mapred.output.value.class"
  };
  for (String typeKey : typeKeys) {
    setIfUnset(conf, typeKey, textType);
  }

  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!(getIsJavaRecordReader(conf) || getIsJavaMapper(conf))) {
    conf.setClass("mapred.pipes.user.inputformat",
        conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }

  final String executable = getExecutable(conf);
  if (executable == null) {
    throw new IllegalArgumentException("No application program defined.");
  }

  // The default debug script is only added when the executable is given as
  // <path>#<executable>.
  if (executable.contains("#")) {
    DistributedCache.createSymlink(conf);
    // default gdb commands for map and reduce tasks
    final String defaultScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, "mapred.map.task.debug.script", defaultScript);
    setIfUnset(conf, "mapred.reduce.task.debug.script", defaultScript);
  }

  // Build the new cache list: slot 0 is reserved for the executable, any files that were
  // already registered follow it in their original order.
  final URI[] existing = DistributedCache.getCacheFiles(conf);
  final int existingCount = (existing == null) ? 0 : existing.length;
  final URI[] cacheFiles = new URI[existingCount + 1];
  if (existing != null) {
    System.arraycopy(existing, 0, cacheFiles, 1, existingCount);
  }
  try {
    cacheFiles[0] = new URI(executable);
  } catch (URISyntaxException e) {
    throw new IOException("Problem parsing execable URI " + executable, e);
  }
  DistributedCache.setCacheFiles(cacheFiles, conf);
}
示例6: makePartialVectors
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Builds partial vectors from the input documents using one chunk of the dictionary, and
 * blocks until the job has finished. The input documents must be in {@link SequenceFile}
 * format.
 *
 * @param input
 *          directory holding the documents in {@link SequenceFile} format
 * @param baseConf
 *          configuration to copy for the job; only the copy is modified
 * @param maxNGramSize
 *          largest ngram size to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids; registered in the DistributedCache
 * @param output
 *          directory in which the partial vectors are created; deleted before the job runs
 * @param dimension
 *          value stored under {@code PartialVectorMerger.DIMENSION}
 * @param sequentialAccess
 *          whether output vectors should be optimized for sequential access
 * @param namedVectors
 *          whether output vectors should be named, keeping the key (doc id) as a label
 * @param numReducers
 *          number of reducer tasks to request
 * @throws IllegalStateException
 *           if the job does not succeed
 */
private static void makePartialVectors(Path input,
                                       Configuration baseConf,
                                       int maxNGramSize,
                                       Path dictionaryFilePath,
                                       Path output,
                                       int dimension,
                                       boolean sequentialAccess,
                                       boolean namedVectors,
                                       int numReducers)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration jobConf = new Configuration(baseConf);

  // Java serialization has to be enabled so that conf values can be serialised.
  String serializations = "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization";
  jobConf.set("io.serializations", serializations);

  jobConf.setInt(MAX_NGRAMS, maxNGramSize);
  jobConf.setInt(PartialVectorMerger.DIMENSION, dimension);
  jobConf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  jobConf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);

  URI[] cacheEntries = {dictionaryFilePath.toUri()};
  DistributedCache.setCacheFiles(cacheEntries, jobConf);

  // Everything the job needs from the configuration is in place before it is created.
  Job vectorJob = new Job(jobConf);
  String jobName = "DictionaryVectorizer::MakePartialVectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath;
  vectorJob.setJobName(jobName);
  vectorJob.setJarByClass(DictionaryVectorizer.class);

  // input side
  vectorJob.setInputFormatClass(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(vectorJob, input);
  vectorJob.setMapperClass(Mapper.class);
  vectorJob.setMapOutputKeyClass(Text.class);
  vectorJob.setMapOutputValueClass(StringTuple.class);

  // output side
  vectorJob.setReducerClass(TFPartialVectorReducer.class);
  vectorJob.setNumReduceTasks(numReducers);
  vectorJob.setOutputKeyClass(Text.class);
  vectorJob.setOutputValueClass(VectorWritable.class);
  vectorJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(vectorJob, output);

  // Clear any previous output before running.
  HadoopUtil.delete(jobConf, output);
  if (!vectorJob.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}
示例7: makePartialVectors
import org.apache.hadoop.filecache.DistributedCache; //导入方法依赖的package包/类
/**
 * Builds partial tfidf vectors from the input vectors using one chunk of the dictionary, and
 * blocks until the job has finished. The input vectors must be in {@link SequenceFile} format.
 *
 * @param input
 *          directory holding the vectors in {@link SequenceFile} format
 * @param baseConf
 *          configuration to copy for the job; only the copy is modified
 * @param featureCount
 *          number of unique features in the dataset
 * @param vectorCount
 *          number of vectors in the dataset
 * @param minDf
 *          the minimum document frequency. Default 1
 * @param maxDF
 *          the max percentage of vectors for the DF. Can be used to remove really high frequency
 *          features. Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids; registered in the DistributedCache
 * @param output
 *          directory in which the partial vectors are created; deleted before the job runs
 * @param sequentialAccess
 *          whether output vectors should be optimized for sequential access
 * @param namedVector
 *          whether output vectors should be named, keeping the key (doc id) as a label
 * @throws IllegalStateException
 *           if the job does not succeed
 */
private static void makePartialVectors(Path input,
                                       Configuration baseConf,
                                       Long featureCount,
                                       Long vectorCount,
                                       int minDf,
                                       long maxDF,
                                       Path dictionaryFilePath,
                                       Path output,
                                       boolean sequentialAccess,
                                       boolean namedVector)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration jobConf = new Configuration(baseConf);

  // Java serialization has to be enabled so that conf values can be serialised.
  String serializations = "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization";
  jobConf.set("io.serializations", serializations);

  jobConf.setLong(FEATURE_COUNT, featureCount);
  jobConf.setLong(VECTOR_COUNT, vectorCount);
  jobConf.setInt(MIN_DF, minDf);
  jobConf.setLong(MAX_DF, maxDF);
  jobConf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  jobConf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);

  URI[] cacheEntries = {dictionaryFilePath.toUri()};
  DistributedCache.setCacheFiles(cacheEntries, jobConf);

  // Everything the job needs from the configuration is in place before it is created.
  Job tfidfJob = new Job(jobConf);
  String jobName = ": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
      + dictionaryFilePath;
  tfidfJob.setJobName(jobName);
  tfidfJob.setJarByClass(TFIDFConverter.class);

  // input side
  tfidfJob.setInputFormatClass(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(tfidfJob, input);
  tfidfJob.setMapperClass(Mapper.class);

  // output side
  tfidfJob.setReducerClass(TFIDFPartialVectorReducer.class);
  tfidfJob.setOutputKeyClass(Text.class);
  tfidfJob.setOutputValueClass(VectorWritable.class);
  tfidfJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(tfidfJob, output);

  // Clear any previous output before running.
  HadoopUtil.delete(jobConf, output);
  if (!tfidfJob.waitForCompletion(true)) {
    throw new IllegalStateException("Job failed!");
  }
}