This article collects and summarizes typical usage examples of the Java method org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPaths. If you are wondering what FileInputFormat.addInputPaths does, how to call it, or where to find examples of it, the curated code samples below should help. You can also explore further usage examples of the enclosing class, org.apache.hadoop.mapreduce.lib.input.FileInputFormat.
The following shows 15 code examples of the FileInputFormat.addInputPaths method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
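Before the individual examples, here is a minimal, self-contained sketch of how addInputPaths is typically wired into a job driver. The class AddInputPathsDemo and the mapper/reducer names MyMapper and MyReducer are placeholders invented for illustration; only the FileInputFormat/FileOutputFormat calls reflect the actual Hadoop API (addInputPaths takes a comma-separated list of paths and adds each one to the job input).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AddInputPathsDemo {
    public static void main(String[] args) throws Exception {
        // args[0] holds one or more input paths separated by commas,
        // e.g. "hdfs:///data/part1,hdfs:///data/part2"; args[1] is the output directory.
        Job job = Job.getInstance(new Configuration(), "addInputPaths-demo");
        job.setJarByClass(AddInputPathsDemo.class);
        job.setMapperClass(MyMapper.class);    // placeholder mapper class
        job.setReducerClass(MyReducer.class);  // placeholder reducer class
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // addInputPaths splits the comma-separated string and adds each path as a job input
        FileInputFormat.addInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}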
Example 1: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 2;
    }
    Job job = Job.getInstance(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);
    // set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);
    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 2: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());
    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // reducer
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 3: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = Job.getInstance();
    job.setJarByClass(TopDomainCounter.class);
    job.setJobName(TopDomainCounter.class.getName());
    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 4: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    org.apache.hadoop.conf.Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    System.out.println("Other args: " + Arrays.toString(otherArgs));
    Job job = Job.getInstance();
    job.setJarByClass(OriginalURLGrep.class);
    job.setJobName(OriginalURLGrep.class.getName());
    job.setMapperClass(OrigURLGrepMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    // cache file - IDs for index
    String idFile = args[2];
    System.err.println("idFile: " + idFile);
    job.addCacheFile(new URI(idFile + "#" + NODE_IDS));
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    System.err.println("commaSeparatedInputFiles: " + commaSeparatedInputFiles);
    System.err.println("outputPath: " + outputPath);
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 5: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }
    job.setJarByClass(ClueWebTRECIdFileExtractor.class);
    job.setJobName(ClueWebTRECIdFileExtractor.class.getName());
    // mapper
    job.setMapperClass(MapperClass.class);
    // input
    job.setInputFormatClass(WARCInputFormat.class);
    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);
    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];
    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_TREC_IDS, loadTrecIds(args[2]));
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Developer: UKPLab; Project: sigir2016-collection-for-focused-retrieval; Lines of code: 38; Source file: ClueWebTRECIdFileExtractor.java
Example 6: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = Job.getInstance();
    job.setJarByClass(SimpleTextSearch.class);
    job.setJobName(SimpleTextSearch.class.getName());
    // mapper
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 7: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = Job.getInstance();
    job.setJarByClass(WordCounterExample.class);
    job.setJobName(WordCounterExample.class.getName());
    // mapper
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 8: configureJob
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
/**
 * Job configurator
 *
 * @param job job instance
 * @param jarByClass class of the jar
 * @param mapperClass mapper
 * @param reducerClass reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());
    // mapper
    job.setMapperClass(mapperClass);
    // reducer
    job.setReducerClass(reducerClass);
    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);
    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);
    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
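A hypothetical caller might wire this configurator into a ToolRunner-style driver roughly as follows; MyDriver, MyWarcMapper and MyWarcReducer are placeholder names invented for illustration, not classes from the original project:

// inside MyDriver.run(String[] args), assuming args[0] = comma-separated inputs, args[1] = output path
Job job = Job.getInstance(getConf());
configureJob(job, MyDriver.class, MyWarcMapper.class, MyWarcReducer.class, args[0], args[1]);
return job.waitForCompletion(true) ? 0 : 1;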
Example 9: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());
    // mapper
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);
    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);
    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 10: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());
    // mapper
    job.setMapperClass(ExactMatchDetectionMapper.class);
    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!
    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);
    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);
    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 11: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());
    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);
    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setNumReduceTasks(0); // must be set, or the mapper won't be called
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 12: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());
    // mapper
    job.setMapperClass(MapperClass.class);
    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);
    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);
    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);
    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 13: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TextToSentencesSplitter.class);
    job.setJobName(TextToSentencesSplitter.class.getName());
    // mapper
    job.setMapperClass(TextToSentencesSplitter.MapperClass.class);
    job.setInputFormatClass(WARCInputFormat.class);
    // reducer
    job.setReducerClass(ReducerClass.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 14: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(URIExtractor.class);
    job.setJobName(URIExtractor.class.getName());
    // mapper
    job.setMapperClass(URIExtractorMapper.class);
    job.setReducerClass(URIExtractorReducer.class);
    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // necessary, so that Hadoop does not mix up the map input format
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 15: run
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; // import required by the method below
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }
    job.setJarByClass(PagesByURLExtractor.class);
    job.setJobName(PagesByURLExtractor.class.getName());
    // mapper
    job.setMapperClass(MapperClass.class);
    // input
    job.setInputFormatClass(WARCInputFormat.class);
    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);
    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];
    // load URLs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2]));
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job.waitForCompletion(true) ? 0 : 1;
}