本文整理汇总了 Java 中 edu.cmu.lemurproject.WarcFileInputFormat 类的典型用法代码示例。如果您正苦于以下问题：Java WarcFileInputFormat 类的具体用法？Java WarcFileInputFormat 怎么用？Java WarcFileInputFormat 使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
WarcFileInputFormat类属于edu.cmu.lemurproject包,在下文中一共展示了WarcFileInputFormat类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: init
import edu.cmu.lemurproject.WarcFileInputFormat; //导入依赖的package包/类
/**
 * Registers {@link WarcFileInputFormat} as the input format on the supplied
 * job configuration.
 *
 * @param conf the job configuration to update
 * @throws IOException declared by the overridden signature
 */
@Override
public void init(final JobConf conf) throws IOException {
    final Class<WarcFileInputFormat> warcFormat = WarcFileInputFormat.class;
    conf.setInputFormat(warcFormat);
}
示例2: run
import edu.cmu.lemurproject.WarcFileInputFormat; //导入依赖的package包/类
/**
 * Configures and runs the job that reads WARC input and writes a
 * Gzip-compressed Hadoop sequence file.
 *
 * <p>Expected arguments: {@code args[0]} input path, {@code args[1]} output
 * path, and optionally {@code args[2]} the maximum number of files to
 * process (stored under {@code MAX_FILES_KEY}; an empty string is stored
 * when it is not supplied).
 *
 * @param args command line arguments as described above
 * @return {@code 0} if the job reports success; {@code 1} if the arguments
 *         are missing or empty, or the job is not successful
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {
    // Both paths are mandatory; without this guard args[0]/args[1] would
    // throw ArrayIndexOutOfBoundsException instead of reporting an error.
    if (args == null || args.length < 2) {
        System.err.println("Usage: <input path> <output path> [max arc files]");
        return 1;
    }

    // Get current configuration.
    Configuration conf = getConf();

    // Parse command line arguments.
    String inputPath = args[0];
    String outputPath = args[1];
    String maxArcFiles = "";
    if (args.length == 3) {
        maxArcFiles = args[2];
    }

    // Set the maximum number of arc files to process. This must be set on
    // conf before the JobConf is created from it below.
    conf.set(MAX_FILES_KEY, maxArcFiles);
    JobConf job = new JobConf(conf);

    // Set input path.
    if (inputPath.length() == 0) {
        System.err.println("No input path found.");
        return 1;
    }
    LOG.info("Setting input path to " + inputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // NOTE(review): FileCountFilter presumably honors MAX_FILES_KEY - confirm.
    FileInputFormat.setInputPathFilter(job, FileCountFilter.class);

    // Set output path.
    if (outputPath.length() == 0) {
        System.err.println("No output path found.");
        return 1;
    }
    LOG.info("Setting output path to " + outputPath);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    // Compress output to boost performance.
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);

    // Load other classes from same jar as this class.
    job.setJarByClass(WikiReverse.class);

    // Input is in WARC file format.
    job.setInputFormat(WarcFileInputFormat.class);
    // Output is Hadoop sequence file format.
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkArrayWritable.class);

    // Use custom mapper class; note it is registered as the map runner.
    job.setMapRunnerClass(WikiReverseMapper.class);
    // Use custom reducer class.
    job.setReducerClass(LinkArrayReducer.class);

    // Tolerate a percentage of failed map tasks (value defined by
    // MAX_MAP_TASK_FAILURES_PERCENT, declared outside this method).
    job.setMaxMapTaskFailuresPercent(MAX_MAP_TASK_FAILURES_PERCENT);

    return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
示例3: run
import edu.cmu.lemurproject.WarcFileInputFormat; //导入依赖的package包/类
/**
 * Configures and runs the word-count job that reads WARC input and writes a
 * Gzip-compressed Hadoop sequence file of {@code Text} keys and
 * {@code LongWritable} values.
 *
 * <p>Expected arguments: {@code args[0]} input path, {@code args[1]} output
 * path, and optionally {@code args[2]} the maximum number of files to
 * process (stored under {@code MAX_FILES_KEY}; an empty string is stored
 * when it is not supplied).
 *
 * @param args command line arguments as described above
 * @return {@code 0} if the job reports success; {@code 1} if the arguments
 *         are missing or empty, or the job is not successful
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {
    // Both paths are mandatory; without this guard args[0]/args[1] would
    // throw ArrayIndexOutOfBoundsException instead of reporting an error.
    if (args == null || args.length < 2) {
        System.err.println("Usage: <input path> <output path> [max files]");
        return 1;
    }

    // Get current configuration.
    Configuration conf = getConf();

    // Parse command line arguments.
    String inputPath = args[0];
    String outputPath = args[1];

    // Set Max Files if specified otherwise parse the entire segment.
    String maxFiles = "";
    if (args.length == 3) {
        maxFiles = args[2];
    }
    // Must be set on conf before the JobConf is created from it below.
    conf.set(MAX_FILES_KEY, maxFiles);
    JobConf job = new JobConf(conf);

    // Set input path.
    if (inputPath.length() == 0) {
        System.err.println("No input path found.");
        return 1;
    }
    LOG.info("Setting input path to " + inputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // NOTE(review): FileCountFilter presumably honors MAX_FILES_KEY - confirm.
    FileInputFormat.setInputPathFilter(job, FileCountFilter.class);

    // Set output path.
    if (outputPath.length() == 0) {
        System.err.println("No output path found.");
        return 1;
    }
    LOG.info("Setting output path to " + outputPath);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    // Compress output to boost performance. The codec has to be *set*;
    // calling the getter here would leave the codec unconfigured.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Load other classes from same jar as this class.
    job.setJarByClass(WordCount.class);

    // Input is in WARC file format.
    job.setInputFormat(WarcFileInputFormat.class);
    // Output to Hadoop sequence file format.
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Use custom mapper class.
    job.setMapperClass(WordCountMapper.class);
    // Use standard reducer class.
    job.setReducerClass(LongSumReducer.class);

    // Tolerate a percentage of failed map tasks (value defined by
    // MAX_MAP_TASK_FAILURES_PERCENT, declared outside this method).
    job.setMaxMapTaskFailuresPercent(MAX_MAP_TASK_FAILURES_PERCENT);

    return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}