This article collects typical usage examples of the Java method org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.addInputPath. If you are wondering what NLineInputFormat.addInputPath does, how to call it, or where to find real-world examples of it, the curated code examples below should help. You can also explore its enclosing class, org.apache.hadoop.mapreduce.lib.input.NLineInputFormat, for further context.
Three code examples of NLineInputFormat.addInputPath are shown below, sorted by popularity by default.
Example 1: run
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; // import the package/class on which this method depends
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step4LocalDeDuplication.class);
    job.setJobName(Phase3Step4LocalDeDuplication.class.getName());

    // paths
    String inputPath = args[0];
    // text files of ids to be deleted
    String outputPath = args[1];

    // input: reading max N lines for each mapper
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path(inputPath));
    job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", LINES);

    // mapper
    job.setMapperClass(LocalGreedyDeDuplicationMapper.class);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // reducer
    job.setReducerClass(IDCollectorReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
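Since run(String[]) and getConf() follow Hadoop's org.apache.hadoop.util.Tool pattern, such a job is normally launched through ToolRunner, which parses generic -D options into the Configuration before getConf() is called. A minimal driver sketch, assuming Phase3Step4LocalDeDuplication implements Tool (this main method is an illustration, not part of the original source):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical entry point: delegate to ToolRunner so that generic Hadoop
// options on the command line end up in the Configuration used by run().
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new Phase3Step4LocalDeDuplication(), args));
}

Note that the mapreduce.input.lineinputformat.linespermap key set in run() is the same property that NLineInputFormat.setNumLinesPerSplit writes, which examples 2 and 3 use instead.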
Example 2: prepareHadoopJob
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; // import the package/class on which this method depends
/**
 * Prepare the Hadoop MR job, including configuring the job and setting up the input/output paths.
 */
private Path prepareHadoopJob(List<WorkUnit> workUnits) throws IOException {
    TimingEvent mrJobSetupTimer = this.eventSubmitter.getTimingEvent(TimingEventNames.RunJobTimings.MR_JOB_SETUP);

    this.job.setJarByClass(MRJobLauncher.class);
    this.job.setMapperClass(TaskRunner.class);

    // The job is mapper-only
    this.job.setNumReduceTasks(0);

    this.job.setInputFormatClass(NLineInputFormat.class);
    this.job.setOutputFormatClass(GobblinOutputFormat.class);
    this.job.setMapOutputKeyClass(NullWritable.class);
    this.job.setMapOutputValueClass(NullWritable.class);

    // Turn off speculative execution
    this.job.setSpeculativeExecution(false);

    // Job input path is where input work unit files are stored
    Path jobInputPath = new Path(this.mrJobDir, INPUT_DIR_NAME);

    // Prepare job input
    Path jobInputFile = prepareJobInput(jobInputPath, workUnits);
    NLineInputFormat.addInputPath(this.job, jobInputFile);

    // Job output path is where serialized task states are stored
    Path jobOutputPath = new Path(this.mrJobDir, OUTPUT_DIR_NAME);
    SequenceFileOutputFormat.setOutputPath(this.job, jobOutputPath);

    // Serialize source state to a file which will be picked up by the mappers
    Path jobStateFilePath = new Path(this.mrJobDir, JOB_STATE_FILE_NAME);
    SerializationUtils.serializeState(this.fs, jobStateFilePath, this.jobContext.getJobState());
    job.getConfiguration().set(ConfigurationKeys.JOB_STATE_FILE_PATH_KEY, jobStateFilePath.toString());

    if (this.jobProps.containsKey(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY)) {
        // When there is a limit on the number of mappers, each mapper may run
        // multiple tasks if the total number of tasks is larger than the limit.
        int maxMappers = Integer.parseInt(this.jobProps.getProperty(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY));
        if (workUnits.size() > maxMappers) {
            int numTasksPerMapper =
                    workUnits.size() % maxMappers == 0 ? workUnits.size() / maxMappers
                            : workUnits.size() / maxMappers + 1;
            NLineInputFormat.setNumLinesPerSplit(this.job, numTasksPerMapper);
        }
    }

    mrJobSetupTimer.stop();

    return jobOutputPath;
}
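The conditional expression computing numTasksPerMapper is an integer ceiling division: it picks the smallest number of lines per split such that at most maxMappers splits (and therefore mappers) are created. A hypothetical helper (name and signature are assumptions, not part of the original source) that makes the resulting mapper count explicit:

// Packing numWorkUnits input lines into splits of numTasksPerMapper lines
// yields ceil(numWorkUnits / numTasksPerMapper) splits, one mapper each.
static int expectedMappers(int numWorkUnits, int numTasksPerMapper) {
    return (numWorkUnits + numTasksPerMapper - 1) / numTasksPerMapper; // ceiling division on ints
}

For instance, 10 work units with maxMappers = 4 give numTasksPerMapper = 3, hence 4 mappers, the last of which runs only a single task.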
Example 3: randomizeManyInputFiles
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; // import the package/class on which this method depends
/**
 * To uniformly spread load across all mappers we randomize fullInputList
 * with a separate small Mapper & Reducer preprocessing step. This way
 * each input line ends up on a random position in the output file list.
 * Each mapper indexes a disjoint consecutive set of files such that each
 * set has roughly the same size, at least from a probabilistic
 * perspective.
 *
 * For example an input file with the following input list of URLs:
 *
 * A
 * B
 * C
 * D
 *
 * might be randomized into the following output list of URLs:
 *
 * C
 * A
 * D
 * B
 *
 * The implementation sorts the list of lines by randomly generated numbers.
 */
private Job randomizeManyInputFiles(Configuration baseConfig, Path fullInputList, Path outputStep2Dir,
        int numLinesPerSplit) throws IOException {

    Job job2 = Job.getInstance(baseConfig);
    job2.setJarByClass(getClass());
    job2.setJobName(getClass().getName() + "/" + Utils.getShortClassName(LineRandomizerMapper.class));
    job2.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job2, fullInputList);
    NLineInputFormat.setNumLinesPerSplit(job2, numLinesPerSplit);
    job2.setMapperClass(LineRandomizerMapper.class);
    job2.setReducerClass(LineRandomizerReducer.class);
    job2.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job2, outputStep2Dir);
    job2.setNumReduceTasks(1);
    job2.setOutputKeyClass(LongWritable.class);
    job2.setOutputValueClass(Text.class);
    return job2;
}
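The page does not show LineRandomizerMapper and LineRandomizerReducer. Based on the javadoc's note that the implementation sorts the lines by randomly generated numbers, a minimal sketch could look as follows (an assumption, not the actual source; in practice each public class would live in its own file):

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch: key every input line with a random long; the shuffle then sorts
// the lines by these random keys.
public class LineRandomizerMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

    private final Random random = new Random();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(new LongWritable(random.nextLong()), value);
    }
}

// Sketch: with a single reduce task, emit the lines in the (now random)
// key order, dropping the random keys.
public class LineRandomizerReducer extends Reducer<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(value, NullWritable.get());
        }
    }
}

Because randomizeManyInputFiles sets no separate map output classes, the setOutputKeyClass(LongWritable.class) and setOutputValueClass(Text.class) calls above effectively declare the map output types; TextOutputFormat does not type-check what the reducer actually emits.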