本文整理汇总了Java中org.apache.hadoop.mapred.JobConf.setInputFormat方法的典型用法代码示例。如果您正苦于以下问题：Java JobConf.setInputFormat方法的具体用法？Java JobConf.setInputFormat怎么用？Java JobConf.setInputFormat使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.hadoop.mapred.JobConf的用法示例。
在下文中一共展示了JobConf.setInputFormat方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testInputFormat
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Runs a map-only job that reads through the given input format and then checks the counters
 * reported by the finished job.
 *
 * <p>The job uses {@code ExampleVerifier} as its mapper, {@code NullOutputFormat} as its output
 * format and zero reduce tasks. The counters inspected afterwards live in groups whose names are
 * the {@code TestTableInputFormat} class name followed by {@code :row}, {@code :family} or
 * {@code :value}.
 *
 * @param clazz the input format class to install on the job
 * @throws IOException propagated from obtaining the job configuration or running the job
 */
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());

  // All counter groups share the same class-name prefix; build the group names once.
  final String groupPrefix = TestTableInputFormat.class.getName();
  final String rowGroup = groupPrefix + ":row";
  final String familyGroup = groupPrefix + ":family";
  final String valueGroup = groupPrefix + ":value";

  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2,
      run.getCounters().findCounter(rowGroup, "aaa").getCounter());
  assertEquals("Saw any instances of the filtered out row.", 0,
      run.getCounters().findCounter(rowGroup, "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1,
      run.getCounters().findCounter(familyGroup, "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1,
      run.getCounters().findCounter(familyGroup, "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2,
      run.getCounters().findCounter(valueGroup, "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0,
      run.getCounters().findCounter(valueGroup, "value bbb").getCounter());
}
示例2: runIOTest
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Configures and synchronously runs one job with the supplied mapper.
 *
 * <p>Input is read as sequence files from the directory returned by {@code getControlDir(config)};
 * output is written to {@code outputDir} with {@code Text} keys and values through a single
 * {@code AccumulatingReducer} task.
 *
 * @param mapperClass mapper implementation to run
 * @param outputDir directory the job writes its output to
 * @throws IOException propagated from job configuration or execution
 */
private void runIOTest(
    Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass,
    Path outputDir) throws IOException {
  final JobConf ioJob = new JobConf(config, TestDFSIO.class);

  // Input side.
  FileInputFormat.setInputPaths(ioJob, getControlDir(config));
  ioJob.setInputFormat(SequenceFileInputFormat.class);

  // Map and reduce stages.
  ioJob.setMapperClass(mapperClass);
  ioJob.setReducerClass(AccumulatingReducer.class);
  ioJob.setNumReduceTasks(1);

  // Output side.
  FileOutputFormat.setOutputPath(ioJob, outputDir);
  ioJob.setOutputKeyClass(Text.class);
  ioJob.setOutputValueClass(Text.class);

  JobClient.runJob(ioJob);
}
示例3: getJob
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Builds the job configuration for a run, based on the supplied config extractor.
 *
 * <p>The returned configuration has its input format, partitioner, mapper and reducer classes,
 * output key/value classes, uncompressed text output, output path, and the map and reduce task
 * counts taken from {@code config} all set.
 *
 * @param config source of the base configuration, output path and task counts
 * @return the JobConf describing the job to be run
 * @throws IOException declared by the original signature; propagated from the calls made here
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  final JobConf conf = new JobConf(config.getConfig(), SliveTest.class);

  // Input and processing classes.
  conf.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(conf, config.getOutputPath());
  conf.setMapperClass(SliveMapper.class);
  conf.setPartitionerClass(SlivePartitioner.class);
  conf.setReducerClass(SliveReducer.class);

  // Output: plain, uncompressed text with Text keys and values.
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(conf, false);

  // Task counts come from the extracted configuration.
  conf.setNumReduceTasks(config.getReducerAmount());
  conf.setNumMapTasks(config.getMapAmount());
  return conf;
}
示例4: joinAs
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Runs a composite-join job of the given join type over four generated sources and then removes
 * the working directory.
 *
 * <p>The sources are written below {@code /<jointype>} on the cluster file system by
 * {@code writeSimpleSrc}; the class {@code c} serves as both mapper and reducer.
 *
 * @param jointype join operator name passed to {@code CompositeInputFormat.compose}; also used
 *     as the name of the working directory
 * @param c checker class used as the job class, mapper and reducer
 * @throws Exception propagated from source generation, job execution or cleanup
 */
private static void joinAs(String jointype,
    Class<? extends SimpleCheckerBase> c) throws Exception {
  final int sourceCount = 4;
  Configuration conf = new Configuration();
  JobConf job = new JobConf(conf, c);

  Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
  Path[] sources = writeSimpleSrc(base, conf, sourceCount);

  String joinExpr =
      CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, sources);
  job.set("mapreduce.join.expr", joinExpr);
  job.setInt("testdatamerge.sources", sourceCount);
  job.setInputFormat(CompositeInputFormat.class);

  Path outDir = new Path(base, "out");
  FileOutputFormat.setOutputPath(job, outDir);

  job.setMapperClass(c);
  job.setReducerClass(c);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);

  JobClient.runJob(job);

  // Clean up everything written under the base directory.
  base.getFileSystem(job).delete(base, true);
}
示例5: testEmptyJoin
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Runs an outer composite join over three sources backed by {@code Fake_IF} with identity
 * mapper and reducer, then deletes the working directory.
 *
 * @throws Exception propagated from job execution or cleanup
 */
public void testEmptyJoin() throws Exception {
  JobConf job = new JobConf();
  Path base = cluster.getFileSystem().makeQualified(new Path("/empty"));

  // NOTE(review): only "i0" is rooted at base; "i1" and "i2" are relative paths. Kept exactly
  // as in the original -- confirm whether this is intentional.
  Path[] sources = {
    new Path(base, "i0"),
    new Path("i1"),
    new Path("i2")
  };

  String joinExpr = CompositeInputFormat.compose("outer", Fake_IF.class, sources);
  job.set("mapreduce.join.expr", joinExpr);
  job.setInputFormat(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(IdentityMapper.class);
  job.setReducerClass(IdentityReducer.class);
  job.setOutputKeyClass(IncomparableKey.class);
  job.setOutputValueClass(NullWritable.class);

  JobClient.runJob(job);

  // Remove the working directory once the job has completed.
  base.getFileSystem(job).delete(base, true);
}
示例6: initMultiTableSnapshotMapperJob
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Sets up a job that reads from one or more table snapshots, with one or more scans per
 * snapshot, using {@code MultiTableSnapshotInputFormat}.
 *
 * @param snapshotScans map of snapshot name to the scans to run on that snapshot.
 * @param mapper The mapper class to use.
 * @param outputKeyClass The class of the map output key; left unset when {@code null}.
 * @param outputValueClass The class of the map output value; left unset when {@code null}.
 * @param job The current job to adjust. Make sure the passed job is
 *     carrying all necessary HBase configuration.
 * @param addDependencyJars when {@code true}, {@code addDependencyJars(job)} is invoked to ship
 *     HBase jars and jars for any of the configured job classes (tmpjars).
 * @param tmpRestoreDir directory handed to {@code MultiTableSnapshotInputFormat.setInput};
 *     presumably where snapshots are restored -- confirm against that class.
 * @throws IOException propagated from input setup or dependency-jar handling
 */
public static void initMultiTableSnapshotMapperJob(Map<String, Collection<Scan>> snapshotScans,
    Class<? extends TableMap> mapper, Class<?> outputKeyClass, Class<?> outputValueClass,
    JobConf job, boolean addDependencyJars, Path tmpRestoreDir) throws IOException {
  MultiTableSnapshotInputFormat.setInput(job, snapshotScans, tmpRestoreDir);
  job.setInputFormat(MultiTableSnapshotInputFormat.class);

  // Map output classes are optional; only override what the caller supplied.
  if (outputKeyClass != null) {
    job.setMapOutputKeyClass(outputKeyClass);
  }
  if (outputValueClass != null) {
    job.setMapOutputValueClass(outputValueClass);
  }

  job.setMapperClass(mapper);

  if (addDependencyJars) {
    addDependencyJars(job);
  }

  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.resetCacheConfig(job);
}
示例7: createJobConf
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Creates the map-only job configuration used for copying.
 *
 * <p>The job name is read from {@code mapred.job.name} in {@code conf}, falling back to
 * {@code NAME}.
 *
 * @param conf base configuration to derive the job configuration from
 * @return a JobConf with input format, mapper, output classes and zero reducers set
 */
private static JobConf createJobConf(Configuration conf) {
  final JobConf copyJob = new JobConf(conf, DistCpV1.class);

  final String jobName = conf.get("mapred.job.name", NAME);
  copyJob.setJobName(jobName);

  // Speculative execution must stay off: DFS doesn't handle
  // multiple writers to the same file.
  copyJob.setMapSpeculativeExecution(false);

  copyJob.setInputFormat(CopyInputFormat.class);
  copyJob.setMapperClass(CopyFilesMapper.class);
  copyJob.setOutputKeyClass(Text.class);
  copyJob.setOutputValueClass(Text.class);
  copyJob.setNumReduceTasks(0);
  return copyJob;
}
示例8: fillMap
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Builds and runs a three-vertex DAG (reader, mapper, writer) that reads text input from
 * {@code inputPath} and writes the resulting entries to the map called {@code name}.
 *
 * <p>Each record's key is unwrapped with {@code get()} and its value converted with
 * {@code toString()} before being written. The call blocks until the submitted job completes.
 *
 * @param client Jet instance used to submit the job
 * @param name name passed to {@code writeMapP}
 * @param inputPath path added as the input of the {@code TextInputFormat}
 * @param parallelism local parallelism applied to every vertex
 * @throws Exception declared by the original signature
 */
private static void fillMap(JetInstance client, String name, String inputPath, int parallelism) throws Exception {
  DAG dag = new DAG();

  JobConf hdfsConf = new JobConf();
  hdfsConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(hdfsConf, new Path(inputPath));

  Vertex reader = dag.newVertex("reader", readHdfsP(hdfsConf, Util::entry));
  Vertex mapper = dag.newVertex("mapper",
      mapP((Map.Entry<LongWritable, Text> line) ->
          entry(line.getKey().get(), line.getValue().toString())));
  Vertex writer = dag.newVertex("writer", writeMapP(name));

  // Same local parallelism for every stage, applied in pipeline order.
  for (Vertex stage : new Vertex[] {reader, mapper, writer}) {
    stage.localParallelism(parallelism);
  }

  dag.edge(between(reader, mapper));
  dag.edge(between(mapper, writer));

  JobConfig jobConfig = new JobConfig();
  jobConfig.addClass(HdfsToMap.class);

  client.newJob(dag, jobConfig).join();
}
示例9: splitInput
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Computes the input splits for one storage location and accumulates its row-count estimate.
 *
 * <p>A fresh {@code JobConf} is populated from {@code properties} and then from
 * {@code hiveReadEntry.hiveConfigOverride}. The input format class named by
 * {@code sd.getInputFormat()} is instantiated reflectively and installed on the job. If the
 * path from {@code sd.getLocation()} exists, every split returned by the input format is added
 * to {@code inputSplits} and mapped to {@code partition} in {@code partitionMap}. Finally, a
 * positive {@code numRows} property is added to {@code rowCount}.
 *
 * @param properties table or partition properties copied into the job configuration
 * @param sd storage descriptor supplying the input format class name and the location
 * @param partition value recorded in {@code partitionMap} for every split found
 * @throws ReflectiveOperationException if the input format class cannot be loaded or instantiated
 * @throws IOException propagated from file system access or split computation
 * @throws NumberFormatException if the {@code numRows} property is not a valid long
 */
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  for (final Object obj : properties.keySet()) {
    job.set((String) obj, (String) properties.get(obj));
  }
  // Overrides are applied second so they win over the plain properties.
  for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }

  InputFormat<?, ?> format = (InputFormat<?, ?>)
      Class.forName(sd.getInputFormat()).getConstructor().newInstance();
  job.setInputFormat(format.getClass());

  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);
  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    // Re-obtain the format from the job, as the original code does, before asking for splits.
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }

  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp != null) {
    // parseLong yields the primitive directly; valueOf would box and immediately unbox.
    final long numRows = Long.parseLong(numRowsProp);
    // starting from hive-0.13, when no statistics are available, this property is set to -1
    // it's important to note that the value returned by hive may not be up to date
    if (numRows > 0) {
      rowCount += numRows;
    }
  }
}
示例10: runJob
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Prepares input data, configures the given job and runs it to completion.
 *
 * <p>Any existing {@code outDir} is deleted recursively and {@code inDir} is created if missing.
 * One file named {@code part-<i>} with a fixed three-line text is written per map task. The job
 * is then configured with {@code TextInputFormat}, {@code CustomOutputCommitter},
 * {@code LongWritable}/{@code Text} output classes and the requested task counts, submitted, and
 * monitored.
 *
 * @param conf job configuration to complete and submit
 * @param inDir directory the generated input files are written to
 * @param outDir job output directory; removed first if it already exists
 * @param numMaps number of input files to create and number of map tasks to request
 * @param numReds number of reduce tasks
 * @return the value returned by {@code JobClient.monitorAndPrintJob}
 * @throws IOException propagated from file system access or job submission
 * @throws InterruptedException propagated from job monitoring
 */
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
    int numReds) throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }

  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    // try-with-resources closes the stream even when writeBytes throws.
    try (DataOutputStream file = fs.create(new Path(inDir, "part-" + i))) {
      file.writeBytes(input);
    }
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);

  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
示例11: testCombinerShouldUpdateTheReporter
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Runs a job with identity mapper and reducer and {@code MyCombinerToCheckReporter} as the
 * combiner, using five map tasks and two reduce tasks.
 *
 * <p>Input and output folders are created under the mini cluster's test work directory.
 *
 * @throws Exception propagated from folder creation or job execution
 */
@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  final int numMaps = 5;
  final int numReds = 2;

  final String workDir = mrCluster.getTestWorkDir().getAbsolutePath();
  Path in = new Path(workDir, "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(workDir, "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);

  conf.setJobName("test-job-with-combiner");

  // Processing classes.
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);
  conf.setOutputCommitter(CustomOutputCommitter.class);

  // Input and output formats and locations.
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);

  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  runJob(conf);
}
示例12: addInputPath
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Registers a {@link Path} together with the {@link InputFormat} that should read it as an
 * additional input of the map-reduce job.
 *
 * <p>The pair is appended as {@code path;formatClassName} to the comma-separated list stored
 * under {@code mapreduce.input.multipleinputs.dir.formats}, and the job's input format is set
 * to {@code DelegatingInputFormat}.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass) {
  final String dirFormatsKey = "mapreduce.input.multipleinputs.dir.formats";

  final String newMapping = path.toString() + ";" + inputFormatClass.getName();
  final String existingMappings = conf.get(dirFormatsKey);

  final String updatedMappings;
  if (existingMappings == null) {
    // First registration: the list consists of this mapping only.
    updatedMappings = newMapping;
  } else {
    updatedMappings = existingMappings + "," + newMapping;
  }
  conf.set(dirFormatsKey, updatedMappings);

  conf.setInputFormat(DelegatingInputFormat.class);
}
示例13: createJobConf
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Creates the map-only job configuration for this tool.
 *
 * @param conf base configuration to derive the job configuration from
 * @return a JobConf named {@code NAME} with map speculative execution disabled, the input
 *     format, mapper and output classes set, and zero reduce tasks
 */
private static JobConf createJobConf(Configuration conf) {
  final JobConf changeJob = new JobConf(conf, DistCh.class);
  changeJob.setJobName(NAME);
  changeJob.setMapSpeculativeExecution(false);

  changeJob.setInputFormat(ChangeInputFormat.class);
  changeJob.setMapperClass(ChangeFilesMapper.class);
  changeJob.setOutputKeyClass(Text.class);
  changeJob.setOutputValueClass(Text.class);

  // Map-only job.
  changeJob.setNumReduceTasks(0);
  return changeJob;
}
示例14: setup
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Prepares the record reader and registers the key and value output vectors.
 *
 * <p>A {@code SequenceFileAsBinaryInputFormat} is created and installed on a {@code JobConf}
 * built from {@code dfs.getConf()}; the resulting record reader is stored in {@code reader}.
 * The key and value fields are then added to {@code output} as nullable var-binary vectors.
 *
 * @param output mutator that receives the key and value fields
 * @throws ExecutionSetupException if adding a field raises a {@code SchemaChangeException}
 */
@Override
public void setup(OutputMutator output) throws ExecutionSetupException {
  final SequenceFileAsBinaryInputFormat binaryFormat = new SequenceFileAsBinaryInputFormat();
  final JobConf conf = new JobConf(dfs.getConf());
  conf.setInputFormat(binaryFormat.getClass());
  reader = getRecordReader(binaryFormat, conf);

  final Field keyField =
      new Field(keySchema, true, getArrowTypeForMajorType(KEY_TYPE), null);
  final Field valueField =
      new Field(valueSchema, true, getArrowTypeForMajorType(VALUE_TYPE), null);

  try {
    keyVector = output.addField(keyField, NullableVarBinaryVector.class);
    valueVector = output.addField(valueField, NullableVarBinaryVector.class);
  } catch (SchemaChangeException schemaChange) {
    // Surface schema problems as a setup failure, keeping the cause.
    throw new ExecutionSetupException("Error in setting up sequencefile reader.", schemaChange);
  }
}
示例15: initTableMapJob
import org.apache.hadoop.mapred.JobConf; //导入方法依赖的package包/类
/**
 * Use this before submitting a TableMap job. It will
 * appropriately set up the JobConf.
 *
 * <p>Failures from {@code addDependencyJars} and {@code initCredentials} are not propagated:
 * their stack traces are printed and setup continues.
 *
 * @param table The table name to read from; added to the job's input paths.
 * @param columns The columns to scan; stored under {@code TableInputFormat.COLUMN_LIST}.
 * @param mapper The mapper class to use.
 * @param outputKeyClass The class of the output key.
 * @param outputValueClass The class of the output value.
 * @param job The current job configuration to adjust.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *     job classes via the distributed cache (tmpjars).
 * @param inputFormat The input format class to set on the job.
 */
public static void initTableMapJob(String table, String columns,
    Class<? extends TableMap> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, JobConf job, boolean addDependencyJars,
    Class<? extends InputFormat> inputFormat) {
  job.setInputFormat(inputFormat);
  job.setMapperClass(mapper);
  job.setMapOutputKeyClass(outputKeyClass);
  job.setMapOutputValueClass(outputValueClass);

  // Keep whatever serializations are already configured and append the two HBase ones.
  final String existingSerializations = job.get("io.serializations");
  job.setStrings("io.serializations", existingSerializations,
      MutationSerialization.class.getName(), ResultSerialization.class.getName());

  FileInputFormat.addInputPaths(job, table);
  job.set(TableInputFormat.COLUMN_LIST, columns);

  if (addDependencyJars) {
    try {
      addDependencyJars(job);
    } catch (IOException jarFailure) {
      // Best effort, as in the original: report and carry on.
      jarFailure.printStackTrace();
    }
  }

  try {
    initCredentials(job);
  } catch (IOException credentialFailure) {
    // just spit out the stack trace? really?
    credentialFailure.printStackTrace();
  }
}