本文整理汇总了Java中org.apache.hadoop.mapreduce.InputFormat类的典型用法代码示例。如果您正苦于以下问题:Java InputFormat类的具体用法?Java InputFormat怎么用?Java InputFormat使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
InputFormat类属于org.apache.hadoop.mapreduce包,在下文中一共展示了InputFormat类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getInputFormatClass
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Picks the InputFormat class for this job.
 *
 * <p>If {@code isHCatJob} is set, the class reported by
 * {@code SqoopHCatUtilities} is returned. Otherwise the choice follows
 * {@code fileType}: Avro data files and Parquet files each map to a dedicated
 * format, and every other type defers to the superclass, substituting
 * {@code ExportInputFormat} when the superclass returns {@code null}.
 *
 * @return the InputFormat class selected by the rules above
 * @throws ClassNotFoundException declared by the signature; propagated from
 *         the delegated lookups
 */
@Override
protected Class<? extends InputFormat> getInputFormatClass()
    throws ClassNotFoundException {
  // HCatalog jobs bypass the file-type dispatch entirely.
  if (isHCatJob) {
    return SqoopHCatUtilities.getInputFormatClass();
  }

  final Class<? extends InputFormat> chosen;
  switch (fileType) {
    case AVRO_DATA_FILE:
      chosen = AvroInputFormat.class;
      break;
    case PARQUET_FILE:
      chosen = DatasetKeyInputFormat.class;
      break;
    default:
      // Any other file type: honour what the superclass resolves, and only
      // fall back to the export format when it resolves nothing.
      Class<? extends InputFormat> inherited = super.getInputFormatClass();
      if (inherited != null) {
        chosen = inherited;
      } else {
        chosen = ExportInputFormat.class;
      }
      break;
  }
  return chosen;
}
示例2: testReinit
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Checks that a split made of several files is read correctly, i.e. that the
 * child RecordReader has {@code initialize()} invoked again for the second
 * file of the split.
 */
@Test
public void testReinit() throws Exception {
  Configuration conf = new Configuration();
  TaskAttemptID attemptId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, attemptId);

  // ChildRRInputFormat hands back a CombineFileRecordReader wrapping a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  // A combined split of two one-byte files.
  Path[] paths = new Path[] { new Path("file1"), new Path("file2") };
  long[] sizes = new long[] { 1, 1 };
  CombineFileSplit combinedSplit = new CombineFileSplit(paths, sizes);

  RecordReader reader = inputFormat.createRecordReader(combinedSplit, attemptContext);
  assertTrue("Unexpected RR type!", reader instanceof CombineFileRecordReader);

  // MapTask would normally issue the first initialize() call; do it by hand.
  reader.initialize(combinedSplit, attemptContext);

  // The first record's value is the first file name.
  assertTrue(reader.nextKeyValue());
  assertEquals("file1", reader.getCurrentValue().toString());

  // The inner reader is exhausted after one (k, v) pair, but a second
  // sub-split remains, so the combined reader still reports true.
  assertTrue(reader.nextKeyValue());

  // Seeing "file2" shows the second child reader was initialized properly.
  assertEquals("file2", reader.getCurrentValue().toString());

  // Both children have emitted their single pair; nothing is left.
  assertFalse(reader.nextKeyValue());
}
示例3: readSplit
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Reads every record of {@code split} through {@code format} and returns
 * copies of the values in the order they were produced.
 *
 * <p>The record reader is always closed before returning, including when
 * initialization or iteration throws, so the underlying input is not leaked.
 *
 * @param format input format used to create the record reader
 * @param split  the split to read
 * @param job    job whose configuration backs the dummy task context
 * @return a new list holding a copy of each value read from the split
 * @throws IOException          if creating, reading or closing the reader fails
 * @throws InterruptedException if the reader is interrupted
 */
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  // One dummy task context is enough: it serves both reader creation and the
  // task attempt id of the MapContext below.
  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader =
      format.createRecordReader(split, context);
  try {
    MapContext<LongWritable,Text,LongWritable,Text> mcontext =
        new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
            context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
      // Copy the value: the result must not alias the object handed out by
      // getCurrentValue().
      result.add(new Text(reader.getCurrentValue()));
    }
  } finally {
    // Release the reader's underlying resources on every exit path.
    reader.close();
  }
  return result;
}
示例4: testAddInputPathWithMapper
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Registers two input paths, each with its own InputFormat and Mapper, and
 * checks that MultipleInputs reports both mappings back from the job.
 */
@SuppressWarnings("unchecked")
public void testAddInputPathWithMapper() throws IOException {
  final Job job = Job.getInstance();
  final Path fooPath = new Path("/foo");
  final Path barPath = new Path("/bar");

  MultipleInputs.addInputPath(job, fooPath, TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(job, barPath,
      KeyValueTextInputFormat.class, KeyValueMapClass.class);

  final Map<Path, InputFormat> formatsByPath =
      MultipleInputs.getInputFormatMap(job);
  final Map<Path, Class<? extends Mapper>> mappersByPath =
      MultipleInputs.getMapperTypeMap(job);

  // Input formats come back as instances, so compare their runtime classes.
  assertEquals(TextInputFormat.class, formatsByPath.get(fooPath).getClass());
  assertEquals(KeyValueTextInputFormat.class,
      formatsByPath.get(barPath).getClass());

  // Mappers come back as class objects.
  assertEquals(MapClass.class, mappersByPath.get(fooPath));
  assertEquals(KeyValueMapClass.class, mappersByPath.get(barPath));
}
示例5: main
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Builds and runs a pipeline that reads a DwC-A archive through
 * HadoopInputFormatIO, converts each record to {@code UntypedOccurrence} and
 * writes the result as Avro files. Input and output locations are hard-coded.
 *
 * @param args command-line arguments handed to {@code newPipeline}
 */
public static void main(String[] args) {
  // Hadoop-side settings; defaults are assumed to be on the classpath.
  Configuration hadoopConf = new Configuration();
  hadoopConf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
  hadoopConf.setStrings("mapreduce.input.fileinputformat.inputdir", "hdfs://ha-nn/tmp/dwca-lep5.zip");
  hadoopConf.setClass("key.class", Text.class, Object.class);
  hadoopConf.setClass("value.class", ExtendedRecord.class, Object.class);

  Pipeline pipeline = newPipeline(args, hadoopConf);
  Coders.registerAvroCoders(
      pipeline, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);

  // Stage 1: read (key, record) pairs from the archive.
  PCollection<KV<Text, ExtendedRecord>> archiveRecords =
      pipeline.apply(
          "Read DwC-A",
          HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(hadoopConf));

  // Stage 2: convert each pair to an UntypedOccurrence.
  PCollection<UntypedOccurrence> occurrences =
      archiveRecords.apply("Convert to Avro", ParDo.of(fromExtendedRecordKVP()));

  // Stage 3: write the converted records as Avro.
  occurrences.apply(
      "Write Avro files",
      AvroIO.write(UntypedOccurrence.class).to("hdfs://ha-nn/tmp/dwca-lep5.avro"));

  LOG.info("Starting the pipeline");
  PipelineResult outcome = pipeline.run();
  outcome.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", outcome.getState());
}
示例6: testInputFormat
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Runs a map-only job with the given InputFormat and verifies the row,
 * family and value counters it produces.
 *
 * @param clazz the InputFormat implementation under test
 * @throws IOException            declared by the job setup and submission calls
 * @throws InterruptedException   declared by the job submission call
 * @throws ClassNotFoundException declared by the job submission call
 */
void testInputFormat(Class<? extends InputFormat> clazz)
    throws IOException, InterruptedException, ClassNotFoundException {
  final Job job = MapreduceTestingShim.createJob(UTIL.getConfiguration());
  job.setInputFormatClass(clazz);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  assertTrue("job failed!", job.waitForCompletion(true));

  // Counter groups are namespaced by the test class name.
  final String groupPrefix = TestTableInputFormat.class.getName();
  final String rowGroup = groupPrefix + ":row";
  final String familyGroup = groupPrefix + ":family";
  final String valueGroup = groupPrefix + ":value";

  // Rows: only the filtered-for row may be seen.
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2,
      job.getCounters().findCounter(rowGroup, "aaa").getValue());
  assertEquals("Saw any instances of the filtered out row.", 0,
      job.getCounters().findCounter(rowGroup, "bbb").getValue());

  // Families: each column is expected exactly once.
  assertEquals("Saw the wrong number of instances of columnA.", 1,
      job.getCounters().findCounter(familyGroup, "columnA").getValue());
  assertEquals("Saw the wrong number of instances of columnB.", 1,
      job.getCounters().findCounter(familyGroup, "columnB").getValue());

  // Values: counted for the kept row only.
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2,
      job.getCounters().findCounter(valueGroup, "value aaa").getValue());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0,
      job.getCounters().findCounter(valueGroup, "value bbb").getValue());
}
示例7: LocalMapTask
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Creates a local map task bound to one input split.
 *
 * @param inputFormat  format stored for this task's input side
 * @param outputFormat format stored for this task's output side
 * @param conf         configuration stored for this task
 * @param id           identifier stored for this task
 * @param split        the input split this task is bound to
 * @param reporter     reporter stored for this task
 * @param pctProgress  shared progress counter stored for this task
 */
public LocalMapTask(InputFormat<INKEY, INVALUE> inputFormat,
    OutputFormat<OUTKEY, OUTVALUE> outputFormat,
    Configuration conf, int id, InputSplit split,
    ContentPumpReporter reporter, AtomicInteger pctProgress) {
  // Identity and configuration.
  this.id = id;
  this.conf = conf;
  this.split = split;

  // I/O formats.
  this.inputFormat = inputFormat;
  this.outputFormat = outputFormat;

  // Progress reporting.
  this.reporter = reporter;
  this.pctProgress = pctProgress;

  // NOTE(review): `job` is not assigned anywhere in this constructor; it is
  // presumably initialized elsewhere in the class - confirm it is non-null
  // by the time this runs.
  try {
    mapperClass = job.getMapperClass();
  } catch (ClassNotFoundException e) {
    // Logged only: mapperClass is not assigned on this path.
    LOG.error("Mapper class not found", e);
  }
}
示例8: testComputeSplitsIfGetSplitsReturnsEmptyList
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that
 * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplitsIfNecessary()}
 * fails with an {@link IOException} carrying the expected message when the
 * InputFormat's {@link InputFormat#getSplits(JobContext)} yields an empty list.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
  // An InputFormat stub whose getSplits(...) always answers with no splits.
  InputFormat<?, ?> emptyInputFormat = Mockito.mock(EmployeeInputFormat.class);
  SerializableSplit splitStub = Mockito.mock(SerializableSplit.class);
  Mockito
      .when(emptyInputFormat.getSplits(Mockito.any(JobContext.class)))
      .thenReturn(new ArrayList<InputSplit>());

  // Neither key nor value translation is needed, hence the two nulls.
  HadoopInputFormatBoundedSource<Text, Employee> source =
      new HadoopInputFormatBoundedSource<Text, Employee>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null,
          null,
          splitStub);

  thrown.expect(IOException.class);
  thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");

  source.setInputFormatObj(emptyInputFormat);
  source.computeSplitsIfNecessary();
}
示例9: testReadersStartWhenZeroRecords
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies the reader created by {@link HadoopInputFormatBoundedSource} when
 * the underlying RecordReader has no records: {@code start()} must return
 * {@code false} and the consumed fraction must be reported as 1.
 */
@Test
public void testReadersStartWhenZeroRecords() throws Exception {
  // Record reader stub that is exhausted from the first call.
  EmployeeRecordReader emptyReader = Mockito.mock(EmployeeRecordReader.class);
  Mockito.when(emptyReader.nextKeyValue()).thenReturn(false);

  // InputFormat stub that hands out the empty reader for any split/context.
  InputFormat inputFormatStub = Mockito.mock(EmployeeInputFormat.class);
  Mockito
      .when(inputFormatStub.createRecordReader(
          Mockito.any(InputSplit.class),
          Mockito.any(TaskAttemptContext.class)))
      .thenReturn(emptyReader);

  InputSplit splitStub = Mockito.mock(NewObjectsEmployeeInputSplit.class);

  // Neither key nor value translation is needed, hence the two nulls.
  HadoopInputFormatBoundedSource<Text, Employee> source =
      new HadoopInputFormatBoundedSource<Text, Employee>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null,
          null,
          new SerializableSplit(splitStub));
  source.setInputFormatObj(inputFormatStub);

  BoundedReader<KV<Text, Employee>> boundedReader = source.createReader(p.getOptions());
  assertEquals(false, boundedReader.start());
  assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
  boundedReader.close();
}
示例10: head
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Returns an iterator over at most {@code totalLines} vertices read from
 * {@code location}.
 *
 * <p>{@code readerClass} selects how the graph is read: an {@code InputRDD}
 * implementation is instantiated through its no-argument constructor, while an
 * {@code InputFormat} implementation is read through {@code InputFormatRDD}.
 *
 * @param location    the input location written into the configuration
 * @param readerClass an {@code InputRDD} or {@code InputFormat} class
 * @param totalLines  the maximum number of elements to take
 * @return an iterator over the vertices that were read
 * @throws IllegalArgumentException if {@code readerClass} is neither kind of
 *         reader, or if reading fails (the failure is attached as the cause)
 */
@Override
public Iterator<Vertex> head(final String location, final Class readerClass, final int totalLines) {
  final Configuration configuration = new BaseConfiguration();
  configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, location);
  configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, readerClass.getCanonicalName());
  try {
    // Resolve the RDD source first; null means the class is unsupported.
    final InputRDD source;
    if (InputRDD.class.isAssignableFrom(readerClass)) {
      source = (InputRDD) readerClass.getConstructor().newInstance();
    } else if (InputFormat.class.isAssignableFrom(readerClass)) {
      source = new InputFormatRDD();
    } else {
      source = null;
    }
    if (source != null) {
      return IteratorUtils.map(
          source.readGraphRDD(configuration, new JavaSparkContext(Spark.getContext()))
              .take(totalLines)
              .iterator(),
          tuple -> tuple._2().get());
    }
  } catch (final Exception e) {
    throw new IllegalArgumentException(e.getMessage(), e);
  }
  throw new IllegalArgumentException("The provided parserClass must be an " + InputFormat.class.getCanonicalName() + " or an " + InputRDD.class.getCanonicalName() + ": " + readerClass.getCanonicalName());
}
示例11: readInputFormat
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Reads {@code path} with the configured Hadoop InputFormat and turns the
 * resulting key/value pairs into a DataFrame using the configured translator.
 *
 * <p>The InputFormat, key and value class names are taken from {@code config}
 * and loaded by name.
 *
 * @param path the location to read
 * @return the translated rows, using the translator's schema
 * @throws Exception if a configured class cannot be loaded or reading fails
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private Dataset<Row> readInputFormat(String path) throws Exception {
  final String formatClassName = config.getString(INPUT_FORMAT_TYPE_CONFIG);
  final String keyClassName = config.getString(INPUT_FORMAT_KEY_CONFIG);
  final String valueClassName = config.getString(INPUT_FORMAT_VALUE_CONFIG);

  LOG.debug("Reading InputFormat[{}]: {}", formatClassName, path);

  // Load the three classes by name; the format must be an InputFormat subtype.
  final Class<?> loadedFormat = Class.forName(formatClassName);
  final Class<? extends InputFormat> formatClass = loadedFormat.asSubclass(InputFormat.class);
  final Class<?> keyClass = Class.forName(keyClassName);
  final Class<?> valueClass = Class.forName(valueClassName);

  @SuppressWarnings("resource")
  JavaSparkContext sparkContext =
      new JavaSparkContext(Contexts.getSparkSession().sparkContext());
  JavaPairRDD<?, ?> pairs =
      sparkContext.newAPIHadoopFile(path, formatClass, keyClass, valueClass, new Configuration());

  TranslateFunction translator = new TranslateFunction(config.getConfig("translator"));
  return Contexts.getSparkSession()
      .createDataFrame(pairs.flatMap(translator), translator.getSchema());
}
示例12: getConfiguration
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Returns Hadoop configuration for reading data from Elasticsearch. Configuration object should
 * have InputFormat class, key class and value class to be set. Mandatory fields for ESInputFormat
 * to be set are es.resource, es.nodes, es.port, es.internal.es.version, es.nodes.wan.only. Please
 * refer <a href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
 * >Elasticsearch Configuration</a> for more details.
 *
 * <p>The user name and password are only written when the options supply them, so an
 * Elasticsearch instance without security can be used with the credentials left unset.
 *
 * @param options test options providing the server address, port and optional credentials
 * @return a new configuration populated for EsInputFormat
 */
private static Configuration getConfiguration(HIFTestOptions options) {
  Configuration conf = new Configuration();
  conf.set(ConfigurationOptions.ES_NODES, options.getElasticServerIp());
  conf.set(ConfigurationOptions.ES_PORT, options.getElasticServerPort().toString());
  conf.set(ConfigurationOptions.ES_NODES_WAN_ONLY, TRUE);
  // Set username and password only if Elasticsearch is configured with security.
  // Hadoop's Configuration.set rejects null values, so absent credentials are skipped.
  String userName = options.getElasticUserName();
  if (userName != null) {
    conf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, userName);
  }
  String password = options.getElasticPassword();
  if (password != null) {
    conf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, password);
  }
  conf.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
  conf.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
  conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);
  conf.setClass("mapreduce.job.inputformat.class",
      org.elasticsearch.hadoop.mr.EsInputFormat.class, InputFormat.class);
  conf.setClass("key.class", Text.class, Object.class);
  conf.setClass("value.class", LinkedMapWritable.class, Object.class);
  // Optimizations added to change the max docs per partition, scroll size and batch size of
  // bytes to improve the test time for large data
  conf.set("es.input.max.docs.per.partition", "50000");
  conf.set("es.scroll.size", "400");
  conf.set("es.batch.size.bytes", "8mb");
  return conf;
}
示例13: getConfiguration
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Returns Hadoop configuration for reading data from Cassandra. To read data from Cassandra using
 * HadoopInputFormatIO, following properties must be set: InputFormat class, InputFormat key
 * class, InputFormat value class, Thrift address, Thrift port, partitioner class, keyspace and
 * columnfamily name.
 *
 * <p>The user name and password are only written when the options supply them, so a Cassandra
 * instance without security can be used with the credentials left unset.
 *
 * @param options test options providing the server address, port and optional credentials
 * @return a new configuration populated for CqlInputFormat
 */
private static Configuration getConfiguration(HIFTestOptions options) {
  Configuration conf = new Configuration();
  conf.set(CASSANDRA_THRIFT_PORT_PROPERTY, options.getCassandraServerPort().toString());
  conf.set(CASSANDRA_THRIFT_ADDRESS_PROPERTY, options.getCassandraServerIp());
  conf.set(CASSANDRA_PARTITIONER_CLASS_PROPERTY, CASSANDRA_PARTITIONER_CLASS_VALUE);
  conf.set(CASSANDRA_KEYSPACE_PROPERTY, CASSANDRA_KEYSPACE);
  conf.set(CASSANDRA_COLUMNFAMILY_PROPERTY, CASSANDRA_TABLE);
  // Set user name and password only if the Cassandra instance has security configured.
  // Hadoop's Configuration.set rejects null values, so absent credentials are skipped.
  String userName = options.getCassandraUserName();
  String password = options.getCassandraPassword();
  if (userName != null) {
    conf.set(USERNAME, userName);
  }
  if (password != null) {
    conf.set(PASSWORD, password);
  }
  if (userName != null) {
    conf.set(INPUT_KEYSPACE_USERNAME_CONFIG, userName);
  }
  if (password != null) {
    conf.set(INPUT_KEYSPACE_PASSWD_CONFIG, password);
  }
  conf.setClass("mapreduce.job.inputformat.class",
      org.apache.cassandra.hadoop.cql3.CqlInputFormat.class, InputFormat.class);
  conf.setClass("key.class", java.lang.Long.class, Object.class);
  conf.setClass("value.class", com.datastax.driver.core.Row.class, Object.class);
  return conf;
}
示例14: testReadValidationFailsWithWrongInputTypeKeyTranslationFunction
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that {@link HadoopInputFormatIO.Read#validateTransform()
 * Read.validateTransform()} rejects a key translation function whose input type differs from
 * the InputFormat's key class (the "key.class" property of the configuration).
 */
@Test
public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
  // Key translation declared over LongWritable input.
  SimpleFunction<LongWritable, String> mismatchedKeyTranslation =
      new SimpleFunction<LongWritable, String>() {
        @Override
        public String apply(LongWritable input) {
          return input.toString();
        }
      };

  HadoopInputFormatIO.Read<String, Employee> read =
      HadoopInputFormatIO.<String, Employee>read()
          .withConfiguration(serConf.get())
          .withKeyTranslation(mismatchedKeyTranslation);

  thrown.expect(IllegalArgumentException.class);
  String expectedMessage =
      String.format(
          "Key translation's input type is not same as hadoop InputFormat : %s key " + "class : %s",
          serConf.get().getClass("mapreduce.job.inputformat.class", InputFormat.class),
          serConf.get().getClass("key.class", Object.class));
  thrown.expectMessage(expectedMessage);

  read.validateTransform();
}
示例15: testReadValidationFailsWithWrongInputTypeValueTranslationFunction
import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that {@link HadoopInputFormatIO.Read#validateTransform()
 * Read.validateTransform()} rejects a value translation function whose input type differs
 * from the InputFormat's value class (the "value.class" property of the configuration).
 */
@Test
public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
  // Value translation declared over LongWritable input.
  SimpleFunction<LongWritable, String> mismatchedValueTranslation =
      new SimpleFunction<LongWritable, String>() {
        @Override
        public String apply(LongWritable input) {
          return input.toString();
        }
      };

  HadoopInputFormatIO.Read<Text, String> read =
      HadoopInputFormatIO.<Text, String>read()
          .withConfiguration(serConf.get())
          .withValueTranslation(mismatchedValueTranslation);

  // The expected message names the configured InputFormat and value classes.
  Class<?> inputFormatClass =
      serConf.get().getClass("mapreduce.job.inputformat.class", InputFormat.class);
  Class<?> valueClass = serConf.get().getClass("value.class", Object.class);
  String expectedMessage =
      String.format(
          "Value translation's input type is not same as hadoop InputFormat : "
              + "%s value class : %s",
          inputFormatClass,
          valueClass);

  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(expectedMessage);
  read.validateTransform();
}