当前位置: 首页>>代码示例>>Java>>正文


Java InputFormat类代码示例

本文整理汇总了Java中org.apache.hadoop.mapreduce.InputFormat的典型用法代码示例。如果您正苦于以下问题：Java InputFormat类的具体用法？Java InputFormat怎么用？Java InputFormat使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


InputFormat类属于org.apache.hadoop.mapreduce包,在下文中一共展示了InputFormat类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getInputFormatClass

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Selects the {@link InputFormat} class for this job.
 *
 * <p>When {@code isHCatJob} is set, the class supplied by
 * {@code SqoopHCatUtilities} is used. Otherwise Avro and Parquet file types
 * map to their dedicated formats, and every other file type uses the class
 * returned by the superclass, falling back to {@code ExportInputFormat} when
 * the superclass returns {@code null}.
 *
 * @return the InputFormat class to use
 * @throws ClassNotFoundException declared by the overridden method and
 *         propagated from the superclass lookup
 */
@Override
protected Class<? extends InputFormat> getInputFormatClass()
    throws ClassNotFoundException {
  if (isHCatJob) {
    return SqoopHCatUtilities.getInputFormatClass();
  }

  // File types with a dedicated format return immediately; anything else
  // falls through to the superclass lookup below.
  switch (fileType) {
    case AVRO_DATA_FILE:
      return AvroInputFormat.class;
    case PARQUET_FILE:
      return DatasetKeyInputFormat.class;
    default:
      break;
  }

  final Class<? extends InputFormat> inherited = super.getInputFormatClass();
  if (inherited != null) {
    return inherited;
  }
  return ExportInputFormat.class;
}
 
开发者ID:aliyun,项目名称:aliyun-maxcompute-data-collectors,代码行数:21,代码来源:HdfsOdpsImportJob.java

示例2: testReinit

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Checks that a split containing multiple files works correctly, with the
 * child RecordReader getting its initialize() method called a second time.
 */
@Test
public void testReinit() throws Exception {
  Configuration configuration = new Configuration();
  TaskAttemptID attemptId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  TaskAttemptContext taskContext =
      new TaskAttemptContextImpl(configuration, attemptId);

  // Two one-byte files packed into a single combined split.
  Path [] paths = { new Path("file1"), new Path("file2") };
  long [] sizes = { 1, 1 };
  CombineFileSplit combinedSplit = new CombineFileSplit(paths, sizes);

  // The format hands back a CombineFileRecordReader that itself wraps a
  // DummyRecordReader.
  InputFormat format = new ChildRRInputFormat();
  RecordReader reader = format.createRecordReader(combinedSplit, taskContext);
  assertTrue("Unexpected RR type!", reader instanceof CombineFileRecordReader);

  // Normally MapTask issues the first initialize() call; do it by hand here.
  reader.initialize(combinedSplit, taskContext);

  // The first value is the first file name.
  assertTrue(reader.nextKeyValue());
  assertEquals("file1", reader.getCurrentValue().toString());

  // The inner reader is exhausted after one (k, v) pair, but the combined
  // reader still has a second sub-split, so it reports true.
  assertTrue(reader.nextKeyValue());

  // The second child reader must have been initialized correctly.
  assertEquals("file2", reader.getCurrentValue().toString());

  // Once both children have emitted their single pair, reading is over.
  assertFalse(reader.nextKeyValue());
}
 
开发者ID:naver,项目名称:hadoop,代码行数:39,代码来源:TestCombineFileInputFormat.java

示例3: readSplit

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Reads all records of one split and returns a copy of every value.
 *
 * <p>A record reader is created from {@code format}, initialized with a
 * {@code MapContextImpl} built on a dummy task attempt context, and drained
 * with {@code nextKeyValue()}. Each value is copied into a new {@code Text}
 * before it is added to the result. The reader is always closed before this
 * method returns, including when initialization or reading throws.
 *
 * @param format the input format used to create the record reader
 * @param split the split to read
 * @param job the job whose configuration is used for the dummy contexts
 * @return the values read from the split, in the order the reader produced them
 * @throws IOException if creating, reading or closing the reader fails
 * @throws InterruptedException if the reader is interrupted
 */
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
  InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
    MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  try {
    MapContext<LongWritable,Text,LongWritable,Text> mcontext =
      new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(),
      split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
      // Copy the value so later reads cannot affect what was collected.
      result.add(new Text(reader.getCurrentValue()));
    }
  } finally {
    // Release the reader's underlying resources even if reading failed.
    reader.close();
  }
  return result;
}
 
开发者ID:naver,项目名称:hadoop,代码行数:20,代码来源:TestCombineTextInputFormat.java

示例4: testAddInputPathWithMapper

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Registers two input paths, each with its own input format and mapper, and
 * checks that MultipleInputs reports both associations back from the job.
 */
@SuppressWarnings("unchecked")
public void testAddInputPathWithMapper() throws IOException {
  final Path fooPath = new Path("/foo");
  final Path barPath = new Path("/bar");

  final Job job = Job.getInstance();
  MultipleInputs.addInputPath(job, fooPath, TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(job, barPath, KeyValueTextInputFormat.class,
      KeyValueMapClass.class);

  final Map<Path, InputFormat> formatsByPath =
      MultipleInputs.getInputFormatMap(job);
  final Map<Path, Class<? extends Mapper>> mappersByPath =
      MultipleInputs.getMapperTypeMap(job);

  // The format map holds instances, so compare their runtime classes.
  assertEquals(TextInputFormat.class, formatsByPath.get(fooPath).getClass());
  assertEquals(KeyValueTextInputFormat.class,
      formatsByPath.get(barPath).getClass());

  // The mapper map holds the classes themselves.
  assertEquals(MapClass.class, mappersByPath.get(fooPath));
  assertEquals(KeyValueMapClass.class, mappersByPath.get(barPath));
}
 
开发者ID:naver,项目名称:hadoop,代码行数:19,代码来源:TestMultipleInputs.java

示例5: main

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Builds and runs a pipeline that reads a DwC-A archive through
 * {@code DwCAInputFormat}, converts each record to an
 * {@code UntypedOccurrence} and writes the result as Avro.
 *
 * <p>The input and output locations are fixed HDFS paths; {@code args} is
 * only passed on to {@code newPipeline}.
 *
 * @param args command line arguments forwarded to the pipeline factory
 */
public static void main(String[] args) {
  final String sourceArchive = "hdfs://ha-nn/tmp/dwca-lep5.zip";
  final String avroTarget = "hdfs://ha-nn/tmp/dwca-lep5.avro";

  // Hadoop settings consumed by HadoopInputFormatIO; defaults are assumed to
  // be on the classpath.
  Configuration hadoopConf = new Configuration();
  hadoopConf.setClass("mapreduce.job.inputformat.class", DwCAInputFormat.class, InputFormat.class);
  hadoopConf.setStrings("mapreduce.input.fileinputformat.inputdir", sourceArchive);
  hadoopConf.setClass("key.class", Text.class, Object.class);
  hadoopConf.setClass("value.class", ExtendedRecord.class, Object.class);

  Pipeline pipeline = newPipeline(args, hadoopConf);
  Coders.registerAvroCoders(
    pipeline, UntypedOccurrence.class, TypedOccurrence.class, ExtendedRecord.class);

  // Read -> convert -> write.
  PCollection<KV<Text, ExtendedRecord>> archiveRecords = pipeline.apply(
    "Read DwC-A",
    HadoopInputFormatIO.<Text, ExtendedRecord>read().withConfiguration(hadoopConf));

  PCollection<UntypedOccurrence> occurrences = archiveRecords.apply(
    "Convert to Avro",
    ParDo.of(fromExtendedRecordKVP()));

  occurrences.apply(
    "Write Avro files",
    AvroIO.write(UntypedOccurrence.class).to(avroTarget));

  LOG.info("Starting the pipeline");
  PipelineResult outcome = pipeline.run();
  outcome.waitUntilFinish();
  LOG.info("Pipeline finished with state: {} ", outcome.getState());
}
 
开发者ID:gbif,项目名称:pipelines,代码行数:25,代码来源:DwCA2AvroPipeline.java

示例6: testInputFormat

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Runs a map-only job with the given input format and verifies the counters
 * published under the {@code TestTableInputFormat} class name.
 *
 * @param clazz the input format class the job should read with
 * @throws IOException if the job cannot be created or run
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be resolved
 */
void testInputFormat(Class<? extends InputFormat> clazz)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Counter group names all share the test class name as a prefix.
  final String owner = TestTableInputFormat.class.getName();
  final String rowGroup = owner + ":row";
  final String familyGroup = owner + ":family";
  final String valueGroup = owner + ":value";

  final Job job = MapreduceTestingShim.createJob(UTIL.getConfiguration());
  job.setInputFormatClass(clazz);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  assertTrue("job failed!", job.waitForCompletion(true));

  // Rows
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2,
      job.getCounters().findCounter(rowGroup, "aaa").getValue());
  assertEquals("Saw any instances of the filtered out row.", 0,
      job.getCounters().findCounter(rowGroup, "bbb").getValue());

  // Column families
  assertEquals("Saw the wrong number of instances of columnA.", 1,
      job.getCounters().findCounter(familyGroup, "columnA").getValue());
  assertEquals("Saw the wrong number of instances of columnB.", 1,
      job.getCounters().findCounter(familyGroup, "columnB").getValue());

  // Values
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2,
      job.getCounters().findCounter(valueGroup, "value aaa").getValue());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0,
      job.getCounters().findCounter(valueGroup, "value bbb").getValue());
}
 
开发者ID:fengchen8086,项目名称:ditb,代码行数:24,代码来源:TestTableInputFormat.java

示例7: LocalMapTask

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Creates a map task bound to one input split.
 *
 * <p>All arguments are stored as-is. The mapper class is then looked up from
 * {@code job}; if that lookup throws {@link ClassNotFoundException}, the
 * error is logged and {@code mapperClass} is not assigned here.
 *
 * @param inputFormat input format stored for this task
 * @param outputFormat output format stored for this task
 * @param conf configuration stored for this task
 * @param id identifier stored for this task
 * @param split the input split stored for this task
 * @param reporter reporter stored for this task
 * @param pctProgress progress holder stored for this task
 */
public LocalMapTask(InputFormat<INKEY, INVALUE> inputFormat,
        OutputFormat<OUTKEY, OUTVALUE> outputFormat,
        Configuration conf, int id, InputSplit split,
        ContentPumpReporter reporter, AtomicInteger pctProgress) {
    // Identity and configuration.
    this.id = id;
    this.conf = conf;
    this.split = split;

    // Formats used for reading and writing.
    this.inputFormat = inputFormat;
    this.outputFormat = outputFormat;

    // Progress reporting.
    this.reporter = reporter;
    this.pctProgress = pctProgress;

    try {
        mapperClass = job.getMapperClass();
    } catch (ClassNotFoundException e) {
        LOG.error("Mapper class not found", e);
    }
}
 
开发者ID:marklogic,项目名称:marklogic-contentpump,代码行数:18,代码来源:LocalJobRunner.java

示例8: testComputeSplitsIfGetSplitsReturnsEmptyList

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that
 * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplitsIfNecessary()}
 * fails with an {@link IOException} when the Hadoop InputFormat's
 * {@link InputFormat#getSplits(JobContext)} returns an empty list.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
  // An InputFormat stub whose getSplits() yields no splits at all.
  ArrayList<InputSplit> noSplits = new ArrayList<InputSplit>();
  InputFormat<?, ?> emptyInputFormat = Mockito.mock(EmployeeInputFormat.class);
  Mockito.when(emptyInputFormat.getSplits(Mockito.any(JobContext.class)))
      .thenReturn(noSplits);

  SerializableSplit stubSplit = Mockito.mock(SerializableSplit.class);
  HadoopInputFormatBoundedSource<Text, Employee> source =
      new HadoopInputFormatBoundedSource<Text, Employee>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          stubSplit);

  thrown.expect(IOException.class);
  thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");

  source.setInputFormatObj(emptyInputFormat);
  source.computeSplitsIfNecessary();
}
 
开发者ID:apache,项目名称:beam,代码行数:25,代码来源:HadoopInputFormatIOTest.java

示例9: testReadersStartWhenZeroRecords

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies the behavior of
 * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} when the
 * record reader created by the InputFormat has no records: {@code start()} must return
 * {@code false} and the fraction consumed must be 1.
 */
@Test
public void testReadersStartWhenZeroRecords() throws Exception {
  // A record reader that reports "no more records" right away.
  EmployeeRecordReader emptyRecordReader = Mockito.mock(EmployeeRecordReader.class);
  Mockito.when(emptyRecordReader.nextKeyValue()).thenReturn(false);

  // An InputFormat that always hands out that empty reader.
  InputFormat stubInputFormat = Mockito.mock(EmployeeInputFormat.class);
  Mockito.when(
      stubInputFormat.createRecordReader(
          Mockito.any(InputSplit.class),
          Mockito.any(TaskAttemptContext.class)))
      .thenReturn(emptyRecordReader);

  InputSplit stubSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
  HadoopInputFormatBoundedSource<Text, Employee> source =
      new HadoopInputFormatBoundedSource<Text, Employee>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit(stubSplit));
  source.setInputFormatObj(stubInputFormat);

  BoundedReader<KV<Text, Employee>> boundedReader = source.createReader(p.getOptions());
  assertEquals(false, boundedReader.start());
  assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
  boundedReader.close();
}
 
开发者ID:apache,项目名称:beam,代码行数:31,代码来源:HadoopInputFormatIOTest.java

示例10: head

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Returns an iterator over up to {@code totalLines} vertices read from
 * {@code location}.
 *
 * <p>{@code readerClass} must be assignable to either {@code InputRDD} or
 * {@link InputFormat}. An {@code InputRDD} class is instantiated through its
 * no-argument constructor; an {@code InputFormat} class is read through
 * {@code InputFormatRDD}.
 *
 * @param location the input location written into the configuration
 * @param readerClass the reader class written into the configuration
 * @param totalLines the number of elements taken from the RDD
 * @return an iterator over the vertices that were read
 * @throws IllegalArgumentException if {@code readerClass} is of neither
 *         supported kind, or if reading fails (the failure is kept as the cause)
 */
@Override
public Iterator<Vertex> head(final String location, final Class readerClass, final int totalLines) {
    final Configuration configuration = new BaseConfiguration();
    configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, location);
    configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, readerClass.getCanonicalName());

    // Reject unsupported reader classes up front, outside the try block, so
    // this exception is not re-wrapped by the catch clause below.
    final boolean isInputRDD = InputRDD.class.isAssignableFrom(readerClass);
    if (!isInputRDD && !InputFormat.class.isAssignableFrom(readerClass)) {
        throw new IllegalArgumentException("The provided parserClass must be an " + InputFormat.class.getCanonicalName() + " or an " + InputRDD.class.getCanonicalName() + ": " + readerClass.getCanonicalName());
    }

    try {
        if (isInputRDD) {
            final InputRDD customReader = (InputRDD) readerClass.getConstructor().newInstance();
            return IteratorUtils.map(
                    customReader.readGraphRDD(configuration, new JavaSparkContext(Spark.getContext())).take(totalLines).iterator(),
                    tuple -> tuple._2().get());
        }
        return IteratorUtils.map(
                new InputFormatRDD().readGraphRDD(configuration, new JavaSparkContext(Spark.getContext())).take(totalLines).iterator(),
                tuple -> tuple._2().get());
    } catch (final Exception e) {
        throw new IllegalArgumentException(e.getMessage(), e);
    }
}
 
开发者ID:PKUSilvester,项目名称:LiteGraph,代码行数:17,代码来源:SparkContextStorage.java

示例11: readInputFormat

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Reads {@code path} through a Hadoop InputFormat and converts the records
 * into a DataFrame.
 *
 * <p>The InputFormat, key and value class names come from the configuration
 * and are loaded reflectively. The resulting pair RDD is flat-mapped with a
 * {@code TranslateFunction} built from the "translator" configuration, whose
 * schema is used for the DataFrame.
 *
 * @param path the location to read
 * @return the translated rows as a DataFrame
 * @throws Exception if a configured class cannot be loaded or reading fails
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private Dataset<Row> readInputFormat(String path) throws Exception {
  // Class names as configured.
  String formatName = config.getString(INPUT_FORMAT_TYPE_CONFIG);
  String keyName = config.getString(INPUT_FORMAT_KEY_CONFIG);
  String valueName = config.getString(INPUT_FORMAT_VALUE_CONFIG);

  LOG.debug("Reading InputFormat[{}]: {}", formatName, path);

  // Resolve the names; the format must be an InputFormat subclass.
  Class<? extends InputFormat> formatClass =
      Class.forName(formatName).asSubclass(InputFormat.class);
  Class<?> keyClass = Class.forName(keyName);
  Class<?> valueClass = Class.forName(valueName);

  @SuppressWarnings("resource")
  JavaSparkContext sparkContext =
      new JavaSparkContext(Contexts.getSparkSession().sparkContext());
  JavaPairRDD<?, ?> records =
      sparkContext.newAPIHadoopFile(path, formatClass, keyClass, valueClass, new Configuration());

  TranslateFunction translator = new TranslateFunction(config.getConfig("translator"));

  return Contexts.getSparkSession().createDataFrame(records.flatMap(translator), translator.getSchema());
}
 
开发者ID:cloudera-labs,项目名称:envelope,代码行数:21,代码来源:FileSystemInput.java

示例12: getConfiguration

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Builds the Hadoop configuration used to read test data from Elasticsearch.
 *
 * <p>The configuration carries the InputFormat class, key class and value class, the
 * connection settings taken from {@code options} (nodes, port, user name, password), the
 * resource and internal version constants, and a few read-tuning properties. See the <a
 * href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
 * >Elasticsearch Configuration</a> page for details on the individual properties.
 *
 * @param options test options supplying the Elasticsearch server address and credentials
 * @return the populated configuration
 */
private static Configuration getConfiguration(HIFTestOptions options) {
  final Configuration esConf = new Configuration();

  // What to read with, and the key/value types it produces.
  esConf.setClass("mapreduce.job.inputformat.class",
      org.elasticsearch.hadoop.mr.EsInputFormat.class, InputFormat.class);
  esConf.setClass("key.class", Text.class, Object.class);
  esConf.setClass("value.class", LinkedMapWritable.class, Object.class);

  // Where the cluster is.
  esConf.set(ConfigurationOptions.ES_NODES, options.getElasticServerIp());
  esConf.set(ConfigurationOptions.ES_PORT, options.getElasticServerPort().toString());
  esConf.set(ConfigurationOptions.ES_NODES_WAN_ONLY, TRUE);

  // Credentials, used when Elasticsearch is configured with security.
  esConf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, options.getElasticUserName());
  esConf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, options.getElasticPassword());

  // Which resource to read and how the index is handled.
  esConf.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
  esConf.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
  esConf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);

  // Max docs per partition, scroll size and batch size in bytes, set to improve the test
  // time for large data.
  esConf.set("es.input.max.docs.per.partition", "50000");
  esConf.set("es.scroll.size", "400");
  esConf.set("es.batch.size.bytes", "8mb");

  return esConf;
}
 
开发者ID:apache,项目名称:beam,代码行数:30,代码来源:HIFIOElasticIT.java

示例13: getConfiguration

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Builds the Hadoop configuration used to read test data from Cassandra.
 *
 * <p>The configuration carries the InputFormat class, its key and value classes, the Thrift
 * address and port taken from {@code options}, the partitioner class, the keyspace and the
 * column family name, plus the user name and password from {@code options}.
 *
 * @param options test options supplying the Cassandra server address and credentials
 * @return the populated configuration
 */
private static Configuration getConfiguration(HIFTestOptions options) {
  final Configuration cassandraConf = new Configuration();

  // What to read with, and the key/value types it produces.
  cassandraConf.setClass("mapreduce.job.inputformat.class",
      org.apache.cassandra.hadoop.cql3.CqlInputFormat.class, InputFormat.class);
  cassandraConf.setClass("key.class", java.lang.Long.class, Object.class);
  cassandraConf.setClass("value.class", com.datastax.driver.core.Row.class, Object.class);

  // Where the server is.
  cassandraConf.set(CASSANDRA_THRIFT_PORT_PROPERTY,
      options.getCassandraServerPort().toString());
  cassandraConf.set(CASSANDRA_THRIFT_ADDRESS_PROPERTY, options.getCassandraServerIp());

  // What to read.
  cassandraConf.set(CASSANDRA_PARTITIONER_CLASS_PROPERTY, CASSANDRA_PARTITIONER_CLASS_VALUE);
  cassandraConf.set(CASSANDRA_KEYSPACE_PROPERTY, CASSANDRA_KEYSPACE);
  cassandraConf.set(CASSANDRA_COLUMNFAMILY_PROPERTY, CASSANDRA_TABLE);

  // Credentials, used when the Cassandra instance has security configured.
  cassandraConf.set(USERNAME, options.getCassandraUserName());
  cassandraConf.set(PASSWORD, options.getCassandraPassword());
  cassandraConf.set(INPUT_KEYSPACE_USERNAME_CONFIG, options.getCassandraUserName());
  cassandraConf.set(INPUT_KEYSPACE_PASSWD_CONFIG, options.getCassandraPassword());

  return cassandraConf;
}
 
开发者ID:apache,项目名称:beam,代码行数:25,代码来源:HIFIOCassandraIT.java

示例14: testReadValidationFailsWithWrongInputTypeKeyTranslationFunction

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that {@link HadoopInputFormatIO.Read#validateTransform() Read.validateTransform()}
 * rejects a key translation function whose input type differs from the Hadoop InputFormat's
 * key class (the "key.class" property of the configuration).
 */
@Test
public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
  // Key translation that accepts LongWritable as its input type.
  SimpleFunction<LongWritable, String> mismatchedKeyTranslation =
      new SimpleFunction<LongWritable, String>() {
        @Override
        public String apply(LongWritable input) {
          return input.toString();
        }
      };
  HadoopInputFormatIO.Read<String, Employee> read =
      HadoopInputFormatIO.<String, Employee>read()
          .withConfiguration(serConf.get())
          .withKeyTranslation(mismatchedKeyTranslation);

  thrown.expect(IllegalArgumentException.class);
  String expectedMessage =
      String.format(
          "Key translation's input type is not same as hadoop InputFormat : %s key class : %s",
          serConf.get().getClass("mapreduce.job.inputformat.class", InputFormat.class),
          serConf.get().getClass("key.class", Object.class));
  thrown.expectMessage(expectedMessage);

  read.validateTransform();
}
 
开发者ID:apache,项目名称:beam,代码行数:27,代码来源:HadoopInputFormatIOTest.java

示例15: testReadValidationFailsWithWrongInputTypeValueTranslationFunction

import org.apache.hadoop.mapreduce.InputFormat; //导入依赖的package包/类
/**
 * Verifies that {@link HadoopInputFormatIO.Read#validateTransform() Read.validateTransform()}
 * rejects a value translation function whose input type differs from the Hadoop InputFormat's
 * value class (the "value.class" property of the configuration).
 */
@Test
public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
  // Value translation that accepts LongWritable as its input type.
  SimpleFunction<LongWritable, String> mismatchedValueTranslation =
      new SimpleFunction<LongWritable, String>() {
        @Override
        public String apply(LongWritable input) {
          return input.toString();
        }
      };
  HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
      .withConfiguration(serConf.get())
      .withValueTranslation(mismatchedValueTranslation);

  // The message is built before the expectations are registered, as the
  // configuration lookups are evaluated first.
  String message = String.format(
      "Value translation's input type is not same as hadoop InputFormat :  %s value class : %s",
      serConf.get().getClass("mapreduce.job.inputformat.class", InputFormat.class),
      serConf.get().getClass("value.class", Object.class));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(message);

  read.validateTransform();
}
 
开发者ID:apache,项目名称:beam,代码行数:31,代码来源:HadoopInputFormatIOTest.java


注:本文中的org.apache.hadoop.mapreduce.InputFormat类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。