This article collects typical usage examples of the org.apache.hadoop.mapreduce.InputSplit class in Java. If you are wondering how the Java InputSplit class is used in practice, or what real InputSplit code looks like, the curated class examples below should help.
The InputSplit class belongs to the org.apache.hadoop.mapreduce package. A total of 15 code examples of the InputSplit class are shown below, ordered by popularity.
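Before the examples, here is a minimal, self-contained sketch of where InputSplit typically appears in a custom InputFormat: getSplits() builds the list of splits that the framework later hands back to createRecordReader(). The SingleFileInputFormat class and the hard-coded input path are hypothetical, used only to illustrate the API shape.
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical InputFormat that always reads one fixed file as a single split.
public class SingleFileInputFormat extends TextInputFormat {
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    Path file = new Path("/data/input.txt"); // illustrative path, not from the examples below
    long length = file.getFileSystem(context.getConfiguration())
        .getFileStatus(file).getLen();
    // One split covering the whole file, with no preferred hosts.
    return Collections.<InputSplit>singletonList(
        new FileSplit(file, 0, length, new String[0]));
  }
}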
Example 1: addCreatedSplit
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private void addCreatedSplit(List<InputSplit> splitList,
Collection<String> locations,
ArrayList<OneBlockInfo> validBlocks) {
// create an input split
Path[] fl = new Path[validBlocks.size()];
long[] offset = new long[validBlocks.size()];
long[] length = new long[validBlocks.size()];
for (int i = 0; i < validBlocks.size(); i++) {
fl[i] = validBlocks.get(i).onepath;
offset[i] = validBlocks.get(i).offset;
length[i] = validBlocks.get(i).length;
}
// add this split to the list that is returned
CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
length, locations.toArray(new String[0]));
splitList.add(thissplit);
}
Example 2: XMLRecordReader
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* Initialize the input resource and related parameters; this could also be done in the initialize() method.
* @param inputSplit
* @param context
* @throws IOException
*/
public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
/**
* Get the start and end tags passed in via the configuration
*/
startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
FileSplit fileSplit = (FileSplit) inputSplit;
/**
* Get the start and end positions of this split
*/
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(context);
/**
* Open an HDFS input stream for the split's file
*/
fsin = fs.open(fileSplit.getPath());
/**
* Seek to the start of the split
*/
fsin.seek(start);
}
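A reader like this is normally handed out by a custom InputFormat. The sketch below shows one way to wire it up; the XMLInputFormat wrapper and the LongWritable/Text key-value types are assumptions, since the snippet above only shows the reader's constructor.
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical wrapper that returns the XMLRecordReader shown above for each split.
public class XMLInputFormat extends FileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    // The reader does its setup in the constructor, so there is nothing left to defer to initialize().
    return new XMLRecordReader(split, context.getConfiguration());
  }
}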
Example 3: createSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
int numMaps = getNumMapTasks(jobContext.getConfiguration());
final int nSplits = Math.min(numMaps, chunks.size());
List<InputSplit> splits = new ArrayList<>(nSplits);
for (int i = 0; i < nSplits; ++i) {
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
chunks.get(i).assignTo(taskId);
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
// Setting non-zero length for FileSplit size, to avoid a possible
// future when 0-sized file-splits are considered "empty" and skipped
// over.
getMinRecordsPerChunk(jobContext.getConfiguration()), null));
}
ConfigurationUtil.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
return splits;
}
Example 4: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
List<InputSplit> allSplits = new ArrayList<InputSplit>();
Scan originalScan = getScan();
Scan[] scans = rowKeyDistributor.getDistributedScans(originalScan);
for (Scan scan : scans) {
// Internally, super.getSplits(...) uses the scan object stored in a private variable;
// to reuse the superclass code we swap in each distributed scan before delegating.
setScan(scan);
List<InputSplit> splits = super.getSplits(context);
allSplits.addAll(splits);
}
// Setting original scan back
setScan(originalScan);
return allSplits;
}
Example 5: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
int numMappers = ConfigurationHelper.getJobNumMaps(job);
String boundaryQuery = getDBConf().getInputBoundingQuery();
// Fall back to the base class if dataslice-aligned import is not requested,
// this is not a table-based extract, a boundary query has been supplied,
// or only one mapper is used.
if (!getConf().getBoolean(
NetezzaManager.NETEZZA_DATASLICE_ALIGNED_ACCESS_OPT, false)
|| getDBConf().getInputTableName() == null
|| numMappers == 1
|| (boundaryQuery != null && !boundaryQuery.isEmpty())) {
return super.getSplits(job);
}
// Generate a splitter that splits only on datasliceid. It is an
// integer split. We will just use the lower bounding query to specify
// the restriction of dataslice and set the upper bound to a constant
NetezzaDBDataSliceSplitter splitter = new NetezzaDBDataSliceSplitter();
return splitter.split(getConf(), null, null);
}
Author: aliyun, Project: aliyun-maxcompute-data-collectors, Lines of code: 27, Source file: NetezzaDataDrivenDBInputFormat.java
Example 6: testSplitLocationInfo
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Test
public void testSplitLocationInfo() throws Exception {
Configuration conf = getConfiguration();
conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
"test:///a1/a2");
Job job = Job.getInstance(conf);
TextInputFormat fileInputFormat = new TextInputFormat();
List<InputSplit> splits = fileInputFormat.getSplits(job);
String[] locations = splits.get(0).getLocations();
Assert.assertEquals(2, locations.length);
SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
Assert.assertEquals(2, locationInfo.length);
SplitLocationInfo localhostInfo = locations[0].equals("localhost") ?
locationInfo[0] : locationInfo[1];
SplitLocationInfo otherhostInfo = locations[0].equals("otherhost") ?
locationInfo[0] : locationInfo[1];
Assert.assertTrue(localhostInfo.isOnDisk());
Assert.assertTrue(localhostInfo.isInMemory());
Assert.assertTrue(otherhostInfo.isOnDisk());
Assert.assertFalse(otherhostInfo.isInMemory());
}
Example 7: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
// Set the max split size based on the number of map tasks we want.
long numTasks = getNumMapTasks(job);
long numFileBytes = getJobSize(job);
long maxSplitSize = numFileBytes / numTasks;
setMaxSplitSize(maxSplitSize);
LOG.debug("Target numMapTasks=" + numTasks);
LOG.debug("Total input bytes=" + numFileBytes);
LOG.debug("maxSplitSize=" + maxSplitSize);
List<InputSplit> splits = super.getSplits(job);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated splits:");
for (InputSplit split : splits) {
LOG.debug(" " + split);
}
}
return splits;
}
Example 8: split
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> split(Configuration conf, ResultSet results,
String colName) {
// For each mapper we add a split whose lower-bound WHERE clause is
// "datasliceid % numSplits = <mapper index>".
// Only the lower bound carries the real restriction; for the upper bound
// we supply a constant clause that always evaluates to true.
int numSplits = ConfigurationHelper.getConfNumMaps(conf);
List<InputSplit> splitList = new ArrayList<InputSplit>(numSplits);
for (int i = 0; i < numSplits; ++i) {
StringBuilder lowerBoundClause = new StringBuilder(128);
lowerBoundClause.append(" datasliceid % ").append(numSplits)
.append(" = ").append(i);
splitList.add(new DataDrivenDBInputSplit(lowerBoundClause.toString(),
"1 = 1"));
}
return splitList;
}
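To make the generated conditions concrete: with numSplits = 4, the four splits carry the lower-bound clauses "datasliceid % 4 = 0" through "datasliceid % 4 = 3", each paired with the always-true upper bound "1 = 1", so every data slice is read by exactly one mapper. (The value 4 is only an illustrative choice.)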
Example 9: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/** Partitions the summation into parts and then returns them as splits. */
@Override
public List<InputSplit> getSplits(JobContext context) {
//read sigma from conf
final Configuration conf = context.getConfiguration();
final Summation sigma = SummationWritable.read(DistSum.class, conf);
final int nParts = conf.getInt(N_PARTS, 0);
//create splits
final List<InputSplit> splits = new ArrayList<InputSplit>(nParts);
final Summation[] parts = sigma.partition(nParts);
for(int i = 0; i < parts.length; ++i) {
splits.add(new SummationSplit(parts[i]));
//LOG.info("parts[" + i + "] = " + parts[i]);
}
return splits;
}
Example 10: readFields
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* {@inheritDoc}
* @throws IOException If the child InputSplit cannot be read, typically
* for failing access checks.
*/
@SuppressWarnings("unchecked") // Generic array assignment
public void readFields(DataInput in) throws IOException {
int card = WritableUtils.readVInt(in);
if (splits == null || splits.length != card) {
splits = new InputSplit[card];
}
Class<? extends InputSplit>[] cls = new Class[card];
try {
for (int i = 0; i < card; ++i) {
cls[i] =
Class.forName(Text.readString(in)).asSubclass(InputSplit.class);
}
for (int i = 0; i < card; ++i) {
splits[i] = ReflectionUtils.newInstance(cls[i], null);
SerializationFactory factory = new SerializationFactory(conf);
Deserializer deserializer = factory.getDeserializer(cls[i]);
deserializer.open((DataInputStream)in);
splits[i] = (InputSplit)deserializer.deserialize(splits[i]);
}
} catch (ClassNotFoundException e) {
throw new IOException("Failed split init", e);
}
}
Example 11: getNewFileSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* Solve the schedule and modify the FileSplit array to reflect the new
* schedule. It will move placed splits to the front and unplaceable splits
* to the end.
* @return a new list of FileSplits that are modified to have the
* best host as the only host.
* @throws IOException
*/
public List<InputSplit> getNewFileSplits() throws IOException {
solve();
FileSplit[] result = new FileSplit[realSplits.length];
int left = 0;
int right = realSplits.length - 1;
for(int i=0; i < splits.length; ++i) {
if (splits[i].isAssigned) {
// copy the split and fix up the locations
String[] newLocations = {splits[i].locations.get(0).hostname};
realSplits[i] = new FileSplit(realSplits[i].getPath(),
realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
result[left++] = realSplits[i];
} else {
result[right--] = realSplits[i];
}
}
List<InputSplit> ret = new ArrayList<InputSplit>();
for (FileSplit fs : result) {
ret.add(fs);
}
return ret;
}
Example 12: createRecordReader
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public RecordReader<LongWritable, LongWritable> createRecordReader(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
int taskId = context.getTaskAttemptID().getTaskID().getId();
int numMapTasks = context.getConfiguration().getInt(NUM_MAPS_KEY, NUM_MAPS);
int numIterations = context.getConfiguration().getInt(NUM_IMPORT_ROUNDS_KEY, NUM_IMPORT_ROUNDS);
int iteration = context.getConfiguration().getInt(ROUND_NUM_KEY, 0);
taskId = taskId + iteration * numMapTasks;
numMapTasks = numMapTasks * numIterations;
long chainId = Math.abs(new Random().nextLong());
chainId = chainId - (chainId % numMapTasks) + taskId; // ensure that chainId is unique per task and across iterations
LongWritable[] keys = new LongWritable[] {new LongWritable(chainId)};
return new FixedRecordReader<LongWritable, LongWritable>(keys, keys);
}
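To see why the chainId adjustment works, take an illustrative case: with an effective numMapTasks of 8 and an adjusted taskId of 3, a random value of 1234567 becomes 1234567 - (1234567 % 8) + 3 = 1234563, which leaves remainder 3 when divided by 8. Each task therefore gets a chainId in its own residue class, which keeps the ids distinct across tasks and iterations. (The numbers are illustrative only.)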
Example 13: map
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
public void map(LongWritable key, Record val, Context context)
throws IOException, InterruptedException{
try {
odpsImpl.parse(val);
context.write(odpsImpl, NullWritable.get());
} catch (Exception e) {
LOG.error("Exception raised during data export");
LOG.error("Exception: ", e);
LOG.error("On input: " + val);
LOG.error("At position " + key);
InputSplit is = context.getInputSplit();
LOG.error("");
LOG.error("Currently processing split:");
LOG.error(is);
LOG.error("");
LOG.error("This issue might not necessarily be caused by current input");
LOG.error("due to the batching nature of export.");
LOG.error("");
throw new IOException("Can't export data, please check failed map task logs", e);
}
}
Example 14: testRandomLocation
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private void testRandomLocation(int locations, int njobs,
UserGroupInformation ugi) throws Exception {
Configuration configuration = new Configuration();
DebugJobProducer jobProducer = new DebugJobProducer(njobs, configuration);
Configuration jconf = GridmixTestUtils.mrvl.getConfig();
jconf.setInt(JobCreator.SLEEPJOB_RANDOM_LOCATIONS, locations);
JobStory story;
int seq = 1;
while ((story = jobProducer.getNextJob()) != null) {
GridmixJob gridmixJob = JobCreator.SLEEPJOB.createGridmixJob(jconf, 0,
story, new Path("ignored"), ugi, seq++);
gridmixJob.buildSplits(null);
List<InputSplit> splits = new SleepJob.SleepInputFormat()
.getSplits(gridmixJob.getJob());
for (InputSplit split : splits) {
assertEquals(locations, split.getLocations().length);
}
}
jobProducer.close();
}
Example 15: testNumInputFilesRecursively
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Test
public void testNumInputFilesRecursively() throws Exception {
Configuration conf = getConfiguration();
conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
Job job = Job.getInstance(conf);
FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
List<InputSplit> splits = fileInputFormat.getSplits(job);
Assert.assertEquals("Input splits are not correct", 3, splits.size());
verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
"test:/a1/file1"), splits);
// Using the deprecated configuration
conf = getConfiguration();
conf.set("mapred.input.dir.recursive", "true");
job = Job.getInstance(conf);
splits = fileInputFormat.getSplits(job);
verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
"test:/a1/file1"), splits);
}