Java FileSplit类代码示例

本文整理汇总了Java中org.apache.hadoop.mapreduce.lib.input.FileSplit类的典型用法代码示例。如果您正苦于以下问题：Java FileSplit类的具体用法？Java FileSplit怎么用？Java FileSplit使用的例子？那么恭喜您，这里精选的FileSplit类代码示例或许可以为您提供帮助。

FileSplit类属于org.apache.hadoop.mapreduce.lib.input包，在下文中一共展示了FileSplit类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Cuts two files into line-based splits, pairs the splits by position and hands
 * each pair to {@code writeInterleavedSplits}.
 *
 * <p>Each pair is written to
 * {@code splitDir/<parent directory name of the first file>_<start offset of the first split>.fq}.
 *
 * @param fst      status of the first input file
 * @param fst2     status of the second input file
 * @param splitDir directory prefix of the output files
 * @param splitlen value passed to {@code NLineInputFormat.getSplitsForFile} as the split size
 * @param sc       Spark context used to distribute the splits
 * @throws IOException if the splits cannot be computed
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

  List<FileSplit> firstSplits = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
  List<FileSplit> secondSplits = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

  // Pair the n-th split of the first file with the n-th split of the second file.
  JavaPairRDD<FileSplit, FileSplit> pairedSplits =
      sc.parallelize(firstSplits).zip(sc.parallelize(secondSplits));

  pairedSplits.foreach(pair -> {
    Path firstFile = pair._1.getPath();
    FastqRecordReader firstReader = new FastqRecordReader(new Configuration(), pair._1);
    FastqRecordReader secondReader = new FastqRecordReader(new Configuration(), pair._2);
    writeInterleavedSplits(
        firstReader,
        secondReader,
        new Configuration(),
        splitDir + "/" + firstFile.getParent().getName() + "_" + pair._1.getStart() + ".fq");
  });
}

开发者ID:NGSeq，项目名称:ViraPipe，代码行数:17，代码来源:InterleaveMulti.java

示例2: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Computes line-based splits for two input files, zips them pairwise and writes
 * every pair through {@code writeInterleavedSplits}.
 *
 * <p>The output file of a pair is named after the parent directory of the first
 * split's file and the first split's start offset, with a {@code .fq} suffix,
 * and is placed under {@code splitDir}.
 *
 * @param fst      status of the first input file
 * @param fst2     status of the second input file
 * @param splitDir directory prefix of the output files
 * @param splitlen split size handed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context that runs the per-pair work
 * @throws IOException if the splits cannot be computed
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

  List<FileSplit> leftSplits = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
  List<FileSplit> rightSplits = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> leftRdd = sc.parallelize(leftSplits);
  JavaRDD<FileSplit> rightRdd = sc.parallelize(rightSplits);

  // zip() matches elements by position, so split i of one file meets split i of the other.
  leftRdd.zip(rightRdd).foreach(both -> {
    Path leftPath = both._1.getPath();
    FastqRecordReader leftReader = new FastqRecordReader(new Configuration(), both._1);
    FastqRecordReader rightReader = new FastqRecordReader(new Configuration(), both._2);

    writeInterleavedSplits(leftReader, rightReader, new Configuration(),
        splitDir + "/" + leftPath.getParent().getName() + "_" + both._1.getStart() + ".fq");
  });
}

开发者ID:NGSeq，项目名称:ViraPipe，代码行数:18，代码来源:Decompress.java

示例3: map

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Routes each ORC record to a key that depends on the size of the file it came from.
 *
 * <p>Records from files smaller than {@code threshold} are written under the
 * parent directory path (with a trailing slash); records from all other files
 * are written under the file's own path. The size of each input file is looked
 * up once via the file system and remembered in {@code fileSizesMap}.
 *
 * {@inheritDoc}
 *
 * @throws IOException if the file system cannot be queried, or if no size could
 *                     be determined for the split's file
 */
protected void map(final Object key, final OrcStruct value, final Context context) throws IOException, InterruptedException {
    // Skip records whose string form is empty.
    // NOTE(review): a null value is not skipped by this guard and is still written below — confirm that is intended.
    if (value != null) {
        final String rendered = value.toString();
        if (rendered != null && rendered.isEmpty()) {
            return;
        }
    }

    // Mapper sends data with parent directory path as keys to retain directory structure
    final FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final Path filePath = fileSplit.getPath();
    final String filePathString = filePath.toString();
    final String parentFilePath = String.format("%s/", filePath.getParent().toString());
    log.debug("Parent file path {}", parentFilePath);

    if (!fileSizesMap.containsKey(filePathString)) {
        // The file system handle is created lazily from the first path seen.
        if (fileSystem == null) {
            final URI uri = URI.create(filePathString);
            fileSystem = FileSystem.get(uri, configuration);
        }
        final FileStatus[] listStatuses = fileSystem.listStatus(filePath);
        for (FileStatus fileStatus : listStatuses) {
            if (!fileStatus.isDirectory()) {
                fileSizesMap.put(fileStatus.getPath().toString(), fileStatus.getLen());
                log.info("Entry added to fileSizes Map {} {}", fileStatus.getPath().toString(), fileStatus.getLen());
            }
        }
    }

    // The listing may register the file under a different string than the split's path;
    // fail with a clear message rather than a NullPointerException from unboxing.
    final Long fileSize = fileSizesMap.get(filePathString);
    if (fileSize == null) {
        throw new IOException("Unable to determine size of input file " + filePathString);
    }

    final OrcValue orcValue = new OrcValue();
    orcValue.value = value;

    if (fileSize < threshold) {
        context.write(new Text(parentFilePath), orcValue);
    } else {
        context.write(new Text(filePathString), orcValue);
    }
}

开发者ID:ExpediaInceCommercePlatform，项目名称:dataSqueeze，代码行数:43，代码来源:OrcCompactionMapper.java

示例4: splitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Cuts one file into line-based splits and writes each split to its own file
 * through {@code writeFastqFile}.
 *
 * <p>Output files are named {@code splitDir/split_<start offset>.<ext>}, where
 * {@code <ext>} is the second dot-separated piece of the input file name. A file
 * name without a dot has no second piece, so the array access inside the
 * per-split task fails.
 *
 * @param fst      status of the input file
 * @param fqPath   path of the input file; only its name is used, to derive the extension
 * @param splitDir directory prefix of the output files
 * @param splitlen split size handed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context that runs the per-split work
 * @throws IOException if the splits cannot be computed
 */
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  // Dot-separated pieces of the input file name.
  String[] nameParts = new Path(fqPath).getName().split("\\.");
  //TODO: Handle also compressed files
  List<FileSplit> chunks = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  sc.parallelize(chunks).foreach(chunk -> {
    FastqRecordReader chunkReader = new FastqRecordReader(new Configuration(), chunk);
    writeFastqFile(
        chunkReader,
        new Configuration(),
        splitDir + "/split_" + chunk.getStart() + "." + nameParts[1]);
  });
}

开发者ID:NGSeq，项目名称:ViraPipe，代码行数:17，代码来源:InterleaveMulti.java

示例5: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Cuts two files into line-based splits, pairs the splits by position and hands
 * each pair to {@code writeInterleavedSplits}.
 *
 * <p>The output directory and the output file name are passed separately; the
 * file name is {@code <parent directory name of the first file>_<start offset of the first split>.fq}.
 *
 * @param fst      status of the first input file
 * @param fst2     status of the second input file
 * @param splitDir output directory handed to {@code writeInterleavedSplits}
 * @param splitlen split size handed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context that runs the per-pair work
 * @throws IOException if the splits cannot be computed
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    // zip() matches elements by position: split i of the first file with split i of the second.
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}

开发者ID:NGSeq，项目名称:ViraPipe，代码行数:19，代码来源:DecompressInterleave.java

示例6: XMLRecordReader

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Creates a reader whose input stream is positioned at the first byte of the given split.
 *
 * <p>The resource and parameter initialisation done here could equally be
 * performed in the {@code initialize()} method.
 *
 * @param inputSplit the split to read; it is cast to {@link FileSplit}
 * @param context    configuration that supplies the start and end tags under
 *                   {@code START_TAG_KEY} and {@code END_TAG_KEY}
 * @throws IOException if the file cannot be opened or the stream cannot be
 *                     positioned at the split start
 */
public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
    // Start and end tags passed in through the configuration, as UTF-8 bytes.
    startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
    endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
    FileSplit fileSplit = (FileSplit) inputSplit;

    // Byte range covered by this split: [start, end).
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(context);

    // Open an input stream on the split's file.
    fsin = fs.open(file);
    try {
        // Move to the first byte of the split.
        fsin.seek(start);
    } catch (IOException | RuntimeException e) {
        // A failing constructor leaves the caller no object to close, so release the stream here.
        try {
            fsin.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}

开发者ID:lzmhhh123，项目名称:Wikipedia-Index，代码行数:30，代码来源:XmlInputFormat.java

示例7: createSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Builds one {@link FileSplit} per chunk, up to the configured number of map
 * tasks, and assigns each chunk to the map task with the same index.
 *
 * @param jobContext job whose configuration and job id are used
 * @param chunks     chunks to hand out; only the first {@code min(numMaps, chunks.size())} are used
 * @return the created splits, in chunk order
 * @throws IOException declared by the signature
 */
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
  final int splitCount = Math.min(getNumMapTasks(jobContext.getConfiguration()), chunks.size());
  List<InputSplit> created = new ArrayList<>(splitCount);

  int index = 0;
  while (index < splitCount) {
    DynamicInputChunk chunk = chunks.get(index);
    chunk.assignTo(new TaskID(jobContext.getJobID(), TaskType.MAP, index));
    // Setting non-zero length for FileSplit size, to avoid a possible
    // future when 0-sized file-splits are considered "empty" and skipped
    // over.
    created.add(
        new FileSplit(chunk.getPath(), 0, getMinRecordsPerChunk(jobContext.getConfiguration()), null));
    ++index;
  }

  ConfigurationUtil.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, created.size());
  return created;
}

开发者ID:HotelsDotCom，项目名称:circus-train，代码行数:19，代码来源:DynamicInputFormat.java

示例8: setup

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Reads the merge key column name from the configuration and records whether
 * this mapper's input file lies under the "new" or the "old" path.
 *
 * <p>The new path is checked first; a file under neither path is rejected.
 *
 * @throws IOException if the split's file is under neither configured path
 */
@Override
protected void setup(Context context)
    throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  keyColName = conf.get(MergeJob.MERGE_KEY_COL_KEY);

  // The input split is cast to a FileSplit to obtain the file it covers.
  String location = ((FileSplit) context.getInputSplit()).getPath().toString();
  String newRoot = conf.get(MergeJob.MERGE_NEW_PATH_KEY);

  if (location.startsWith(newRoot)) {
    this.isNew = true;
    return;
  }

  String oldRoot = conf.get(MergeJob.MERGE_OLD_PATH_KEY);
  if (location.startsWith(oldRoot)) {
    this.isNew = false;
    return;
  }

  throw new IOException("File " + location + " is not under new path "
      + conf.get(MergeJob.MERGE_NEW_PATH_KEY) + " or old path "
      + conf.get(MergeJob.MERGE_OLD_PATH_KEY));
}

开发者ID:aliyun，项目名称:aliyun-maxcompute-data-collectors，代码行数:23，代码来源:MergeMapperBase.java

示例9: testMaxBlockLocationsNewSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Writes a single new-API {@link FileSplit} that lists five locations while the
 * configured maximum is four, then checks that the split meta info read back
 * contains one split with four locations.
 */
@Test
public void testMaxBlockLocationsNewSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
    Path jobDir = new Path(TEST_DIR.getAbsolutePath());
    FileSystem localFs = FileSystem.getLocal(conf);

    // One more location than the limit configured above.
    String[] fiveHosts = { "loc1", "loc2", "loc3", "loc4", "loc5" };
    FileSplit oversized = new FileSplit(new Path("/some/path"), 0, 1, fiveHosts);
    JobSplitWriter.createSplitFiles(jobDir, conf, localFs, new FileSplit[] { oversized });

    JobSplit.TaskSplitMetaInfo[] metaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), localFs, conf, jobDir);

    assertEquals("unexpected number of splits", 1, metaInfos.length);
    assertEquals("unexpected number of split locations",
        4, metaInfos[0].getLocations().length);
  } finally {
    // Remove the scratch directory whether or not the assertions passed.
    FileUtil.fullyDelete(TEST_DIR);
  }
}

开发者ID:naver，项目名称:hadoop，代码行数:23，代码来源:TestJobSplitWriter.java

示例10: testMaxBlockLocationsOldSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Same scenario as the new-API test, but with an old-API
 * {@code org.apache.hadoop.mapred.FileSplit}: five locations are written with a
 * configured maximum of four, and the meta info read back must contain one
 * split with four locations.
 */
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
    Path jobDir = new Path(TEST_DIR.getAbsolutePath());
    FileSystem localFs = FileSystem.getLocal(conf);

    // One more location than the limit configured above.
    String[] fiveHosts = { "loc1", "loc2", "loc3", "loc4", "loc5" };
    org.apache.hadoop.mapred.FileSplit oversized =
        new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1, fiveHosts);
    JobSplitWriter.createSplitFiles(jobDir, conf, localFs,
        new org.apache.hadoop.mapred.InputSplit[] { oversized });

    JobSplit.TaskSplitMetaInfo[] metaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), localFs, conf, jobDir);

    assertEquals("unexpected number of splits", 1, metaInfos.length);
    assertEquals("unexpected number of split locations",
        4, metaInfos[0].getLocations().length);
  } finally {
    // Remove the scratch directory whether or not the assertions passed.
    FileUtil.fullyDelete(TEST_DIR);
  }
}

开发者ID:naver，项目名称:hadoop，代码行数:24，代码来源:TestJobSplitWriter.java

示例11: getSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Returns the splits for the job, reusing the previous result when called again
 * with the same {@code JobContext} instance.
 *
 * <p>The base splits come from the superclass. Unless {@code TeraScheduler.USE}
 * is set to false in the job configuration, they are then replaced by the
 * splits produced by a {@code TeraScheduler}. The time spent in each phase is
 * printed to standard output.
 *
 * @param job the job to compute splits for
 * @return the cached or newly computed splits
 * @throws IOException if computing the splits fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // Identity comparison: only the very same context object hits the cache.
  if (job == lastContext) {
    return lastResult;
  }

  final long baseStarted = System.currentTimeMillis();
  lastContext = job;
  lastResult = super.getSplits(job);
  final long baseFinished = System.currentTimeMillis();
  System.out.println("Spent " + (baseFinished - baseStarted) + "ms computing base-splits.");

  if (!job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
    return lastResult;
  }

  FileSplit[] baseSplits = lastResult.toArray(new FileSplit[0]);
  TeraScheduler scheduler = new TeraScheduler(baseSplits, job.getConfiguration());
  lastResult = scheduler.getNewFileSplits();
  final long schedulerFinished = System.currentTimeMillis();
  System.out.println("Spent " + (schedulerFinished - baseFinished) + "ms computing TeraScheduler splits.");
  return lastResult;
}

开发者ID:naver，项目名称:hadoop，代码行数:21，代码来源:TeraInputFormat.java

示例12: TeraScheduler

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Builds the scheduler's view of the given splits and of the hosts that hold them.
 *
 * <p>Every input split gets a {@code Split} entry named after its path, and every
 * distinct host name found in the split locations gets one {@code Host} entry.
 * Splits and hosts are linked in both directions.
 *
 * @param realSplits the file splits to schedule
 * @param conf       configuration; {@code TTConfig.TT_MAP_SLOTS} gives the slots per host (default 4)
 * @throws IOException if the locations of a split cannot be read
 */
public TeraScheduler(FileSplit[] realSplits,
                     Configuration conf) throws IOException {
  this.realSplits = realSplits;
  this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
  splits = new Split[realSplits.length];

  // Host name -> Host, so each host is created only once.
  Map<String, Host> knownHosts = new HashMap<String, Host>();
  for (int i = 0; i < realSplits.length; i++) {
    FileSplit source = realSplits[i];
    Split entry = new Split(source.getPath().toString());
    splits[remainingSplits++] = entry;

    String[] hostNames = source.getLocations();
    for (int h = 0; h < hostNames.length; h++) {
      String name = hostNames[h];
      Host node = knownHosts.get(name);
      if (node == null) {
        node = new Host(name);
        knownHosts.put(name, node);
        hosts.add(node);
      }
      // Link both ways: the host knows its splits, the split knows its hosts.
      node.splits.add(entry);
      entry.locations.add(node);
    }
  }
}

开发者ID:naver，项目名称:hadoop，代码行数:22，代码来源:TeraScheduler.java

示例13: getNewFileSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Runs the solver and returns the splits reordered to match the resulting
 * schedule: assigned splits are packed at the front, unassigned ones are
 * placed from the back.
 *
 * <p>Each assigned split is replaced, both in {@code realSplits} and in the
 * result, by a copy whose only location is the host name of the first entry in
 * that split's location list.
 *
 * @return a new list of the splits in schedule order
 * @throws IOException if solving or copying a split fails
 */
public List<InputSplit> getNewFileSplits() throws IOException {
  solve();

  final int total = realSplits.length;
  FileSplit[] ordered = new FileSplit[total];
  int head = 0;           // next free slot at the front (assigned splits)
  int tail = total - 1;   // next free slot at the back (unassigned splits)

  for (int i = 0; i < splits.length; ++i) {
    if (!splits[i].isAssigned) {
      ordered[tail--] = realSplits[i];
      continue;
    }
    // copy the split and fix up the locations
    String[] singleHost = new String[] { splits[i].locations.get(0).hostname };
    FileSplit original = realSplits[i];
    realSplits[i] = new FileSplit(original.getPath(),
        original.getStart(), original.getLength(), singleHost);
    ordered[head++] = realSplits[i];
  }

  List<InputSplit> scheduled = new ArrayList<InputSplit>();
  for (int i = 0; i < ordered.length; ++i) {
    scheduled.add(ordered[i]);
  }
  return scheduled;
}

开发者ID:naver，项目名称:hadoop，代码行数:31，代码来源:TeraScheduler.java

示例14: createSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Creates one {@link FileSplit} per chunk, limited by the number of map tasks,
 * and assigns chunk {@code i} to map task {@code i}.
 *
 * <p>The number of splits created is published to the job configuration under
 * {@code CONF_LABEL_NUM_SPLITS}.
 *
 * @param jobContext job whose configuration and job id are used
 * @param chunks     chunks to hand out; surplus chunks beyond the map-task count are not used here
 * @return the created splits, in chunk order
 * @throws IOException declared by the signature
 */
private List<InputSplit> createSplits(JobContext jobContext,
                                      List<DynamicInputChunk> chunks)
        throws IOException {
  final int mapTasks = getNumMapTasks(jobContext.getConfiguration());
  final int limit = Math.min(mapTasks, chunks.size());

  List<InputSplit> produced = new ArrayList<InputSplit>(limit);
  for (int idx = 0; idx < limit; idx++) {
    final DynamicInputChunk current = chunks.get(idx);
    final TaskID owner = new TaskID(jobContext.getJobID(), TaskType.MAP, idx);
    current.assignTo(owner);

    // Setting non-zero length for FileSplit size, to avoid a possible
    // future when 0-sized file-splits are considered "empty" and skipped
    // over.
    produced.add(new FileSplit(
        current.getPath(),
        0,
        getMinRecordsPerChunk(jobContext.getConfiguration()),
        null));
  }

  DistCpUtils.publish(jobContext.getConfiguration(),
                      CONF_LABEL_NUM_SPLITS, produced.size());
  return produced;
}

开发者ID:naver，项目名称:hadoop，代码行数:23，代码来源:DynamicInputFormat.java

示例15: checkSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
 * Asserts that the splits cover the listing file contiguously from offset 0 and
 * that no record can be read after the end of the last split.
 *
 * @param listFile the sequence file the splits were computed from
 * @param splits   the splits to verify; each is cast to {@link FileSplit}
 * @throws IOException if the listing file cannot be opened or read
 */
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
  // Each split must begin exactly where the previous one ended.
  long expectedStart = 0;
  for (InputSplit candidate : splits) {
    FileSplit current = (FileSplit) candidate;
    long actualStart = current.getStart();
    Assert.assertEquals(expectedStart, actualStart);
    expectedStart = actualStart + current.getLength();
  }

  // Nothing may remain in the input file past the end of the last split.
  SequenceFile.Reader listingReader
          = new SequenceFile.Reader(cluster.getFileSystem().getConf(),
                  SequenceFile.Reader.file(listFile));

  try {
    listingReader.seek(expectedStart);
    CopyListingFileStatus valueHolder = new CopyListingFileStatus();
    Text keyHolder = new Text();
    boolean hasMore = listingReader.next(keyHolder, valueHolder);
    Assert.assertFalse(hasMore);
  } finally {
    IOUtils.closeStream(listingReader);
  }
}

开发者ID:naver，项目名称:hadoop，代码行数:27，代码来源:TestUniformSizeInputFormat.java

注：本文中的org.apache.hadoop.mapreduce.lib.input.FileSplit类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。