Java FileSplit類代碼示例

本文整理匯總了Java中org.apache.hadoop.mapreduce.lib.input.FileSplit類的典型用法代碼示例。如果您正苦於以下問題：Java FileSplit類的具體用法？Java FileSplit怎麼用？Java FileSplit使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。

FileSplit類屬於org.apache.hadoop.mapreduce.lib.input包，在下文中一共展示了FileSplit類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Java代碼示例。

示例1: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Computes line-based splits for two FASTQ files, pairs the splits positionally with
 * {@code zip}, and hands every pair to {@code writeInterleavedSplits}.
 *
 * <p>The output target for a pair is
 * {@code splitDir + "/" + <parent directory name of the first split> + "_" + <start offset of the first split> + ".fq"}.
 *
 * @param fst      status of the first FASTQ file
 * @param fst2     status of the second FASTQ file
 * @param splitDir directory prefix for the generated files
 * @param splitlen lines-per-split value passed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context used to parallelize the splits
 * @throws IOException declared by the signature (presumably from split computation)
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> firstSplits = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> secondSplits = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    // Pair the i-th split of the first file with the i-th split of the second file.
    JavaPairRDD<FileSplit, FileSplit> pairedSplits = sc.parallelize(firstSplits).zip(sc.parallelize(secondSplits));

    pairedSplits.foreach(pair -> {
        FileSplit left = pair._1;
        FileSplit right = pair._2;
        FastqRecordReader leftReader = new FastqRecordReader(new Configuration(), left);
        FastqRecordReader rightReader = new FastqRecordReader(new Configuration(), right);
        // The output name is derived from the first file's parent directory and split offset.
        String outputFile = splitDir + "/" + left.getPath().getParent().getName() + "_" + left.getStart() + ".fq";
        writeInterleavedSplits(leftReader, rightReader, new Configuration(), outputFile);
    });
}

開發者ID:NGSeq，項目名稱:ViraPipe，代碼行數:17，代碼來源:InterleaveMulti.java

示例2: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Builds N-line splits for a pair of FASTQ files and passes each positionally matched
 * pair of splits to {@code writeInterleavedSplits}.
 *
 * @param fst      status of the first FASTQ file
 * @param fst2     status of the second FASTQ file
 * @param splitDir directory prefix under which the per-split outputs are named
 * @param splitlen lines-per-split value forwarded to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context supplying the Hadoop configuration and the RDDs
 * @throws IOException declared by the signature (presumably from split computation)
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> forwardSplits = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> reverseSplits = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> forwardRdd = sc.parallelize(forwardSplits);
    JavaRDD<FileSplit> reverseRdd = sc.parallelize(reverseSplits);

    forwardRdd.zip(reverseRdd).foreach(splitPair -> {
        FastqRecordReader forwardReader = new FastqRecordReader(new Configuration(), splitPair._1);
        FastqRecordReader reverseReader = new FastqRecordReader(new Configuration(), splitPair._2);

        // Target name: <splitDir>/<parent dir of first split>_<start offset of first split>.fq
        StringBuilder target = new StringBuilder(splitDir);
        target.append("/")
              .append(splitPair._1.getPath().getParent().getName())
              .append("_")
              .append(splitPair._1.getStart())
              .append(".fq");

        writeInterleavedSplits(forwardReader, reverseReader, new Configuration(), target.toString());
    });
}

開發者ID:NGSeq，項目名稱:ViraPipe，代碼行數:18，代碼來源:Decompress.java

示例3: map

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * {@inheritDoc}
 *
 * <p>Wraps each ORC record in an {@code OrcValue} and emits it under a key chosen by the
 * size of the file the record came from: records from files smaller than {@code threshold}
 * are keyed by the parent directory (with a trailing {@code /}), all others by the file's
 * own path. File sizes are looked up once per file and cached in {@code fileSizesMap}.
 *
 * @param key     input key; not used by this method
 * @param value   the ORC record; a non-null record whose string form is empty is skipped
 * @param context mapper context supplying the input split and receiving the output
 * @throws IOException if the file system cannot be queried or the size of the current
 *                     file cannot be determined
 * @throws InterruptedException propagated from {@code context.write}
 */
protected void map(final Object key, final OrcStruct value, final Context context) throws IOException, InterruptedException {
    // Same skip condition as before, but toString() is evaluated only once.
    if (value != null) {
        final String rendered = value.toString();
        if (rendered != null && rendered.isEmpty()) {
            return;
        }
    }

    // Mapper sends data with parent directory path as keys to retain directory structure
    final FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final Path filePath = fileSplit.getPath();
    final String filePathString = filePath.toString();
    final String parentFilePath = String.format("%s/", filePath.getParent().toString());
    log.debug("Parent file path {}", parentFilePath);

    if (!fileSizesMap.containsKey(filePathString)) {
        if (fileSystem == null) {
            // Use the Path's own URI instead of re-parsing its string form, which
            // URI.create rejects when the path contains characters illegal in a URI.
            fileSystem = FileSystem.get(filePath.toUri(), configuration);
        }
        for (final FileStatus fileStatus : fileSystem.listStatus(filePath)) {
            if (!fileStatus.isDirectory()) {
                fileSizesMap.put(fileStatus.getPath().toString(), fileStatus.getLen());
                log.info("Entry added to fileSizes Map {} {}", fileStatus.getPath().toString(), fileStatus.getLen());
            }
        }
    }

    final Long fileSize = fileSizesMap.get(filePathString);
    if (fileSize == null) {
        // Previously this surfaced as a bare NullPointerException while unboxing.
        throw new IOException("Unable to determine the size of input file " + filePathString);
    }

    final OrcValue orcValue = new OrcValue();
    orcValue.value = value;

    if (fileSize < threshold) {
        context.write(new Text(parentFilePath), orcValue);
    } else {
        context.write(new Text(filePathString), orcValue);
    }
}

開發者ID:ExpediaInceCommercePlatform，項目名稱:dataSqueeze，代碼行數:43，代碼來源:OrcCompactionMapper.java

示例4: splitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Splits a FASTQ file into N-line splits and passes each split to {@code writeFastqFile},
 * targeting {@code splitDir + "/split_" + <split start offset> + "." + <suffix>}.
 *
 * <p>The suffix is the second dot-separated token of the file name (so {@code reads.fq}
 * gives {@code fq}). When the name contains no such token the suffix falls back to
 * {@code fq} instead of failing inside the Spark closure with an
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @param fst      status of the FASTQ file to split
 * @param fqPath   path of the FASTQ file; only its final name component is used
 * @param splitDir directory prefix for the generated split files
 * @param splitlen lines-per-split value passed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context used to parallelize the splits
 * @throws IOException declared by the signature (presumably from split computation)
 */
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  String fqname = new Path(fqPath).getName();
  String[] ns = fqname.split("\\.");
  // Resolve the suffix once on the driver so the closure captures a plain String and
  // cannot fail on names without a dot.
  final String suffix = ns.length > 1 ? ns[1] : "fq";
  //TODO: Handle also compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + suffix);
  });
}

開發者ID:NGSeq，項目名稱:ViraPipe，代碼行數:17，代碼來源:InterleaveMulti.java

示例5: interleaveSplitFastq

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Computes N-line splits for two FASTQ files, pairs them positionally with {@code zip},
 * and hands every pair to {@code writeInterleavedSplits} together with {@code splitDir}
 * and a file name of the form
 * {@code <parent directory name of the first split> + "_" + <start offset> + ".fq"}.
 *
 * @param fst      status of the first FASTQ file
 * @param fst2     status of the second FASTQ file
 * @param splitDir output directory passed through to {@code writeInterleavedSplits}
 * @param splitlen lines-per-split value passed to {@code NLineInputFormat.getSplitsForFile}
 * @param sc       Spark context used to parallelize the splits
 * @throws IOException declared by the signature (presumably from split computation)
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    // The previously computed file-name tokens were never used and have been removed.
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      String outputName = path.getParent().getName() + "_" + splits._1.getStart() + ".fq";
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, outputName);
    });
}

開發者ID:NGSeq，項目名稱:ViraPipe，代碼行數:19，代碼來源:DecompressInterleave.java

示例6: XMLRecordReader

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Creates a reader positioned at the start of the given file split.
 *
 * <p>Opening the input and reading the related parameters could equally be done in the
 * {@code initialize()} method.
 *
 * @param inputSplit the split to read; must be a {@code FileSplit}
 * @param context    configuration holding the start and end tags under
 *                   {@code START_TAG_KEY} and {@code END_TAG_KEY}
 * @throws IOException if the file cannot be opened or positioned at the split start
 * @throws IllegalArgumentException if either tag is missing from the configuration
 */
public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
    // Read the configured start and end tags; fail with a clear message rather than a
    // bare NullPointerException when one of them is not set.
    final String startTagText = context.get(START_TAG_KEY);
    final String endTagText = context.get(END_TAG_KEY);
    if (startTagText == null || endTagText == null) {
        throw new IllegalArgumentException(
            "Both " + START_TAG_KEY + " and " + END_TAG_KEY + " must be set in the configuration");
    }
    startTag = startTagText.getBytes("UTF-8");
    endTag = endTagText.getBytes("UTF-8");

    FileSplit fileSplit = (FileSplit) inputSplit;
    // Start and end byte offsets of this split
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    // Open an input stream on the split's file
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(context);
    fsin = fs.open(file);

    // Move to the beginning of the split. If that fails the caller never receives this
    // object, so the stream must be closed here to avoid leaking it.
    try {
        fsin.seek(start);
    } catch (IOException | RuntimeException e) {
        try {
            fsin.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}

開發者ID:lzmhhh123，項目名稱:Wikipedia-Index，代碼行數:30，代碼來源:XmlInputFormat.java

示例7: createSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Creates one {@code FileSplit} per chunk, for at most as many chunks as there are map
 * tasks, assigning each used chunk to the map task with the same index. The number of
 * splits is then published to the job configuration under {@code CONF_LABEL_NUM_SPLITS}.
 *
 * @param jobContext job context providing the configuration and job id
 * @param chunks     the available chunks; only the first {@code min(numMaps, chunks.size())} are used
 * @return the created splits, in chunk order
 * @throws IOException declared by the signature
 */
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
  final int mapTaskCount = getNumMapTasks(jobContext.getConfiguration());
  final int splitCount = Math.min(mapTaskCount, chunks.size());

  List<InputSplit> created = new ArrayList<>(splitCount);
  int mapIndex = 0;
  for (DynamicInputChunk chunk : chunks.subList(0, splitCount)) {
    chunk.assignTo(new TaskID(jobContext.getJobID(), TaskType.MAP, mapIndex));
    mapIndex++;
    // Setting non-zero length for FileSplit size, to avoid a possible
    // future when 0-sized file-splits are considered "empty" and skipped
    // over.
    created.add(new FileSplit(chunk.getPath(), 0,
        getMinRecordsPerChunk(jobContext.getConfiguration()), null));
  }

  ConfigurationUtil.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, created.size());
  return created;
}

開發者ID:HotelsDotCom，項目名稱:circus-train，代碼行數:19，代碼來源:DynamicInputFormat.java

示例8: setup

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Reads the merge key column name from the configuration and decides whether this
 * mapper's input split belongs to the "new" or the "old" dataset by comparing the split's
 * path string against the configured path prefixes.
 *
 * @param context mapper context supplying the configuration and the input split
 * @throws IOException if the split path starts with neither the new nor the old path
 * @throws InterruptedException declared by the signature
 */
@Override
protected void setup(Context context)
    throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  keyColName = conf.get(MergeJob.MERGE_KEY_COL_KEY);

  Path splitPath = ((FileSplit) context.getInputSplit()).getPath();
  String location = splitPath.toString();

  // The new path is checked first; the old path is consulted only when it does not match.
  String newPrefix = conf.get(MergeJob.MERGE_NEW_PATH_KEY);
  if (location.startsWith(newPrefix)) {
    this.isNew = true;
    return;
  }

  String oldPrefix = conf.get(MergeJob.MERGE_OLD_PATH_KEY);
  if (location.startsWith(oldPrefix)) {
    this.isNew = false;
    return;
  }

  throw new IOException("File " + splitPath + " is not under new path "
      + newPrefix + " or old path "
      + oldPrefix);
}

開發者ID:aliyun，項目名稱:aliyun-maxcompute-data-collectors，代碼行數:23，代碼來源:MergeMapperBase.java

示例9: testMaxBlockLocationsNewSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Writes split files for a single new-API {@code FileSplit} that lists five locations
 * while {@code MRConfig.MAX_BLOCK_LOCATIONS_KEY} is set to 4, then reads the split
 * meta-info back and expects one split with four locations.
 */
@Test
public void testMaxBlockLocationsNewSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
    FileSystem localFs = FileSystem.getLocal(conf);
    Path submitDir = new Path(TEST_DIR.getAbsolutePath());

    // One more location than the configured maximum.
    String[] hosts = { "loc1", "loc2", "loc3", "loc4", "loc5" };
    FileSplit[] toWrite = { new FileSplit(new Path("/some/path"), 0, 1, hosts) };
    JobSplitWriter.createSplitFiles(submitDir, conf, localFs, toWrite);

    JobSplit.TaskSplitMetaInfo[] metaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), localFs, conf, submitDir);

    assertEquals("unexpected number of splits", 1, metaInfos.length);
    int locationCount = metaInfos[0].getLocations().length;
    assertEquals("unexpected number of split locations", 4, locationCount);
  } finally {
    FileUtil.fullyDelete(TEST_DIR);
  }
}

開發者ID:naver，項目名稱:hadoop，代碼行數:23，代碼來源:TestJobSplitWriter.java

示例10: testMaxBlockLocationsOldSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Same scenario as the new-API test, using the old-API
 * {@code org.apache.hadoop.mapred.FileSplit}: five locations are supplied with the
 * maximum configured as 4, and the meta-info read back is expected to describe one split
 * with four locations.
 */
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
    FileSystem localFs = FileSystem.getLocal(conf);
    Path submitDir = new Path(TEST_DIR.getAbsolutePath());

    // One more location than the configured maximum.
    String[] hosts = { "loc1", "loc2", "loc3", "loc4", "loc5" };
    org.apache.hadoop.mapred.FileSplit oldApiSplit =
        new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1, hosts);
    org.apache.hadoop.mapred.InputSplit[] toWrite =
        new org.apache.hadoop.mapred.InputSplit[] { oldApiSplit };
    JobSplitWriter.createSplitFiles(submitDir, conf, localFs, toWrite);

    JobSplit.TaskSplitMetaInfo[] metaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), localFs, conf, submitDir);

    assertEquals("unexpected number of splits", 1, metaInfos.length);
    int locationCount = metaInfos[0].getLocations().length;
    assertEquals("unexpected number of split locations", 4, locationCount);
  } finally {
    FileUtil.fullyDelete(TEST_DIR);
  }
}

開發者ID:naver，項目名稱:hadoop，代碼行數:24，代碼來源:TestJobSplitWriter.java

示例11: getSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Returns the splits for the job, reusing the previous result when called again with the
 * very same {@code JobContext} instance. Otherwise the base splits are computed by the
 * superclass and, when {@code TeraScheduler.USE} is enabled (default {@code true}),
 * replaced by the splits returned from {@code TeraScheduler}. Timings for both phases are
 * printed to standard output.
 *
 * @param job the job context
 * @return the (possibly cached) list of input splits
 * @throws IOException propagated from split computation or scheduling
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  if (job != lastContext) {
    final long startedAt = System.currentTimeMillis();
    lastContext = job;
    lastResult = super.getSplits(job);
    final long baseDoneAt = System.currentTimeMillis();
    System.out.println("Spent " + (baseDoneAt - startedAt) + "ms computing base-splits.");

    if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
      FileSplit[] baseSplits = lastResult.toArray(new FileSplit[0]);
      TeraScheduler scheduler = new TeraScheduler(baseSplits, job.getConfiguration());
      lastResult = scheduler.getNewFileSplits();
      final long scheduledAt = System.currentTimeMillis();
      System.out.println("Spent " + (scheduledAt - baseDoneAt) + "ms computing TeraScheduler splits.");
    }
  }
  return lastResult;
}

開發者ID:naver，項目名稱:hadoop，代碼行數:21，代碼來源:TeraInputFormat.java

示例12: TeraScheduler

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Builds the scheduler's view of the given file splits: one {@code Split} per file split
 * and one {@code Host} per distinct location name, linked to each other in both
 * directions.
 *
 * @param realSplits the file splits to schedule
 * @param conf       configuration; {@code TTConfig.TT_MAP_SLOTS} gives the slots per host
 *                   (default 4)
 * @throws IOException propagated from {@code FileSplit.getLocations()}
 */
public TeraScheduler(FileSplit[] realSplits,
                     Configuration conf) throws IOException {
  this.realSplits = realSplits;
  this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
  splits = new Split[realSplits.length];

  Map<String, Host> knownHosts = new HashMap<>();
  for (int i = 0; i < realSplits.length; i++) {
    FileSplit fileSplit = realSplits[i];
    Split tracked = new Split(fileSplit.getPath().toString());
    splits[remainingSplits++] = tracked;

    for (String name : fileSplit.getLocations()) {
      // Register a host the first time its name is seen.
      if (!knownHosts.containsKey(name)) {
        Host created = new Host(name);
        knownHosts.put(name, created);
        hosts.add(created);
      }
      Host host = knownHosts.get(name);
      host.splits.add(tracked);
      tracked.locations.add(host);
    }
  }
}

開發者ID:naver，項目名稱:hadoop，代碼行數:22，代碼來源:TeraScheduler.java

示例13: getNewFileSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Solves the schedule and rewrites the file splits to reflect it. Assigned splits are
 * replaced by copies whose only location is the first host in the split's location list
 * and are placed at the front of the result; unassigned splits are placed at the end, in
 * reverse order of encounter.
 *
 * @return a new list of FileSplits in which every placed split has a single host
 * @throws IOException declared by the signature
 */
public List<InputSplit> getNewFileSplits() throws IOException {
  solve();

  final int total = realSplits.length;
  FileSplit[] ordered = new FileSplit[total];
  int head = 0;          // next free slot for a placed split
  int tail = total - 1;  // next free slot (from the back) for an unplaced split

  for (int idx = 0; idx < splits.length; ++idx) {
    if (!splits[idx].isAssigned) {
      ordered[tail--] = realSplits[idx];
      continue;
    }
    // Copy the split, keeping only the chosen host as its location.
    String[] singleHost = {splits[idx].locations.get(0).hostname};
    FileSplit source = realSplits[idx];
    FileSplit pinned = new FileSplit(source.getPath(),
        source.getStart(), source.getLength(), singleHost);
    realSplits[idx] = pinned;
    ordered[head++] = pinned;
  }

  List<InputSplit> ret = new ArrayList<InputSplit>();
  for (int idx = 0; idx < ordered.length; ++idx) {
    ret.add(ordered[idx]);
  }
  return ret;
}

開發者ID:naver，項目名稱:hadoop，代碼行數:31，代碼來源:TeraScheduler.java

示例14: createSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Turns the leading chunks into {@code FileSplit}s, one per map task, and assigns each
 * of those chunks to the map task with the matching index. The resulting split count is
 * published to the job configuration under {@code CONF_LABEL_NUM_SPLITS}.
 *
 * @param jobContext job context providing the configuration and job id
 * @param chunks     the available chunks; at most {@code numMaps} of them are used
 * @return the created splits, in chunk order
 * @throws IOException declared by the signature
 */
private List<InputSplit> createSplits(JobContext jobContext,
                                      List<DynamicInputChunk> chunks)
        throws IOException {
  final int mapTaskCount = getNumMapTasks(jobContext.getConfiguration());
  final int splitCount = Math.min(mapTaskCount, chunks.size());
  List<InputSplit> created = new ArrayList<InputSplit>(splitCount);

  int index = 0;
  while (index < splitCount) {
    DynamicInputChunk chunk = chunks.get(index);
    TaskID owner = new TaskID(jobContext.getJobID(), TaskType.MAP, index);
    chunk.assignTo(owner);
    // Setting non-zero length for FileSplit size, to avoid a possible
    // future when 0-sized file-splits are considered "empty" and skipped
    // over.
    FileSplit fileSplit = new FileSplit(chunk.getPath(), 0,
        getMinRecordsPerChunk(jobContext.getConfiguration()),
        null);
    created.add(fileSplit);
    ++index;
  }

  DistCpUtils.publish(jobContext.getConfiguration(),
                      CONF_LABEL_NUM_SPLITS, created.size());
  return created;
}

開發者ID:naver，項目名稱:hadoop，代碼行數:23，代碼來源:DynamicInputFormat.java

示例15: checkSplits

import org.apache.hadoop.mapreduce.lib.input.FileSplit; //導入依賴的package包/類
/**
 * Asserts that the given splits are contiguous (each split starts where the previous one
 * ended, beginning at offset 0) and that no record can be read from {@code listFile}
 * after the end of the last split.
 *
 * @param listFile the sequence file the splits were computed from
 * @param splits   the splits to verify; each must be a {@code FileSplit}
 * @throws IOException if the sequence file cannot be opened or read
 */
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
  long expectedStart = 0;

  // Every split must begin exactly at the previous split's end, so nothing is skipped.
  for (int i = 0; i < splits.size(); i++) {
    FileSplit current = (FileSplit) splits.get(i);
    long actualStart = current.getStart();
    Assert.assertEquals(expectedStart, actualStart);
    expectedStart = actualStart + current.getLength();
  }

  // After the last split there must be nothing left to read from the input file.
  SequenceFile.Reader reader = new SequenceFile.Reader(
      cluster.getFileSystem().getConf(), SequenceFile.Reader.file(listFile));

  try {
    reader.seek(expectedStart);
    CopyListingFileStatus statusHolder = new CopyListingFileStatus();
    Text pathHolder = new Text();
    boolean hasMore = reader.next(pathHolder, statusHolder);
    Assert.assertFalse(hasMore);
  } finally {
    IOUtils.closeStream(reader);
  }
}

開發者ID:naver，項目名稱:hadoop，代碼行數:27，代碼來源:TestUniformSizeInputFormat.java

注：本文中的org.apache.hadoop.mapreduce.lib.input.FileSplit類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。