本文整理汇总了Java中org.apache.hadoop.mapreduce.lib.input.FileSplit类的典型用法代码示例。如果您正苦于以下问题:Java FileSplit类的具体用法?Java FileSplit怎么用?Java FileSplit使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
FileSplit类属于org.apache.hadoop.mapreduce.lib.input包,在下文中一共展示了FileSplit类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: interleaveSplitFastq
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
示例2: interleaveSplitFastq
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
示例3: map
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
* {@inheritDoc}
*/
protected void map(final Object key, final OrcStruct value, final Context context) throws IOException, InterruptedException {
if (value!= null && value.toString() != null && value.toString().isEmpty()) {
return;
}
// Mapper sends data with parent directory path as keys to retain directory structure
final FileSplit fileSplit = (FileSplit) context.getInputSplit();
final Path filePath = fileSplit.getPath();
final String parentFilePath = String.format("%s/", filePath.getParent().toString());
log.debug("Parent file path {}", parentFilePath);
if (!fileSizesMap.containsKey(filePath.toString())) {
if (fileSystem == null){
final URI uri = URI.create(filePath.toString());
fileSystem = FileSystem.get(uri, configuration);
}
final FileStatus[] listStatuses = fileSystem.listStatus(filePath);
for (FileStatus fileStatus : listStatuses) {
if (!fileStatus.isDirectory()) {
fileSizesMap.put(fileStatus.getPath().toString(), fileStatus.getLen());
log.info("Entry added to fileSizes Map {} {}", fileStatus.getPath().toString(), fileStatus.getLen());
}
}
}
final Text parentFilePathKey = new Text(parentFilePath);
final Text filePathKey = new Text(filePath.toString());
final OrcValue orcValue = new OrcValue();
orcValue.value = value;
final Long fileSize = fileSizesMap.get(filePath.toString());
if (fileSize < threshold) {
context.write(parentFilePathKey, orcValue);
} else {
context.write(filePathKey, orcValue);
}
}
示例4: splitFastq
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
Path fqpath = new Path(fqPath);
String fqname = fqpath.getName();
String[] ns = fqname.split("\\.");
//TODO: Handle also compressed files
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
splitRDD.foreach( split -> {
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
});
}
示例5: interleaveSplitFastq
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
String[] ns = fst.getPath().getName().split("\\.");
//TODO: Handle also compressed files
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
示例6: XMLRecordReader
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
* 初始化读取资源以及相关的参数也可以放到initialize()方法中去执行
* @param inputSplit
* @param context
* @throws IOException
*/
public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
/**
* 获取开传入的开始和结束标签
*/
startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
FileSplit fileSplit = (FileSplit) inputSplit;
/**
* 获取分片的开始位置和结束的位置
*/
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(context);
/**
* 根据分片打开一个HDFS的文件输入流
*/
fsin = fs.open(fileSplit.getPath());
/**
* 定位到分片开始的位置
*/
fsin.seek(start);
}
示例7: createSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
int numMaps = getNumMapTasks(jobContext.getConfiguration());
final int nSplits = Math.min(numMaps, chunks.size());
List<InputSplit> splits = new ArrayList<>(nSplits);
for (int i = 0; i < nSplits; ++i) {
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
chunks.get(i).assignTo(taskId);
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
// Setting non-zero length for FileSplit size, to avoid a possible
// future when 0-sized file-splits are considered "empty" and skipped
// over.
getMinRecordsPerChunk(jobContext.getConfiguration()), null));
}
ConfigurationUtil.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
return splits;
}
示例8: setup
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
keyColName = conf.get(MergeJob.MERGE_KEY_COL_KEY);
InputSplit is = context.getInputSplit();
FileSplit fs = (FileSplit) is;
Path splitPath = fs.getPath();
if (splitPath.toString().startsWith(
conf.get(MergeJob.MERGE_NEW_PATH_KEY))) {
this.isNew = true;
} else if (splitPath.toString().startsWith(
conf.get(MergeJob.MERGE_OLD_PATH_KEY))) {
this.isNew = false;
} else {
throw new IOException("File " + splitPath + " is not under new path "
+ conf.get(MergeJob.MERGE_NEW_PATH_KEY) + " or old path "
+ conf.get(MergeJob.MERGE_OLD_PATH_KEY));
}
}
示例9: testMaxBlockLocationsNewSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
@Test
public void testMaxBlockLocationsNewSplits() throws Exception {
TEST_DIR.mkdirs();
try {
Configuration conf = new Configuration();
conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
Path submitDir = new Path(TEST_DIR.getAbsolutePath());
FileSystem fs = FileSystem.getLocal(conf);
FileSplit split = new FileSplit(new Path("/some/path"), 0, 1,
new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
JobSplitWriter.createSplitFiles(submitDir, conf, fs,
new FileSplit[] { split });
JobSplit.TaskSplitMetaInfo[] infos =
SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf,
submitDir);
assertEquals("unexpected number of splits", 1, infos.length);
assertEquals("unexpected number of split locations",
4, infos[0].getLocations().length);
} finally {
FileUtil.fullyDelete(TEST_DIR);
}
}
示例10: testMaxBlockLocationsOldSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
TEST_DIR.mkdirs();
try {
Configuration conf = new Configuration();
conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
Path submitDir = new Path(TEST_DIR.getAbsolutePath());
FileSystem fs = FileSystem.getLocal(conf);
org.apache.hadoop.mapred.FileSplit split =
new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1,
new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
JobSplitWriter.createSplitFiles(submitDir, conf, fs,
new org.apache.hadoop.mapred.InputSplit[] { split });
JobSplit.TaskSplitMetaInfo[] infos =
SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf,
submitDir);
assertEquals("unexpected number of splits", 1, infos.length);
assertEquals("unexpected number of split locations",
4, infos[0].getLocations().length);
} finally {
FileUtil.fullyDelete(TEST_DIR);
}
}
示例11: getSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
if (job == lastContext) {
return lastResult;
}
long t1, t2, t3;
t1 = System.currentTimeMillis();
lastContext = job;
lastResult = super.getSplits(job);
t2 = System.currentTimeMillis();
System.out.println("Spent " + (t2 - t1) + "ms computing base-splits.");
if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
TeraScheduler scheduler = new TeraScheduler(
lastResult.toArray(new FileSplit[0]), job.getConfiguration());
lastResult = scheduler.getNewFileSplits();
t3 = System.currentTimeMillis();
System.out.println("Spent " + (t3 - t2) + "ms computing TeraScheduler splits.");
}
return lastResult;
}
示例12: TeraScheduler
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
public TeraScheduler(FileSplit[] realSplits,
Configuration conf) throws IOException {
this.realSplits = realSplits;
this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
Map<String, Host> hostTable = new HashMap<String, Host>();
splits = new Split[realSplits.length];
for(FileSplit realSplit: realSplits) {
Split split = new Split(realSplit.getPath().toString());
splits[remainingSplits++] = split;
for(String hostname: realSplit.getLocations()) {
Host host = hostTable.get(hostname);
if (host == null) {
host = new Host(hostname);
hostTable.put(hostname, host);
hosts.add(host);
}
host.splits.add(split);
split.locations.add(host);
}
}
}
示例13: getNewFileSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
/**
* Solve the schedule and modify the FileSplit array to reflect the new
* schedule. It will move placed splits to front and unplacable splits
* to the end.
* @return a new list of FileSplits that are modified to have the
* best host as the only host.
* @throws IOException
*/
public List<InputSplit> getNewFileSplits() throws IOException {
solve();
FileSplit[] result = new FileSplit[realSplits.length];
int left = 0;
int right = realSplits.length - 1;
for(int i=0; i < splits.length; ++i) {
if (splits[i].isAssigned) {
// copy the split and fix up the locations
String[] newLocations = {splits[i].locations.get(0).hostname};
realSplits[i] = new FileSplit(realSplits[i].getPath(),
realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
result[left++] = realSplits[i];
} else {
result[right--] = realSplits[i];
}
}
List<InputSplit> ret = new ArrayList<InputSplit>();
for (FileSplit fs : result) {
ret.add(fs);
}
return ret;
}
示例14: createSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
private List<InputSplit> createSplits(JobContext jobContext,
List<DynamicInputChunk> chunks)
throws IOException {
int numMaps = getNumMapTasks(jobContext.getConfiguration());
final int nSplits = Math.min(numMaps, chunks.size());
List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
for (int i=0; i< nSplits; ++i) {
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
chunks.get(i).assignTo(taskId);
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
// Setting non-zero length for FileSplit size, to avoid a possible
// future when 0-sized file-splits are considered "empty" and skipped
// over.
getMinRecordsPerChunk(jobContext.getConfiguration()),
null));
}
DistCpUtils.publish(jobContext.getConfiguration(),
CONF_LABEL_NUM_SPLITS, splits.size());
return splits;
}
示例15: checkSplits
import org.apache.hadoop.mapreduce.lib.input.FileSplit; //导入依赖的package包/类
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
long lastEnd = 0;
//Verify if each split's start is matching with the previous end and
//we are not missing anything
for (InputSplit split : splits) {
FileSplit fileSplit = (FileSplit) split;
long start = fileSplit.getStart();
Assert.assertEquals(lastEnd, start);
lastEnd = start + fileSplit.getLength();
}
//Verify there is nothing more to read from the input file
SequenceFile.Reader reader
= new SequenceFile.Reader(cluster.getFileSystem().getConf(),
SequenceFile.Reader.file(listFile));
try {
reader.seek(lastEnd);
CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
Text srcRelPath = new Text();
Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
} finally {
IOUtils.closeStream(reader);
}
}