This article collects typical usage examples of the org.apache.hadoop.mapreduce.InputSplit class in Java. If you are wondering how the Java InputSplit class is used in practice, or what real InputSplit code looks like, the curated class examples below should help.
The InputSplit class belongs to the org.apache.hadoop.mapreduce package. A total of 15 code examples of the InputSplit class are shown below, ordered by popularity.
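Before the examples, here is a minimal, self-contained sketch of where InputSplit typically appears in a custom InputFormat: getSplits() builds the list of splits that the framework later hands back to createRecordReader(). The SingleFileInputFormat class and the hard-coded input path are hypothetical, used only to illustrate the API shape.
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical InputFormat that always reads one fixed file as a single split.
public class SingleFileInputFormat extends TextInputFormat {
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    Path file = new Path("/data/input.txt"); // illustrative path, not from the examples below
    long length = file.getFileSystem(context.getConfiguration())
        .getFileStatus(file).getLen();
    // One split covering the whole file, with no preferred hosts.
    return Collections.<InputSplit>singletonList(
        new FileSplit(file, 0, length, new String[0]));
  }
}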
Example 1: addCreatedSplit
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private void addCreatedSplit(List<InputSplit> splitList,
Collection<String> locations,
ArrayList<OneBlockInfo> validBlocks) {
// create an input split
Path[] fl = new Path[validBlocks.size()];
long[] offset = new long[validBlocks.size()];
long[] length = new long[validBlocks.size()];
for (int i = 0; i < validBlocks.size(); i++) {
fl[i] = validBlocks.get(i).onepath;
offset[i] = validBlocks.get(i).offset;
length[i] = validBlocks.get(i).length;
}
// add this split to the list that is returned
CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
length, locations.toArray(new String[0]));
splitList.add(thissplit);
}
Example 2: XMLRecordReader
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* Initialize the input resource and related parameters; this could also be done in the initialize() method.
* @param inputSplit
* @param context
* @throws IOException
*/
public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
/**
* Get the start and end tags passed in via the configuration
*/
startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
FileSplit fileSplit = (FileSplit) inputSplit;
/**
* Get the start and end positions of this split
*/
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(context);
/**
* Open an HDFS input stream for the split's file
*/
fsin = fs.open(fileSplit.getPath());
/**
* Seek to the start of the split
*/
fsin.seek(start);
}
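A reader like this is normally handed out by a custom InputFormat. The sketch below shows one way to wire it up; the XMLInputFormat wrapper and the LongWritable/Text key-value types are assumptions, since the snippet above only shows the reader's constructor.
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical wrapper that returns the XMLRecordReader shown above for each split.
public class XMLInputFormat extends FileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    // The reader does its setup in the constructor, so there is nothing left to defer to initialize().
    return new XMLRecordReader(split, context.getConfiguration());
  }
}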
Example 3: createSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
int numMaps = getNumMapTasks(jobContext.getConfiguration());
final int nSplits = Math.min(numMaps, chunks.size());
List<InputSplit> splits = new ArrayList<>(nSplits);
for (int i = 0; i < nSplits; ++i) {
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
chunks.get(i).assignTo(taskId);
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
// Setting non-zero length for FileSplit size, to avoid a possible
// future when 0-sized file-splits are considered "empty" and skipped
// over.
getMinRecordsPerChunk(jobContext.getConfiguration()), null));
}
ConfigurationUtil.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
return splits;
}
Example 4: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
List<InputSplit> allSplits = new ArrayList<InputSplit>();
Scan originalScan = getScan();
Scan[] scans = rowKeyDistributor.getDistributedScans(originalScan);
for (Scan scan : scans) {
// Internally, super.getSplits(...) uses the scan object stored in a private variable;
// to reuse the superclass code we swap in each distributed scan before delegating.
setScan(scan);
List<InputSplit> splits = super.getSplits(context);
allSplits.addAll(splits);
}
// Setting original scan back
setScan(originalScan);
return allSplits;
}
Example 5: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
int numMappers = ConfigurationHelper.getJobNumMaps(job);
String boundaryQuery = getDBConf().getInputBoundingQuery();
// Fall back to the base class if dataslice-aligned import is not requested,
// this is not a table-based extract, a boundary query has been supplied,
// or only one mapper is used.
if (!getConf().getBoolean(
NetezzaManager.NETEZZA_DATASLICE_ALIGNED_ACCESS_OPT, false)
|| getDBConf().getInputTableName() == null
|| numMappers == 1
|| (boundaryQuery != null && !boundaryQuery.isEmpty())) {
return super.getSplits(job);
}
// Generate a splitter that splits only on datasliceid. It is an
// integer split. We will just use the lower bounding query to specify
// the restriction of dataslice and set the upper bound to a constant
NetezzaDBDataSliceSplitter splitter = new NetezzaDBDataSliceSplitter();
return splitter.split(getConf(), null, null);
}
Author: aliyun, Project: aliyun-maxcompute-data-collectors, Lines of code: 27, Source file: NetezzaDataDrivenDBInputFormat.java
Example 6: testSplitLocationInfo
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Test
public void testSplitLocationInfo() throws Exception {
Configuration conf = getConfiguration();
conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
"test:///a1/a2");
Job job = Job.getInstance(conf);
TextInputFormat fileInputFormat = new TextInputFormat();
List<InputSplit> splits = fileInputFormat.getSplits(job);
String[] locations = splits.get(0).getLocations();
Assert.assertEquals(2, locations.length);
SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
Assert.assertEquals(2, locationInfo.length);
SplitLocationInfo localhostInfo = locations[0].equals("localhost") ?
locationInfo[0] : locationInfo[1];
SplitLocationInfo otherhostInfo = locations[0].equals("otherhost") ?
locationInfo[0] : locationInfo[1];
Assert.assertTrue(localhostInfo.isOnDisk());
Assert.assertTrue(localhostInfo.isInMemory());
Assert.assertTrue(otherhostInfo.isOnDisk());
Assert.assertFalse(otherhostInfo.isInMemory());
}
Example 7: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
// Set the max split size based on the number of map tasks we want.
long numTasks = getNumMapTasks(job);
long numFileBytes = getJobSize(job);
long maxSplitSize = numFileBytes / numTasks;
setMaxSplitSize(maxSplitSize);
LOG.debug("Target numMapTasks=" + numTasks);
LOG.debug("Total input bytes=" + numFileBytes);
LOG.debug("maxSplitSize=" + maxSplitSize);
List<InputSplit> splits = super.getSplits(job);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated splits:");
for (InputSplit split : splits) {
LOG.debug(" " + split);
}
}
return splits;
}
Example 8: split
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public List<InputSplit> split(Configuration conf, ResultSet results,
String colName) {
// For each mapper we add a split whose lower-bound WHERE clause is
// "datasliceid % numSplits = <mapper index>".
// Only the lower bound carries the real restriction; for the upper bound
// we supply a constant clause that always evaluates to true.
int numSplits = ConfigurationHelper.getConfNumMaps(conf);
List<InputSplit> splitList = new ArrayList<InputSplit>(numSplits);
for (int i = 0; i < numSplits; ++i) {
StringBuilder lowerBoundClause = new StringBuilder(128);
lowerBoundClause.append(" datasliceid % ").append(numSplits)
.append(" = ").append(i);
splitList.add(new DataDrivenDBInputSplit(lowerBoundClause.toString(),
"1 = 1"));
}
return splitList;
}
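To make the generated conditions concrete: with numSplits = 4, the four splits carry the lower-bound clauses "datasliceid % 4 = 0" through "datasliceid % 4 = 3", each paired with the always-true upper bound "1 = 1", so every data slice is read by exactly one mapper. (The value 4 is only an illustrative choice.)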
Example 9: getSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/** Partitions the summation into parts and then returns them as splits. */
@Override
public List<InputSplit> getSplits(JobContext context) {
//read sigma from conf
final Configuration conf = context.getConfiguration();
final Summation sigma = SummationWritable.read(DistSum.class, conf);
final int nParts = conf.getInt(N_PARTS, 0);
//create splits
final List<InputSplit> splits = new ArrayList<InputSplit>(nParts);
final Summation[] parts = sigma.partition(nParts);
for(int i = 0; i < parts.length; ++i) {
splits.add(new SummationSplit(parts[i]));
//LOG.info("parts[" + i + "] = " + parts[i]);
}
return splits;
}
Example 10: readFields
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* {@inheritDoc}
* @throws IOException If the child InputSplit cannot be read, typically
* for failing access checks.
*/
@SuppressWarnings("unchecked") // Generic array assignment
public void readFields(DataInput in) throws IOException {
int card = WritableUtils.readVInt(in);
if (splits == null || splits.length != card) {
splits = new InputSplit[card];
}
Class<? extends InputSplit>[] cls = new Class[card];
try {
for (int i = 0; i < card; ++i) {
cls[i] =
Class.forName(Text.readString(in)).asSubclass(InputSplit.class);
}
for (int i = 0; i < card; ++i) {
splits[i] = ReflectionUtils.newInstance(cls[i], null);
SerializationFactory factory = new SerializationFactory(conf);
Deserializer deserializer = factory.getDeserializer(cls[i]);
deserializer.open((DataInputStream)in);
splits[i] = (InputSplit)deserializer.deserialize(splits[i]);
}
} catch (ClassNotFoundException e) {
throw new IOException("Failed split init", e);
}
}
Example 11: getNewFileSplits
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
/**
* Solve the schedule and modify the FileSplit array to reflect the new
* schedule. It will move placed splits to the front and unplaceable splits
* to the end.
* @return a new list of FileSplits that are modified to have the
* best host as the only host.
* @throws IOException
*/
public List<InputSplit> getNewFileSplits() throws IOException {
solve();
FileSplit[] result = new FileSplit[realSplits.length];
int left = 0;
int right = realSplits.length - 1;
for(int i=0; i < splits.length; ++i) {
if (splits[i].isAssigned) {
// copy the split and fix up the locations
String[] newLocations = {splits[i].locations.get(0).hostname};
realSplits[i] = new FileSplit(realSplits[i].getPath(),
realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
result[left++] = realSplits[i];
} else {
result[right--] = realSplits[i];
}
}
List<InputSplit> ret = new ArrayList<InputSplit>();
for (FileSplit fs : result) {
ret.add(fs);
}
return ret;
}
Example 12: createRecordReader
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Override
public RecordReader<LongWritable, LongWritable> createRecordReader(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
int taskId = context.getTaskAttemptID().getTaskID().getId();
int numMapTasks = context.getConfiguration().getInt(NUM_MAPS_KEY, NUM_MAPS);
int numIterations = context.getConfiguration().getInt(NUM_IMPORT_ROUNDS_KEY, NUM_IMPORT_ROUNDS);
int iteration = context.getConfiguration().getInt(ROUND_NUM_KEY, 0);
taskId = taskId + iteration * numMapTasks;
numMapTasks = numMapTasks * numIterations;
long chainId = Math.abs(new Random().nextLong());
chainId = chainId - (chainId % numMapTasks) + taskId; // ensure that chainId is unique per task and across iterations
LongWritable[] keys = new LongWritable[] {new LongWritable(chainId)};
return new FixedRecordReader<LongWritable, LongWritable>(keys, keys);
}
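To see why the chainId adjustment works, take an illustrative case: with an effective numMapTasks of 8 and an adjusted taskId of 3, a random value of 1234567 becomes 1234567 - (1234567 % 8) + 3 = 1234563, which leaves remainder 3 when divided by 8. Each task therefore gets a chainId in its own residue class, which keeps the ids distinct across tasks and iterations. (The numbers are illustrative only.)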
Example 13: map
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
public void map(LongWritable key, Record val, Context context)
throws IOException, InterruptedException{
try {
odpsImpl.parse(val);
context.write(odpsImpl, NullWritable.get());
} catch (Exception e) {
LOG.error("Exception raised during data export");
LOG.error("Exception: ", e);
LOG.error("On input: " + val);
LOG.error("At position " + key);
InputSplit is = context.getInputSplit();
LOG.error("");
LOG.error("Currently processing split:");
LOG.error(is);
LOG.error("");
LOG.error("This issue might not necessarily be caused by current input");
LOG.error("due to the batching nature of export.");
LOG.error("");
throw new IOException("Can't export data, please check failed map task logs", e);
}
}
Example 14: testRandomLocation
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
private void testRandomLocation(int locations, int njobs,
UserGroupInformation ugi) throws Exception {
Configuration configuration = new Configuration();
DebugJobProducer jobProducer = new DebugJobProducer(njobs, configuration);
Configuration jconf = GridmixTestUtils.mrvl.getConfig();
jconf.setInt(JobCreator.SLEEPJOB_RANDOM_LOCATIONS, locations);
JobStory story;
int seq = 1;
while ((story = jobProducer.getNextJob()) != null) {
GridmixJob gridmixJob = JobCreator.SLEEPJOB.createGridmixJob(jconf, 0,
story, new Path("ignored"), ugi, seq++);
gridmixJob.buildSplits(null);
List<InputSplit> splits = new SleepJob.SleepInputFormat()
.getSplits(gridmixJob.getJob());
for (InputSplit split : splits) {
assertEquals(locations, split.getLocations().length);
}
}
jobProducer.close();
}
Example 15: testNumInputFilesRecursively
import org.apache.hadoop.mapreduce.InputSplit; // import the required package/class
@Test
public void testNumInputFilesRecursively() throws Exception {
Configuration conf = getConfiguration();
conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
Job job = Job.getInstance(conf);
FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
List<InputSplit> splits = fileInputFormat.getSplits(job);
Assert.assertEquals("Input splits are not correct", 3, splits.size());
verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
"test:/a1/file1"), splits);
// Using the deprecated configuration
conf = getConfiguration();
conf.set("mapred.input.dir.recursive", "true");
job = Job.getInstance(conf);
splits = fileInputFormat.getSplits(job);
verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
"test:/a1/file1"), splits);
}