This article collects typical usage examples of the Java method org.apache.hadoop.mapred.InputFormat.getSplits. If you have been wondering what InputFormat.getSplits does, how to call it, or where to find examples, the curated code samples below may help. You can also explore further usage examples of its enclosing class, org.apache.hadoop.mapred.InputFormat.
The following presents 5 code examples of InputFormat.getSplits, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
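Before the examples, here is a minimal, self-contained sketch of the call itself: configure a JobConf, obtain the InputFormat, and request splits. The input path and the split-count hint used here are illustrative, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsDemo {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // hypothetical path
    InputFormat<?, ?> format = job.getInputFormat();
    // The second argument is only a hint for the desired number of splits.
    InputSplit[] splits = format.getSplits(job, 4);
    for (InputSplit split : splits) {
      System.out.println(split + " length=" + split.getLength());
    }
  }
}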
Example 1: getSample
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      // Stop once this split has contributed its share of the samples.
      if ((i + 1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
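This method matches the getSample of Hadoop's InputSampler.SplitSampler. For context, a caller might invoke it roughly as follows; this is a hedged sketch, with a hypothetical input path and illustrative sampler parameters (100 samples, at most 10 splits):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;

JobConf job = new JobConf();
job.setNumMapTasks(4); // getSample uses this as its split-count hint
FileInputFormat.setInputPaths(job, new Path("/tmp/seq-input")); // hypothetical path
InputFormat<Text, Text> inf = new SequenceFileInputFormat<Text, Text>();
InputSampler.SplitSampler<Text, Text> sampler =
    new InputSampler.SplitSampler<Text, Text>(100, 10); // numSamples, maxSplitsSampled
// Assign to Object[]: toArray() returns Object[] at runtime, so assigning
// to Text[] would throw a ClassCastException.
Object[] sample = sampler.getSample(inf, job);
System.out.println("collected " + sample.length + " sample keys");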
Example 2: init
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
@Override
public void init(@Nonnull Context context) {
  logger = context.jetInstance().getHazelcastInstance().getLoggingService().getLogger(ReadHdfsP.class);
  try {
    int totalParallelism = context.totalParallelism();
    InputFormat inputFormat = jobConf.getInputFormat();
    // Ask the InputFormat for at least one split per parallel processor.
    InputSplit[] splits = inputFormat.getSplits(jobConf, totalParallelism);
    IndexedInputSplit[] indexedInputSplits = new IndexedInputSplit[splits.length];
    Arrays.setAll(indexedInputSplits, i -> new IndexedInputSplit(i, splits[i]));
    Address[] addrs = context.jetInstance().getCluster().getMembers()
                             .stream().map(Member::getAddress).toArray(Address[]::new);
    assigned = assignSplitsToMembers(indexedInputSplits, addrs);
    printAssignments(assigned);
  } catch (IOException e) {
    throw rethrow(e);
  }
}
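The assignSplitsToMembers helper is not shown above. As a hedged sketch only (Jet's real implementation also weighs split locality against the member addresses), a minimal stand-in could round-robin the splits across members; IndexedInputSplit is the wrapper defined by ReadHdfsP, and the Address package name is assumed from older Hazelcast versions:

import com.hazelcast.nio.Address; // package name assumed; newer versions use com.hazelcast.cluster.Address
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical, simplified stand-in for assignSplitsToMembers:
// round-robin the indexed splits across the member addresses.
private static Map<Address, List<IndexedInputSplit>> assignSplitsRoundRobin(
    IndexedInputSplit[] splits, Address[] addrs) {
  Map<Address, List<IndexedInputSplit>> assigned = new HashMap<>();
  for (int i = 0; i < splits.length; i++) {
    assigned.computeIfAbsent(addrs[i % addrs.length], a -> new ArrayList<>())
            .add(splits[i]);
  }
  return assigned;
}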
Example 3: splitInput
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  // Copy the table properties and Hive config overrides into the job.
  for (final Object obj : properties.keySet()) {
    job.set((String) obj, (String) properties.get(obj));
  }
  for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }
  // Instantiate the InputFormat named in the storage descriptor via reflection.
  InputFormat<?, ?> format = (InputFormat<?, ?>)
      Class.forName(sd.getInputFormat()).getConstructor().newInstance();
  job.setInputFormat(format.getClass());
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);
  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }
  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp != null) {
    final long numRows = Long.valueOf(numRowsProp);
    // Starting from hive-0.13, when no statistics are available, this property is set to -1.
    // It's important to note that the value returned by Hive may not be up to date.
    if (numRows > 0) {
      rowCount += numRows;
    }
  }
}
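The reflection step above is the part most likely to trip people up; in isolation it looks like this (the class-name literal is a stand-in for the value of sd.getInputFormat() from the Hive StorageDescriptor):

// Sketch of the reflection pattern in isolation: instantiate an
// InputFormat from its class name and cast it.
InputFormat<?, ?> format = (InputFormat<?, ?>) Class
    .forName("org.apache.hadoop.mapred.TextInputFormat")
    .getConstructor()
    .newInstance();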
Example 4: getSplits
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
private InputSplit[] getSplits() throws IOException {
  InputFormat<Key, Row> inputFormat = this.gfxdManager.getInputFormat();
  try {
    return inputFormat.getSplits(this.jobConf, 1);
  } catch (FileNotFoundException fnfe) {
    // Rethrow with a message pointing at the likely misconfiguration.
    throw new FileNotFoundException(
        "Table "
            + this.gfxdManager.getTable()
            + " not found. "
            + "The LOCATION string may contain an incorrect value for one or more of the following: "
            + "1. Path to HDFSSTORE (homeDir), 2. Schema name or 3. Table name. "
            + GemFireXDManager.LOCATION_FORMAT);
  }
}
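Note that the second argument to getSplits is only a hint, so passing 1 here does not guarantee a single split. A quick check, assuming a configured jobConf and inputFormat as above:

// The numSplits argument is a hint, not a contract: even with a hint of 1,
// an InputFormat may return, for example, one split per HDFS block.
InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
System.out.println("requested 1 split, got " + splits.length);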
Example 5: addSplitsForGroup
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf,
    InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits,
    TableDesc table, List<InputSplit> result) throws IOException {
  Utilities.copyTablePropertiesToConf(table, conf);
  if (tableScan != null) {
    pushFilters(conf, tableScan);
  }
  FileInputFormat.setInputPaths(conf, dirs.toArray(new Path[dirs.size()]));
  conf.setInputFormat(inputFormat.getClass());
  int headerCount = 0;
  int footerCount = 0;
  if (table != null) {
    headerCount = Utilities.getHeaderCount(table);
    footerCount = Utilities.getFooterCount(table, conf);
    if (headerCount != 0 || footerCount != 0) {
      // Input file has a header or footer, so it cannot be split.
      conf.setLong(
          ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"),
          Long.MAX_VALUE);
    }
  }
  InputSplit[] iss = inputFormat.getSplits(conf, splits);
  for (InputSplit is : iss) {
    result.add(new HiveInputSplit(is, inputFormatClass.getName()));
  }
}
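The ShimLoader lookup above resolves the minimum-split-size configuration key for whichever Hadoop version is on the classpath. Without the shim layer, the same effect can be achieved by setting the key directly; the key name differs across Hadoop versions:

// Equivalent effect without the shim layer: force one split per file by
// raising the minimum split size. "mapred.min.split.size" is the pre-Hadoop-2
// key; Hadoop 2+ uses "mapreduce.input.fileinputformat.split.minsize".
conf.setLong("mapreduce.input.fileinputformat.split.minsize", Long.MAX_VALUE);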