本文整理汇总了Java中org.apache.hadoop.mapred.InputSplit类的典型用法代码示例。如果您正苦于以下问题：Java InputSplit类的具体用法？Java InputSplit怎么用？Java InputSplit使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
InputSplit类属于org.apache.hadoop.mapred包，在下文中一共展示了InputSplit类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getRecordReader
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Creates the record reader for one file split.
 *
 * <p>Opens the file behind {@code split} and passes the stream, the total file length and
 * the split's start/length to either the vectorized reader or the line reader, depending
 * on {@code HiveReaderSetting.isVectorMode()}. On success the open stream is handed to the
 * returned reader (presumably the reader closes it -- confirm against the reader classes).
 * If anything fails after the stream was opened, the stream is closed here before the
 * exception is propagated so the file handle does not leak.
 *
 * @param split the split to read; must be a {@code FileSplit}
 * @param job the job configuration used to resolve the file system and reader settings
 * @param reporter the Hadoop reporter, wrapped to publish the "Read file" status
 * @return a record reader over the requested split
 * @throws IOException if the file cannot be inspected or opened, or a reader cannot be built
 */
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  // FileSystem.getLength(Path) is deprecated; the file status carries the same length.
  long fileLength = fs.getFileStatus( path ).getLen();
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );
  try {
    IJobReporter jobReporter = new HadoopJobReporter( reporter );
    jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
    HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
    if ( hiveConfig.isVectorMode() ){
      IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
      return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
    }
    else{
      return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
    }
  } catch ( IOException | RuntimeException e ) {
    // No reader took the stream over, so it has to be released here.
    try {
      in.close();
    } catch ( IOException closeFailure ) {
      e.addSuppressed( closeFailure );
    }
    throw e;
  }
}
示例2: HiveTextRecordReader
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Builds a text record reader and derives the field delimiter and the projected column ids.
 *
 * <p>The delimiter is the first byte of the table's {@code field.delim} SerDe parameter.
 * When the parameter is missing or empty, the byte value 1 is used instead (an empty
 * string has no first byte and previously caused an
 * {@code ArrayIndexOutOfBoundsException}). The parameter is encoded as UTF-8 so the result
 * does not depend on the platform default charset.
 *
 * <p>{@code columnIds} receives the index of every table column whose name is contained in
 * {@code selectedColumnNames}; {@code numCols} is the total number of table columns.
 * (Both {@code tableColumns} and {@code selectedColumnNames} are presumably initialised by
 * the superclass constructor -- confirm.)
 *
 * @param table the Hive table whose storage descriptor supplies the SerDe parameters
 * @param partition the partition, forwarded to the superclass
 * @param inputSplit the split to read, forwarded to the superclass
 * @param projectedColumns the projected columns, forwarded to the superclass
 * @param context the fragment context, forwarded to the superclass
 * @throws ExecutionSetupException declared by this constructor; raised by the superclass
 *         constructor if at all -- nothing in this body throws it directly
 */
public HiveTextRecordReader(Table table, Partition partition, InputSplit inputSplit, List<SchemaPath> projectedColumns, FragmentContext context) throws ExecutionSetupException {
  super(table, partition, inputSplit, projectedColumns, context, null);
  String d = table.getSd().getSerdeInfo().getParameters().get("field.delim");
  if (d != null && !d.isEmpty()) {
    delimiter = d.getBytes(java.nio.charset.StandardCharsets.UTF_8)[0];
  } else {
    // No usable delimiter configured: fall back to byte value 1.
    delimiter = (byte) 1;
  }
  assert delimiter > 0;
  List<Integer> ids = Lists.newArrayList();
  for (int i = 0; i < tableColumns.size(); i++) {
    if (selectedColumnNames.contains(tableColumns.get(i))) {
      ids.add(i);
    }
  }
  columnIds = ids;
  numCols = tableColumns.size();
}
示例3: getSpecificScan
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Builds the sub-scan for one minor fragment.
 *
 * <p>For every input split mapped to the fragment this collects the serialized split, the
 * split's class name and the matching partition (looked up in {@code partitionMap}). If
 * any split has no partition, the whole partition list is replaced by {@code null} before
 * it is passed to the {@code HiveReadEntry}.
 *
 * @param minorFragmentId key into {@code mappings} selecting the fragment's splits
 * @return the {@code HiveSubScan} describing the fragment's work
 * @throws ExecutionSetupException wrapping any {@code IOException} or
 *         {@code ReflectiveOperationException} raised while preparing the splits
 */
@Override
public SubScan getSpecificScan(final int minorFragmentId) throws ExecutionSetupException {
  try {
    final List<InputSplit> fragmentSplits = mappings.get(minorFragmentId);
    final List<HivePartition> collectedPartitions = Lists.newArrayList();
    final List<String> serializedSplits = Lists.newArrayList();
    final List<String> splitClassNames = Lists.newArrayList();

    for (final InputSplit current : fragmentSplits) {
      // Splits without a partition contribute a null placeholder.
      collectedPartitions.add(
          partitionMap.get(current) == null ? null : new HivePartition(partitionMap.get(current)));
      serializedSplits.add(serializeInputSplit(current));
      splitClassNames.add(current.getClass().getName());
    }

    // A single missing partition invalidates the whole list.
    final List<HivePartition> partitionsOrNull =
        collectedPartitions.contains(null) ? null : collectedPartitions;

    final HiveReadEntry fragmentEntry =
        new HiveReadEntry(hiveReadEntry.table, partitionsOrNull, hiveReadEntry.hiveConfigOverride);
    return new HiveSubScan(getUserName(), serializedSplits, fragmentEntry, splitClassNames, columns);
  } catch (IOException | ReflectiveOperationException e) {
    throw new ExecutionSetupException(e);
  }
}
示例4: getScanStats
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Estimates the cost of this scan.
 *
 * <p>The data size is the sum of the lengths of all input splits. The row count comes from
 * {@code rowCount}; when that is 0 it is estimated as one row per 1024 bytes of data. The
 * CPU cost is the value of {@code getSerDeOverheadFactor()}.
 *
 * @return scan statistics flagged as {@code NO_EXACT_ROW_COUNT}
 * @throws DrillRuntimeException wrapping an {@code IOException} raised while reading a
 *         split length
 */
@Override
public ScanStats getScanStats() {
  try {
    long totalBytes = 0;
    for (final InputSplit current : inputSplits) {
      totalBytes += current.getLength();
    }

    final long statsRowCount = rowCount;
    // having a rowCount of 0 can mean the statistics were never computed
    final long estimatedRows = (statsRowCount == 0) ? totalBytes / 1024 : statsRowCount;

    // Hive's native reader is neither memory efficient nor fast. Increase the CPU cost
    // by a factor to let the planner choose HiveDrillNativeScan over HiveScan with SerDes.
    final float cpuCost = getSerDeOverheadFactor();

    logger.debug("estimated row count = {}, stats row count = {}", estimatedRows, rowCount);
    return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, estimatedRows, cpuCost, totalBytes);
  } catch (final IOException e) {
    throw new DrillRuntimeException(e);
  }
}
示例5: readUsingRecordReader
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Read from Geode, using MonarchRecordReader, all the records from the provided split.
 * The split contains the range of records to be read by the record reader. It
 * returns the total number of records read by this method.
 *
 * <p>All predicates are combined into a MUST_PASS_ALL filter list that is assigned to the
 * reader before it is initialized. Once the reader has been initialized it is always
 * closed, even when reading fails part-way. An {@code IOException} is printed rather than
 * propagated; in that case the number of records counted so far is returned.
 *
 * @param conf the reader configuration -- must have the region name
 * @param split the input-split containing the records to be read
 * @param predicates the predicates to filter out unwanted results
 * @return the total number of records read
 */
private long readUsingRecordReader(final Configuration conf, final InputSplit split,
    final Filter... predicates) {
  MonarchRecordReader mrr = new MonarchRecordReader(conf);
  FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
  for (final Filter predicate : predicates) {
    filterList.addFilter(predicate);
  }
  mrr.pushDownfilters = filterList;
  long size = 0;
  try {
    mrr.initialize(split, conf);
    try {
      Writable key = mrr.createKey();
      Writable value = mrr.createValue();
      while (mrr.next(key, value)) {
        ++size;
      }
    } finally {
      // Release the reader even if iteration fails; previously it leaked on error.
      mrr.close();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return size;
}
示例6: readFields
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * {@inheritDoc}
 *
 * <p>Reads the number of child splits, then the class name of every child, and only after
 * that the serialized fields of each child in the same order. The {@code splits} array is
 * reused when it already has the right length.
 *
 * @throws IOException If the child InputSplit cannot be read, typically
 *                     for failing access checks.
 */
@SuppressWarnings("unchecked") // Generic array assignment
public void readFields(DataInput in) throws IOException {
  final int childCount = WritableUtils.readVInt(in);
  if (splits == null || splits.length != childCount) {
    splits = new InputSplit[childCount];
  }

  // First pass: resolve the concrete class of every child split.
  final Class<? extends InputSplit>[] childTypes = new Class[childCount];
  try {
    int idx = 0;
    while (idx < childCount) {
      final String typeName = Text.readString(in);
      childTypes[idx] = Class.forName(typeName).asSubclass(InputSplit.class);
      idx++;
    }
  } catch (ClassNotFoundException e) {
    throw new IOException("Failed split init", e);
  }

  // Second pass: instantiate each child and let it read its own fields.
  for (int idx = 0; idx < childCount; idx++) {
    final InputSplit child = ReflectionUtils.newInstance(childTypes[idx], null);
    splits[idx] = child;
    child.readFields(in);
  }
}
示例7: getSample
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * From each split sampled, take the first numSamples / numSplits records.
 *
 * <p>At most {@code maxSplitsSampled} splits are visited, spread evenly over all splits of
 * the job. Reading of a split stops once the running record total reaches that split's
 * share of the sample budget. When there is nothing to sample (no splits, or a
 * non-positive {@code maxSplitsSampled}) an empty array is returned instead of failing
 * with a division by zero. Every record reader is closed, even if reading fails.
 *
 * @param inf the input format that provides the splits and the record readers
 * @param job the job configuration
 * @return the sampled keys
 * @throws IOException if the splits cannot be computed or a split cannot be read
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  if (splitsToSample <= 0) {
    // Nothing to sample; the divisions below would otherwise divide by zero.
    return (K[])samples.toArray();
  }
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    try {
      K key = reader.createKey();
      V value = reader.createValue();
      while (reader.next(key, value)) {
        samples.add(key);
        // A fresh key object is needed because the sampled one is now stored in the list.
        key = reader.createKey();
        ++records;
        if ((i+1) * samplesPerSplit <= records) {
          break;
        }
      }
    } finally {
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
示例8: testDBInputFormat
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Tests the DBInputFormat class: a record reader can be obtained for a split, and the
 * format divides the result into the requested number of chunks.
 *
 * @throws Exception if the driver setup or the input format fails
 */
@Test(timeout = 10000)
public void testDBInputFormat() throws Exception {
  final JobConf jobConf = new JobConf();
  setupDriver(jobConf);

  final DBInputFormat<NullDBWritable> inputFormat = new DBInputFormat<NullDBWritable>();
  // NOTE(review): setConf is deliberately left invoked twice here -- confirm whether the
  // repeated call is intentional or a copy/paste leftover.
  inputFormat.setConf(jobConf);
  inputFormat.setConf(jobConf);

  final RecordReader<LongWritable, NullDBWritable> recordReader = inputFormat.getRecordReader(
      new DBInputFormat.DBInputSplit(1, 10), jobConf, mock(Reporter.class));

  // Ask for three chunks and check both the first chunk's size and the chunk count.
  jobConf.setInt(MRJobConfig.NUM_MAPS, 3);
  final InputSplit[] chunks = inputFormat.getSplits(jobConf, 3);
  assertEquals(5, chunks[0].getLength());
  assertEquals(3, chunks.length);

  // A freshly created reader reports the key type and a zero position/progress.
  assertEquals(LongWritable.class, recordReader.createKey().getClass());
  assertEquals(0, recordReader.getPos());
  assertEquals(0, recordReader.getProgress(), 0.001);

  recordReader.close();
}
示例9: getRecordReader
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Picks a record reader by sniffing the first three bytes of the split's file.
 *
 * <p>If the file starts with the bytes {@code 'S','E','Q'} the sequence-file input format
 * creates the reader; otherwise, including when the file is shorter than three bytes, the
 * text input format does. Exactly one reader is created per call. Previously a short file
 * caused a text reader to be created in the EOF handler and then replaced by a second
 * one, leaving the first reader unclosed.
 *
 * @param split the split to read; must be a {@code FileSplit}
 * @param job the job configuration
 * @param reporter the reporter passed on to the delegate input format
 * @return the reader created by the matching delegate input format
 * @throws IOException if the file cannot be opened or read, or the delegate fails
 */
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  byte[] header = new byte[3];
  boolean headerComplete = false;
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
    headerComplete = true;
  } catch (EOFException ignored) {
    // Fewer than three bytes available: the file cannot carry the SEQ magic, so it is
    // treated as text below.
  } finally {
    is.close();
  }
  if (headerComplete && header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
示例10: readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Reads the encrypted workbook {@code excel2013encrypt.xlsx} in low footprint mode with
 * the password {@code test2} and expects exactly one split but no record reader. The
 * assertion message attributes the missing reader to an invalid password.
 *
 * @throws IOException if the input format fails unexpectedly
 */
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  final JobConf jobConf = new JobConf(defaultConf);
  final String workbookName = "excel2013encrypt.xlsx";
  final String workbookLocation = getClass().getClassLoader().getResource(workbookName).getFile();
  FileInputFormat.setInputPaths(jobConf, new Path(workbookLocation));

  // set locale to the one of the test data
  jobConf.set("hadoopoffice.read.locale.bcp47","de");
  // low footprint
  jobConf.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  jobConf.set("hadoopoffice.read.security.crypt.password","test2");

  final ExcelFileInputFormat excelFormat = new ExcelFileInputFormat();
  excelFormat.configure(jobConf);

  final InputSplit[] splits = excelFormat.getSplits(jobConf,1);
  assertEquals(1,splits.length,"Only one split generated for Excel file");

  final RecordReader<Text, ArrayWritable> excelReader = excelFormat.getRecordReader(splits[0], jobConf, reporter);
  assertNull(excelReader,"Null record reader implies invalid password");
}
示例11: testOpenClose
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Checks the open/close life cycle of {@code HadoopInputFormat} against mocked Hadoop
 * classes: opening requests exactly one record reader and creates one key and one value,
 * leaves {@code fetched} false, and closing closes the record reader exactly once.
 *
 * @throws Exception if the wrapper or the mocks fail unexpectedly
 */
@Test
public void testOpenClose() throws Exception {
  final DummyRecordReader readerMock = mock(DummyRecordReader.class);
  final DummyInputFormat formatMock = mock(DummyInputFormat.class);
  when(formatMock.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
      .thenReturn(readerMock);

  final HadoopInputFormat<String, Long> wrapper =
      new HadoopInputFormat<>(formatMock, String.class, Long.class, new JobConf());

  // Opening must fetch the reader once and prepare one key/value pair.
  wrapper.open(getHadoopInputSplit());
  // verify(mock) without a mode checks for exactly one invocation.
  verify(formatMock).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
  verify(readerMock).createKey();
  verify(readerMock).createValue();
  assertThat(wrapper.fetched, is(false));

  // Closing must be forwarded to the underlying reader.
  wrapper.close();
  verify(readerMock).close();
}
示例12: readExcelInputFormatExcel2013SingleSheetEncryptedNegative
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Reads the encrypted workbook {@code excel2013encrypt.xlsx} with the password
 * {@code test2} and expects exactly one split but no record reader. The assertion message
 * attributes the missing reader to an invalid password.
 *
 * @throws IOException if the input format fails unexpectedly
 */
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException {
  final JobConf jobConf = new JobConf(defaultConf);
  final String workbookName = "excel2013encrypt.xlsx";
  final String workbookLocation = getClass().getClassLoader().getResource(workbookName).getFile();
  FileInputFormat.setInputPaths(jobConf, new Path(workbookLocation));

  // set locale to the one of the test data
  jobConf.set("hadoopoffice.read.locale.bcp47","de");
  // for decryption simply set the password
  jobConf.set("hadoopoffice.read.security.crypt.password","test2");

  final ExcelFileInputFormat excelFormat = new ExcelFileInputFormat();
  excelFormat.configure(jobConf);

  final InputSplit[] splits = excelFormat.getSplits(jobConf,1);
  assertEquals(1, splits.length, "Only one split generated for Excel file");

  final RecordReader<Text, ArrayWritable> excelReader = excelFormat.getRecordReader(splits[0], jobConf, reporter);
  assertNull(excelReader, "Null record reader implies invalid password");
}
示例13: readEthereumBlockInputFormatBlock1346406Bzip2Compressed
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Reads the bzip2-compressed test file {@code testdata/eth1346406.bin.bz2} through
 * {@code EthereumBlockFileInputFormat} and expects one split that yields exactly one block
 * with 6 transactions.
 */
@Test
public void readEthereumBlockInputFormatBlock1346406Bzip2Compressed() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  final JobConf jobConf = new JobConf(defaultConf);
  final String blockFileName = "eth1346406.bin.bz2";
  final String blockFileLocation = getClass().getClassLoader().getResource("testdata/"+blockFileName).getFile();
  FileInputFormat.setInputPaths(jobConf, new Path(blockFileLocation));

  final EthereumBlockFileInputFormat blockFormat = new EthereumBlockFileInputFormat();
  blockFormat.configure(jobConf);

  final InputSplit[] splits = blockFormat.getSplits(jobConf,1);
  assertEquals( 1, splits.length,"Only one split generated for genesis block");

  final RecordReader<BytesWritable, EthereumBlock> blockReader = blockFormat.getRecordReader(splits[0], jobConf, reporter);
  assertNotNull( blockReader,"Format returned null RecordReader");

  // The first read must deliver the block, the second must report the end of the split.
  final BytesWritable blockKey = new BytesWritable();
  final EthereumBlock blockValue = new EthereumBlock();
  assertTrue( blockReader.next(blockKey,blockValue),"Input Split for block 1346406 contains at least one block");
  assertEquals( 6, blockValue.getEthereumTransactions().size(),"Block 1346406 must have 6 transactions");
  assertFalse( blockReader.next(blockKey,blockValue),"No further blocks in block 1346406");

  blockReader.close();
}
示例14: getRecordReader
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Chooses the import record reader that matches the type of the input split.
 *
 * <p>A {@code CombineFileSplit} is read with {@code ImportCombineFileRecordReader} after
 * the configured export format version has been checked against
 * {@code ExportManifestRecordWriter.FORMAT_VERSION}. A plain {@code FileSplit} is read
 * with {@code ImportRecordReader} on the split's path. Any other split type is rejected.
 *
 * @param inputSplit the split to read
 * @param job the job configuration holding the export format version
 * @param reporter the reporter passed to the combine-file reader
 * @return the reader for the split
 * @throws IOException if the format version is not the expected one or the split type is
 *         unsupported
 */
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    final int configuredVersion = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    final boolean versionSupported = configuredVersion == ExportManifestRecordWriter.FORMAT_VERSION;
    if (!versionSupported) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    final CombineFileSplit combined = (CombineFileSplit) inputSplit;
    return new ImportCombineFileRecordReader(combined, job, reporter);
  }

  // FileSplit indicates the old data pipeline format which doesn't include a manifest file
  if (inputSplit instanceof FileSplit) {
    final FileSplit single = (FileSplit) inputSplit;
    return new ImportRecordReader(job, single.getPath());
  }

  throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
      + " " + inputSplit.getClass());
}
示例15: readEthereumBlockInputFormatBlock1
import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Reads the test file {@code testdata/eth1.bin} through
 * {@code EthereumBlockFileInputFormat} and expects one split that yields exactly one block
 * with 0 transactions.
 */
@Test
public void readEthereumBlockInputFormatBlock1() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  final JobConf jobConf = new JobConf(defaultConf);
  final String blockFileName = "eth1.bin";
  final String blockFileLocation = getClass().getClassLoader().getResource("testdata/"+blockFileName).getFile();
  FileInputFormat.setInputPaths(jobConf, new Path(blockFileLocation));

  final EthereumBlockFileInputFormat blockFormat = new EthereumBlockFileInputFormat();
  blockFormat.configure(jobConf);

  final InputSplit[] splits = blockFormat.getSplits(jobConf,1);
  assertEquals( 1, splits.length,"Only one split generated for genesis block");

  final RecordReader<BytesWritable, EthereumBlock> blockReader = blockFormat.getRecordReader(splits[0], jobConf, reporter);
  assertNotNull( blockReader,"Format returned null RecordReader");

  // The first read must deliver the block, the second must report the end of the split.
  final BytesWritable blockKey = new BytesWritable();
  final EthereumBlock blockValue = new EthereumBlock();
  assertTrue( blockReader.next(blockKey,blockValue),"Input Split for block 1 contains at least one block");
  assertEquals( 0, blockValue.getEthereumTransactions().size(),"Block 1 must have 0 transactions");
  assertFalse( blockReader.next(blockKey,blockValue),"No further blocks in block 1");

  blockReader.close();
}