本文整理匯總了Java中org.apache.hadoop.mapred.FileSplit類的典型用法代碼示例。如果您正苦於以下問題:Java FileSplit類的具體用法?Java FileSplit怎麼用?Java FileSplit使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
FileSplit類屬於org.apache.hadoop.mapred包,在下文中一共展示了FileSplit類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: getRowGroupNumbersFromFileSplit
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Collects the positions of the row groups that fall inside the given file split.
 * A row group is selected when the first data page offset of its first column lies in
 * the half-open range {@code [split start, split start + split length)}. Per the original
 * author's note, this mirrors how Hive's parquet input format picks row groups for a split.
 *
 * @param split file split whose start offset and length bound the selection
 * @param footer parquet footer that supplies the row group (block) metadata
 * @return zero-based indices, into {@code footer.getBlocks()}, of the selected row groups
 * @throws IOException declared by the signature for callers
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> rowGroups = footer.getBlocks();
  final long rangeStart = split.getStart();
  final long rangeEnd = rangeStart + split.getLength();

  final List<Integer> selected = Lists.newArrayList();
  for (int index = 0; index < rowGroups.size(); index++) {
    // Only the first column's first data page decides which split owns the row group.
    final long offset = rowGroups.get(index).getColumns().get(0).getFirstDataPageOffset();
    if (offset >= rangeStart && offset < rangeEnd) {
      selected.add(index);
    }
  }
  return selected;
}
示例2: HiveVectorizedReaderSetting
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
this.hiveReaderConfig = hiveReaderConfig;
rbCtx = Utilities.getVectorizedRowBatchCtx( job );
partitionValues = new Object[rbCtx.getPartitionColumnCount()];
if( 0 < partitionValues.length ){
rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
}
TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
columnNames = rbCtx.getRowColumnNames();
needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );
projectionColumn = new boolean[columnNames.length];
assignors = new IColumnVectorAssignor[columnNames.length];
for( int id : needColumnIds ){
projectionColumn[id] = true;
assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
}
}
示例3: getRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Opens the file behind the given split and wraps it in a record reader.
 * A vectorized reader is returned when the Hive reader setting reports vector mode,
 * otherwise a line reader.
 *
 * <p>The input stream is opened before the reader is constructed. If anything between
 * opening and handing the stream to the reader throws, the stream is closed here so it
 * does not leak; on success the returned reader receives the open stream.
 *
 * @param split input split; must be a {@link FileSplit}
 * @param job job configuration used to resolve the file system and the reader settings
 * @param reporter Hadoop reporter, wrapped to publish the status message
 * @return record reader over the split's byte range
 * @throws IOException if the file cannot be inspected or opened, or reader setup fails
 */
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  // FileSystem.getLength(Path) is deprecated; the file status carries the same length.
  long fileLength = fs.getFileStatus( path ).getLen();
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();

  InputStream in = fs.open( path );
  boolean readerCreated = false;
  try{
    IJobReporter jobReporter = new HadoopJobReporter( reporter );
    jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
    HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );

    RecordReader<NullWritable,ColumnAndIndex> reader;
    if ( hiveConfig.isVectorMode() ){
      IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
      reader = (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
    }
    else{
      reader = new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
    }
    readerCreated = true;
    return reader;
  }
  finally{
    if( !readerCreated ){
      try{
        in.close();
      }catch( IOException ignored ){
        // Best-effort cleanup: the failure that aborted reader creation is the one to report.
      }
    }
  }
}
示例4: DelimitedAndFixedWidthRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split)
throws IOException {
lengthsAndDelimiters = DelimitedAndFixedWidthHelper
.modifyIdentifier(conf.get("lengthsAndDelimiters").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR));
lengthsAndDelimitersType = conf.get("lengthsAndDelimitersType").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
quote = conf.get("quote");
charsetName = conf.get("charsetName");
start = split.getStart();
pos = start;
end = start + split.getLength();
file = split.getPath();
fs = file.getFileSystem(conf);
fileIn = fs.open(split.getPath());
fileIn.seek(start);
inputStreamReader = new InputStreamReader(fileIn, charsetName);
singleChar = new char[1];
stringBuilder = new StringBuilder();
isQuotePresent = isQuotePresent(quote);
}
示例5: StreamXmlRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
JobConf job, FileSystem fs) throws IOException {
super(in, split, reporter, job, fs);
beginMark_ = checkJobGet(CONF_NS + "begin");
endMark_ = checkJobGet(CONF_NS + "end");
maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
synched_ = false;
slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
if (slowMatch_) {
beginPat_ = makePatternCDataOrMark(beginMark_);
endPat_ = makePatternCDataOrMark(endMark_);
}
init();
}
示例6: getRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Picks a record reader by sniffing the first three bytes of the split's file.
 * Files that start with the bytes 'S', 'E', 'Q' are handed to the sequence file input
 * format; everything else, including files shorter than three bytes, is handed to the
 * text input format.
 *
 * <p>Exactly one delegate reader is created per call. The previous version built a text
 * reader inside the EOF handler and then unconditionally built a second reader afterwards,
 * abandoning the first one without closing it.
 *
 * @param split input split; must be a {@link FileSplit}
 * @param job job configuration used to resolve the file system
 * @param reporter reporter forwarded to the delegate input format
 * @return reader produced by the sequence file or the text input format
 * @throws IOException if the file cannot be opened or read, or the delegate fails
 */
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);

  byte[] header = new byte[3];
  boolean headerComplete = false;
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
    headerComplete = true;
  } catch (EOFException eof) {
    // Fewer than three bytes available: the file cannot carry the "SEQ" magic, treat it as text.
  } finally {
    is.close();
  }

  boolean isSequenceFile = headerComplete
      && header[0] == 'S' && header[1] == 'E' && header[2] == 'Q';
  if (isSequenceFile) {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
示例7: FileSplitParquetRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Creates a parquet record reader bound to a single file split.
 * The constructor only stores its arguments; no I/O or other work happens here.
 *
 * @param oContext operator context
 * @param readerFactory factory for the underlying parquet readers
 * @param columnsToRead columns to read
 * @param groupScanColumns columns of the group scan
 * @param conditions filter conditions
 * @param fileSplit file split this reader is bound to
 * @param footer parquet footer of the file
 * @param jobConf job configuration
 * @param vectorize vectorize flag, stored as given
 * @param enableDetailedTracing detailed tracing flag, stored as given
 */
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  // Execution context and reader construction.
  this.oContext = oContext;
  this.readerFactory = readerFactory;

  // What to read and how to filter it.
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;

  // Where to read it from.
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.jobConf = jobConf;

  // Behaviour switches.
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}
示例8: getRowGroupNumbersFromFileSplit
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Returns the positions of the row groups that belong to the given file split.
 * A row group belongs to the split when the first data page offset of its first column is
 * at or after the split start and before the split start plus the split length. Per the
 * original author's note, this is the same rule Hive's parquet input format applies.
 *
 * @param split file split whose start offset and length bound the selection
 * @param footer parquet footer that supplies the row group (block) metadata
 * @return zero-based indices, into {@code footer.getBlocks()}, of the matching row groups
 * @throws IOException declared by the signature for callers
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long firstByte = split.getStart();
  final long endExclusive = firstByte + split.getLength();

  final List<Integer> matches = Lists.newArrayList();
  int position = -1;
  for (final BlockMetaData block : blocks) {
    position++;
    final long pageOffset = block.getColumns().get(0).getFirstDataPageOffset();
    final boolean outsideSplit = pageOffset < firstByte || pageOffset >= endExclusive;
    if (outsideSplit) {
      continue;
    }
    matches.add(position);
  }
  return matches;
}
示例9: getRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Chooses the import record reader that matches the kind of input split.
 * A {@code CombineFileSplit} marks the newer export format that ships a manifest file; its
 * format version, read from the job configuration, must equal
 * {@code ExportManifestRecordWriter.FORMAT_VERSION}. A plain {@code FileSplit} marks the
 * older data pipeline format without a manifest. Any other split type is rejected.
 *
 * @param inputSplit split to read
 * @param job job configuration; supplies the export format version
 * @param reporter reporter forwarded to the combine-file reader
 * @return reader for the split
 * @throws IOException if the format version is not the expected one or the split type is unsupported
 */
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  if (inputSplit instanceof CombineFileSplit) {
    // New export format: validate the manifest version before reading.
    final int declaredVersion = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (declaredVersion == ExportManifestRecordWriter.FORMAT_VERSION) {
      return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
    }
    throw new IOException("Unknown version: " + job.get(DynamoDBConstants
        .EXPORT_FORMAT_VERSION));
  }

  if (inputSplit instanceof FileSplit) {
    // Old data pipeline format: a single file, no manifest.
    final FileSplit legacySplit = (FileSplit) inputSplit;
    return new ImportRecordReader(job, legacySplit.getPath());
  }

  throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
      + " " + inputSplit.getClass());
}
示例10: IndexRRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Opens the segment file referenced by the given split for row-by-row reading.
 * The split must start at offset 0. Paths that {@code SegmentHelper.checkSegmentByPath}
 * rejects are logged and skipped, and nothing is opened when no file descriptor can be
 * created for the path; in both cases the reader's segment state is left unset.
 *
 * @param inputSplit split to read; must be a {@link FileSplit} starting at offset 0
 * @param configuration configuration used to resolve the file system and the included columns
 * @throws IOException declared by the signature for callers
 */
public IndexRRecordReader(InputSplit inputSplit, Configuration configuration) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Preconditions.checkState(fileSplit.getStart() == 0, "Segment should not splited");

  Path segmentPath = fileSplit.getPath();
  // Hive may ask for a file on the local file system, so the file system is resolved
  // from the path's scheme rather than taken from the default configuration.
  Configuration fsConf = FileSystem.get(configuration).getConf();
  FileSystem fileSystem = FileSystem.get(segmentPath.toUri(), fsConf);

  if (!SegmentHelper.checkSegmentByPath(segmentPath)) {
    LOG.warn("ignore " + segmentPath);
    return;
  }

  ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, segmentPath);
  IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(segmentPath.toString(), opener);
  if (fd == null) {
    return;
  }

  segment = fd.open();
  offset = 0L;
  rowIterator = segment.rowTraversal().iterator();
  getIncludeColumns(configuration, segment);
}
示例11: getSplits
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Computes the splits of the parent input format and wraps each one as a {@code FileSplit}.
 * Before delegating, the settings loaded from the job are initialised: the value reader is
 * set if missing, the target fields are derived from the column-to-alias mapping, the read
 * resource is re-applied, and the Hive utilities are initialised. Every split returned by
 * the parent is then wrapped in an {@code EsHiveSplit} that carries the table location path.
 *
 * @param job job configuration; supplies the settings and the table location
 * @param numSplits split count hint forwarded to the parent
 * @return one wrapper per split produced by the parent, in the same order
 * @throws IOException declared by the signature for callers
 */
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Input table properties are not accessible here, so work from the job settings.
  Settings settings = HadoopSettingsManager.loadFrom(job);
  //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

  Log log = LogFactory.getLog(getClass());

  // Initialisation of the reader-side settings.
  InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
  String targetFields = StringUtils.concatenateAndUriEncode(HiveUtils.columnToAlias(settings), ",");
  settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS, targetFields);

  // Re-apply the read resource.
  settings.setResourceRead(settings.getResourceRead());
  HiveUtils.init(settings, log);

  // Wrap the parent's splits as FileSplit instances.
  InputSplit[] delegateSplits = super.getSplits(job, numSplits);
  FileSplit[] decorated = new FileSplit[delegateSplits.length];
  Path tableLocation = new Path(job.get(HiveConstants.TABLE_LOCATION));
  int index = 0;
  for (InputSplit delegate : delegateSplits) {
    decorated[index++] = new EsHiveSplit(delegate, tableLocation);
  }
  return decorated;
}
示例12: testOrcDataStream
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Writes a test file with the ORC output format and reads it back through
 * {@code OrcPageSourceFactory}, using {@code TEST_COLUMNS} and {@code NUM_ROWS}.
 * The temporary file is removed afterwards whether or not the check succeeds.
 */
@Test
public void testOrcDataStream()
        throws Exception
{
    HiveOutputFormat<?, ?> writerFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
    InputFormat<?, ?> readerFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
    @SuppressWarnings("deprecation")
    SerDe orcSerde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();

    // Reserve a unique temp path, then delete the empty file before the test file is created there.
    File tempFile = File.createTempFile("presto_test", "orc");
    tempFile.delete();
    try {
        String location = tempFile.getAbsolutePath();
        FileSplit split = createTestFile(location, writerFormat, orcSerde, null, TEST_COLUMNS, NUM_ROWS);
        OrcPageSourceFactory pageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER);
        testPageSourceFactory(pageSourceFactory, split, readerFormat, orcSerde, TEST_COLUMNS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
    }
}
示例13: testRcTextPageSource
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Writes a test file with the RCFile output format and {@code ColumnarSerDe}, then reads it
 * back through {@code RcFilePageSourceFactory}. The test is currently disabled.
 * The temporary file is removed afterwards whether or not the check succeeds.
 */
@Test(enabled = false)
public void testRcTextPageSource()
        throws Exception
{
    HiveOutputFormat<?, ?> writerFormat = new RCFileOutputFormat();
    InputFormat<?, ?> readerFormat = new RCFileInputFormat<>();
    @SuppressWarnings("deprecation")
    SerDe columnarSerde = new ColumnarSerDe();

    // NOTE(review): the suffix says "rc-binary" although this is the text variant; it only
    // affects the temp file name — confirm whether that is intentional.
    File tempFile = File.createTempFile("presto_test", "rc-binary");
    tempFile.delete();
    try {
        String location = tempFile.getAbsolutePath();
        FileSplit split = createTestFile(location, writerFormat, columnarSerde, null, TEST_COLUMNS, NUM_ROWS);
        RcFilePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER);
        testPageSourceFactory(pageSourceFactory, split, readerFormat, columnarSerde, TEST_COLUMNS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
    }
}
示例14: getSplits
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Computes the parent's splits and wraps every {@code FileSplit} among them in a
 * {@code RemoteParForColocatedFileSplit} that carries the partitioning file name and the
 * partition block length. Splits of any other type are passed through unchanged.
 *
 * @param job job configuration; supplies the partitioned matrix size, partitioning format
 *     and partitioning file name
 * @param numSplits split count hint forwarded to the parent
 * @return splits in the parent's order, with file splits replaced by their wrappers
 * @throws IOException declared by the signature for callers
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException
{
  InputSplit[] original = super.getSplits(job, numSplits);

  // Partitioning information from the job.
  MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
  PartitionFormat pf = new PartitionFormat(dpf, -1);
  int blen;
  if( pf.isRowwise() ) {
    blen = (int) pf.getNumRows(mc);
  }
  else {
    blen = (int) pf.getNumColumns(mc);
  }
  String fname = MRJobConfiguration.getPartitioningFilename(job);

  // Wrap file splits; the instanceof check guards the cast.
  InputSplit[] wrapped = new InputSplit[ original.length ];
  for( int i=0; i<original.length; i++ ) {
    InputSplit current = original[i];
    wrapped[i] = ( current instanceof FileSplit )
        ? new RemoteParForColocatedFileSplit( (FileSplit) current, fname, blen )
        : current;
  }
  return wrapped;
}
示例15: LineDocRecordReader
import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Creates a reader over the byte range of one file split.
 * When the split does not begin at offset 0, the stream is positioned one byte before the
 * split start and the data up to the next end-of-line is skipped, so reading begins at the
 * first complete line; {@code start} is advanced by the number of bytes skipped.
 *
 * <p>If seeking or skipping the first line fails, the freshly opened file stream is closed
 * before the failure propagates, so the constructor does not leak it.
 *
 * @param job configuration used to resolve the file system of the split's path
 * @param split file split giving the path, start offset and length to read
 * @throws IOException if the file cannot be opened, positioned or read
 */
public LineDocRecordReader(Configuration job, FileSplit split)
    throws IOException {
  long start = split.getStart();
  final long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(file);
  boolean initialized = false;
  try {
    final boolean skipFirstLine = start != 0;
    if (skipFirstLine) {
      // Step back one byte so a line that begins exactly at the split start is not dropped.
      --start;
      fileIn.seek(start);
    }
    this.in = new BufferedInputStream(fileIn);
    if (skipFirstLine) { // skip first line and re-establish "start".
      start += LineDocRecordReader.readData(this.in, null, EOL);
    }
    initialized = true;
  } finally {
    if (!initialized) {
      try {
        fileIn.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the failure that aborted construction is the one to report.
      }
    }
  }

  this.start = start;
  this.pos = start;
  this.end = end;
}