This article collects and summarizes typical usage examples of the Java method org.apache.hadoop.mapred.TextInputFormat.getSplits. If you are wondering how TextInputFormat.getSplits is used in Java, what exactly it does, or where to find examples of it, the curated method code examples below may help. You can also explore further usage examples of its enclosing class, org.apache.hadoop.mapred.TextInputFormat.
Twelve code examples of TextInputFormat.getSplits are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
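Before the individual examples, here is a minimal sketch of the common pattern most of them follow: register an input path on a JobConf, let TextInputFormat compute the splits, and read each split with a RecordReader. This is only an illustrative standalone sketch, not taken from any example below; the input path and the requested number of splits are placeholder values.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsSketch {
	public static void main(String[] args) throws IOException {
		//register the input path on the job configuration (placeholder path)
		JobConf job = new JobConf();
		FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input.txt"));

		//configure the input format and compute the splits
		//(the second argument is only a hint for the desired number of splits)
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);

		//read every split line by line via a RecordReader
		for( InputSplit split : splits ) {
			RecordReader<LongWritable, Text> reader =
				informat.getRecordReader(split, job, Reporter.NULL);
			LongWritable key = new LongWritable();
			Text value = new Text();
			try {
				while( reader.next(key, value) )
					System.out.println(value.toString());
			}
			finally {
				reader.close();
			}
		}
	}
}

The examples below all use this same configure/getSplits/getRecordReader sequence; they differ mainly in how the splits are consumed (sequentially, or in parallel via a thread pool).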
Example 1: readTextCellFrameFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest,
	ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	if( fs.isDirectory(path) ) {
		//directory of text cell files: read each input split
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		for(InputSplit split: splits)
			readTextCellFrameFromInputSplit(split, informat, job, dest);
	}
	else {
		//single file: delegate to the raw text cell reader
		readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
	}
}
Example 2: testNodeProcessingSchema
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException
{
	//attach a collector sink and configure the word count map/combine classes
	CollectorTestSink sortSink = new CollectorTestSink();
	oper.output.setSink(sortSink);
	oper.setMapClass(WordCount.Map.class);
	oper.setCombineClass(WordCount.Reduce.class);
	oper.setDirName(testMeta.testDir);
	oper.setConfigFile(null);
	oper.setInputFormatClass(TextInputFormat.class);

	//compute input splits and serialize the first split into the operator
	Configuration conf = new Configuration();
	JobConf jobConf = new JobConf(conf);
	FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
	TextInputFormat inputFormat = new TextInputFormat();
	inputFormat.configure(jobConf);
	InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
	SerializationFactory serializationFactory = new SerializationFactory(conf);
	Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
	keySerializer.open(oper.getOutstream());
	keySerializer.serialize(splits[0]);
	oper.setInputSplitClass(splits[0].getClass());
	keySerializer.close();

	//run the operator for two windows and verify the emitted tuples
	oper.setup(null);
	oper.beginWindow(0);
	oper.emitTuples();
	oper.emitTuples();
	oper.endWindow();
	oper.beginWindow(1);
	oper.emitTuples();
	oper.endWindow();

	Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
	for (Object o : sortSink.collectedTuples) {
		LOG.debug(o.toString());
	}
	LOG.debug("Done testing round\n");
	oper.teardown();
}
Example 3: computeCSVSize
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
@Override
protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs)
	throws IOException
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();

	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, numThreads);

	//compute number of columns
	int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());

	//compute number of rows
	int nrow = 0;
	ExecutorService pool = Executors.newFixedThreadPool(numThreads);
	try {
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0));
		List<Future<Long>> cret = pool.invokeAll(tasks);
		for( Future<Long> count : cret )
			nrow += count.get().intValue();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text csv input.", e);
	}
	finally {
		pool.shutdown();
	}
	return new Pair<>(nrow, ncol);
}
Example 4: readCSVFrameFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs,
	FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	splits = IOUtilFunctions.sortInputSplits(splits);

	//read splits sequentially, carrying the running row position (rpos) across splits
	for( int i=0, rpos=0; i<splits.length; i++ )
		rpos = readCSVFrameFromInputSplit(splits[i], informat,
			job, dest, schema, names, rlen, clen, rpos, i==0);
}
Example 5: computeCSVSize
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs)
	throws IOException
{
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	splits = IOUtilFunctions.sortInputSplits(splits);

	//compute number of columns
	int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());

	//compute number of rows
	int nrow = 0;
	for( int i=0; i<splits.length; i++ )
	{
		RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
		LongWritable key = new LongWritable();
		Text value = new Text();

		try
		{
			//ignore header of first split
			if( i==0 && _props.hasHeader() )
				reader.next(key, value);

			//count remaining number of rows, ignore meta data
			while ( reader.next(key, value) ) {
				String val = value.toString();
				nrow += ( val.startsWith(TfUtils.TXMTD_MVPREFIX)
					|| val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
	return new Pair<>(nrow, ncol);
}
Example 6: readTextCellFrameFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
@Override
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest,
	ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();

	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	try
	{
		//create read tasks for all splits
		ExecutorService pool = Executors.newFixedThreadPool(numThreads);
		InputSplit[] splits = informat.getSplits(job, numThreads);
		ArrayList<ReadTask> tasks = new ArrayList<>();
		for( InputSplit split : splits )
			tasks.add(new ReadTask(split, informat, job, dest));

		//wait until all tasks have been executed
		List<Future<Object>> rt = pool.invokeAll(tasks);
		pool.shutdown();

		//check for exceptions
		for( Future<Object> task : rt )
			task.get();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text cell input.", e);
	}
}
Example 7: mergeTextCellWithoutComp
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
private static void mergeTextCellWithoutComp( String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO )
	throws DMLRuntimeException
{
	try
	{
		//delete target file if already exists
		MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

		if( ALLOW_COPY_CELLFILES )
		{
			copyAllFiles(fnameNew, inMO);
			return; //we're done
		}

		//actual merge
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		Path path = new Path( fnameNew );
		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
		BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));

		String valueStr = null;

		try
		{
			for( MatrixObject in : inMO ) //read/write all inputs
			{
				if( LOG.isTraceEnabled() )
					LOG.trace("ResultMerge (local, file): Merge input "+in.hashCode()+" (fname="
						+in.getFileName()+") via stream merge");

				JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
				Path tmpPath = new Path(in.getFileName());
				FileInputFormat.addInputPath(tmpJob, tmpPath);
				TextInputFormat informat = new TextInputFormat();
				informat.configure(tmpJob);
				InputSplit[] splits = informat.getSplits(tmpJob, 1);

				LongWritable key = new LongWritable();
				Text value = new Text();

				for(InputSplit split: splits)
				{
					RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
					try
					{
						while(reader.next(key, value))
						{
							valueStr = value.toString().trim();
							out.write( valueStr+"\n" );
						}
					}
					finally {
						IOUtilFunctions.closeSilently(reader);
					}
				}
			}
		}
		finally {
			IOUtilFunctions.closeSilently(out);
		}
	}
	catch(Exception ex)
	{
		throw new DMLRuntimeException("Unable to merge text cell results.", ex);
	}
}
Example 8: createTextCellStagingFile
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
private static void createTextCellStagingFile( String fnameStaging, MatrixObject mo, long ID )
	throws IOException, DMLRuntimeException
{
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(mo.getFileName());
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);

	LinkedList<Cell> buffer = new LinkedList<>();
	LongWritable key = new LongWritable();
	Text value = new Text();
	MatrixCharacteristics mc = mo.getMatrixCharacteristics();
	int brlen = mc.getRowsPerBlock();
	int bclen = mc.getColsPerBlock();
	//long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
	//NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
	// errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
	// It works fine with int row, col but we require long for larger matrices.
	// Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
	// we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
	FastStringTokenizer st = new FastStringTokenizer(' ');

	for(InputSplit split : splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		try
		{
			while(reader.next(key, value))
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = Double.parseDouble( st.nextToken() );

				Cell tmp = new Cell( row, col, lvalue );

				buffer.addLast( tmp );
				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE ) //periodic flush
				{
					appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
					buffer.clear();
				}
			}

			//final flush
			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
Example 9: createTextCellStagingFile
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
public void createTextCellStagingFile( String fnameOld, String stagingDir )
	throws IOException, DMLRuntimeException
{
	//prepare input
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(fnameOld);
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
	if( !fs.exists(path) )
		throw new IOException("File "+fnameOld+" does not exist on HDFS.");
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);

	LinkedList<Cell> buffer = new LinkedList<>();
	LongWritable key = new LongWritable();
	Text value = new Text();
	FastStringTokenizer st = new FastStringTokenizer(' ');

	for(InputSplit split: splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		try
		{
			while( reader.next(key, value) )
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = st.nextDouble();
				buffer.add(new Cell(row,col,lvalue));

				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE )
				{
					appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
					buffer.clear();
				}
			}

			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
Example 10: readMatrixFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen,
	int brlen, int bclen, long estnnz)
	throws IOException, DMLRuntimeException
{
	// prepare file access
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(fname);
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	InputSplit[] splits = informat.getSplits(job, _numThreads);
	splits = IOUtilFunctions.sortInputSplits(splits);

	// check existence and non-empty file
	checkValidInputFile(fs, path);

	// First Read Pass (count rows/cols, determine offsets, allocate matrix block)
	MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits,
		path, job, _props.hasHeader(), _props.getDelim(), estnnz);
	rlen = ret.getNumRows();
	clen = ret.getNumColumns();

	// Second Read Pass (read, parse strings, append to matrix block)
	readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen,
		_props.hasHeader(), _props.getDelim(), _props.isFill(),
		_props.getFillValue());

	//post-processing (representation-specific, change of sparse/dense block representation)
	// - no sorting required for CSV because it is read in sorted order per row
	// - nnz explicitly maintained in parallel for the individual splits
	ret.examSparsity();

	// sanity check for parallel row count (since determined internally)
	if (rlen > 0 && rlen != ret.getNumRows())
		throw new DMLRuntimeException("Read matrix inconsistent with given meta data: "
			+ "expected nrow="+ rlen + ", real nrow=" + ret.getNumRows());

	return ret;
}
Example 11: readCSVFrameFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
@Override
protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs,
	FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();

	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, numThreads);
	splits = IOUtilFunctions.sortInputSplits(splits);

	try
	{
		ExecutorService pool = Executors.newFixedThreadPool(
			Math.min(numThreads, splits.length));

		//compute num rows per split
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0));
		List<Future<Long>> cret = pool.invokeAll(tasks);

		//compute row offset per split via cumsum on row counts
		long offset = 0;
		List<Long> offsets = new ArrayList<>();
		for( Future<Long> count : cret ) {
			offsets.add(offset);
			offset += count.get();
		}

		//read individual splits
		ArrayList<ReadRowsTask> tasks2 = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks2.add( new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i==0));
		List<Future<Object>> rret = pool.invokeAll(tasks2);
		pool.shutdown();

		//error handling
		for( Future<Object> read : rret )
			read.get();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text csv input.", e);
	}
}
Example 12: readTextCellMatrixFromHDFS
import org.apache.hadoop.mapred.TextInputFormat; // import the package/class this method depends on
private void readTextCellMatrixFromHDFS( Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
	throws IOException
{
	int par = _numThreads;

	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	//check for min file size for matrix market (adjust num splits if necessary)
	if( _isMMFile ){
		long len = MapReduceTool.getFilesizeOnHDFS(path);
		par = ( len < MIN_FILESIZE_MM ) ? 1: par;
	}

	try
	{
		//create read tasks for all splits
		ExecutorService pool = Executors.newFixedThreadPool(par);
		InputSplit[] splits = informat.getSplits(job, par);
		ArrayList<ReadTask> tasks = new ArrayList<>();
		for( InputSplit split : splits ){
			ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
			tasks.add(t);
		}

		//wait until all tasks have been executed
		List<Future<Long>> rt = pool.invokeAll(tasks);

		//check for exceptions and aggregate nnz
		long lnnz = 0;
		for( Future<Long> task : rt )
			lnnz += task.get();

		//post-processing
		dest.setNonZeros( lnnz );
		if( dest.isInSparseFormat() )
			sortSparseRowsParallel(dest, rlen, _numThreads, pool);

		pool.shutdown();
	}
	catch (Exception e) {
		throw new IOException("Threadpool issue, while parallel read.", e);
	}
}