

Java TextInputFormat.getSplits Method Code Examples

This article collects typical usage examples of the Java method org.apache.hadoop.mapred.TextInputFormat.getSplits. If you are wondering what TextInputFormat.getSplits does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of org.apache.hadoop.mapred.TextInputFormat, the class that declares this method.


The following presents 12 code examples of the TextInputFormat.getSplits method, sorted by popularity by default.
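All of the examples on this page follow the same basic pattern: add an input path to a JobConf, configure a TextInputFormat, and then call getSplits. The following minimal sketch is not taken from any of the projects below; the input path and the split hint of 4 are placeholder assumptions, shown only to isolate that pattern:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    // hypothetical input path; replace with a real HDFS or local path
    FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input"));
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job); // must be called before getSplits
    // the second argument is only a hint for the desired number of splits;
    // the actual number returned depends on file sizes and block boundaries
    InputSplit[] splits = informat.getSplits(job, 4);
    System.out.println("number of splits: " + splits.length);
  }
}

Note that the numSplits argument is a hint rather than a guarantee: the examples below pass 1 for simple sequential reads and a thread count (e.g. OptimizerUtils.getParallelTextReadParallelism()) when the splits will be read in parallel.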

Example 1: readTextCellFrameFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, 
		ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	if( fs.isDirectory(path) ) {
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		for(InputSplit split: splits)
			readTextCellFrameFromInputSplit(split, informat, job, dest);
	}
	else {
		readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
	}
}
 
Developer: apache, Project: systemml, Lines of code: 17, Source file: FrameReaderTextCell.java

Example 2: testNodeProcessingSchema

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException
{

  CollectorTestSink sortSink = new CollectorTestSink();
  oper.output.setSink(sortSink);

  oper.setMapClass(WordCount.Map.class);
  oper.setCombineClass(WordCount.Reduce.class);
  oper.setDirName(testMeta.testDir);
  oper.setConfigFile(null);
  oper.setInputFormatClass(TextInputFormat.class);

  Configuration conf = new Configuration();
  JobConf jobConf = new JobConf(conf);
  FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(jobConf);
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
  keySerializer.open(oper.getOutstream());
  keySerializer.serialize(splits[0]);
  oper.setInputSplitClass(splits[0].getClass());
  keySerializer.close();
  oper.setup(null);
  oper.beginWindow(0);
  oper.emitTuples();
  oper.emitTuples();
  oper.endWindow();
  oper.beginWindow(1);
  oper.emitTuples();
  oper.endWindow();

  Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
  for (Object o : sortSink.collectedTuples) {
    LOG.debug(o.toString());
  }
  LOG.debug("Done testing round\n");
  oper.teardown();
}
 
Developer: apache, Project: apex-malhar, Lines of code: 41, Source file: MapOperatorTest.java

Example 3: computeCSVSize

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
@Override
protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs) 
	throws IOException 
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();
	
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, numThreads);
	
	//compute number of columns
	int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
	
	//compute number of rows
	int nrow = 0;
	ExecutorService pool = Executors.newFixedThreadPool(numThreads);
	try {
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0));
		List<Future<Long>> cret = pool.invokeAll(tasks);
		for( Future<Long> count : cret ) 
			nrow += count.get().intValue();
	}
	catch (Exception e) {
		throw new IOException("Failed parallel read of text csv input.", e);
	}
	finally {
		pool.shutdown();
	}
	return new Pair<>(nrow, ncol);
}
 
Developer: apache, Project: systemml, Lines of code: 33, Source file: FrameReaderTextCSVParallel.java

Example 4: readCSVFrameFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs, 
		FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) 
	throws IOException
{
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	splits = IOUtilFunctions.sortInputSplits(splits);
	for( int i=0, rpos=0; i<splits.length; i++ )
		rpos = readCSVFrameFromInputSplit(splits[i], informat,
			job, dest, schema, names, rlen, clen, rpos, i==0);
}
 
Developer: apache, Project: systemml, Lines of code: 13, Source file: FrameReaderTextCSV.java

Example 5: computeCSVSize

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs) 
	throws IOException 
{	
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	splits = IOUtilFunctions.sortInputSplits(splits);
	
	//compute number of columns
	int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
	
	//compute number of rows
	int nrow = 0;
	for( int i=0; i<splits.length; i++ ) 
	{
		RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
		LongWritable key = new LongWritable();
		Text value = new Text();
		
		try
		{
			//ignore header of first split
			if( i==0 && _props.hasHeader() )
				reader.next(key, value);
			
			//count remaining number of rows, ignore meta data
			while ( reader.next(key, value) ) {
				String val = value.toString();
				nrow += ( val.startsWith(TfUtils.TXMTD_MVPREFIX)
					|| val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1; 
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
	return new Pair<>(nrow, ncol);
}
 
Developer: apache, Project: systemml, Lines of code: 39, Source file: FrameReaderTextCSV.java

Example 6: readTextCellFrameFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
@Override
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, 
		ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();
	
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	
	try 
	{
		//create read tasks for all splits
		ExecutorService pool = Executors.newFixedThreadPool(numThreads);
		InputSplit[] splits = informat.getSplits(job, numThreads);
		ArrayList<ReadTask> tasks = new ArrayList<>();
		for( InputSplit split : splits )
			tasks.add(new ReadTask(split, informat, job, dest));
		
		//wait until all tasks have been executed
		List<Future<Object>> rt = pool.invokeAll(tasks);
		pool.shutdown();
			
		//check for exceptions
		for( Future<Object> task : rt )
			task.get();
	} 
	catch (Exception e) {
		throw new IOException("Failed parallel read of text cell input.", e);
	}
}
 
Developer: apache, Project: systemml, Lines of code: 33, Source file: FrameReaderTextCellParallel.java

Example 7: mergeTextCellWithoutComp

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
private static void mergeTextCellWithoutComp( String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO ) 
	throws DMLRuntimeException
{
	try
	{
		//delete target file if already exists
		MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
		
		if( ALLOW_COPY_CELLFILES )
		{
			copyAllFiles(fnameNew, inMO);
			return; //we're done
		}
		
		//actual merge
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		Path path = new Path( fnameNew );
		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
		BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));		
		
		String valueStr = null;
		
		try
		{
			for( MatrixObject in : inMO ) //read/write all inputs
			{
				if( LOG.isTraceEnabled() )
					LOG.trace("ResultMerge (local, file): Merge input "+in.hashCode()+" (fname="
						+in.getFileName()+") via stream merge");
				
				JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
				Path tmpPath = new Path(in.getFileName());
				FileInputFormat.addInputPath(tmpJob, tmpPath);
				TextInputFormat informat = new TextInputFormat();
				informat.configure(tmpJob);
				InputSplit[] splits = informat.getSplits(tmpJob, 1);
				
				LongWritable key = new LongWritable();
				Text value = new Text();
	
				for(InputSplit split: splits)
				{
					RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
					try
					{
						while(reader.next(key, value))
						{
							valueStr = value.toString().trim();	
							out.write( valueStr+"\n" );
						}
					}
					finally {
						IOUtilFunctions.closeSilently(reader);
					}
				}
			}
		}
		finally {
			IOUtilFunctions.closeSilently(out);
		}
	}
	catch(Exception ex)
	{
		throw new DMLRuntimeException("Unable to merge text cell results.", ex);
	}
}
 
Developer: apache, Project: systemml, Lines of code: 67, Source file: ResultMergeLocalFile.java

Example 8: createTextCellStagingFile

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
private static void createTextCellStagingFile( String fnameStaging, MatrixObject mo, long ID ) 
	throws IOException, DMLRuntimeException
{		
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(mo.getFileName());
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	
	LinkedList<Cell> buffer = new LinkedList<>();
	LongWritable key = new LongWritable();
	Text value = new Text();

	MatrixCharacteristics mc = mo.getMatrixCharacteristics();
	int brlen = mc.getRowsPerBlock(); 
	int bclen = mc.getColsPerBlock();
	//long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
	//NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
	// errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
	// It works fine with int row, col but we require long for larger matrices.
	// Since, textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:binarycell)
	// we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
	
	FastStringTokenizer st = new FastStringTokenizer(' ');
	
	for(InputSplit split : splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		try
		{
			while(reader.next(key, value))
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = Double.parseDouble( st.nextToken() );
				
				Cell tmp = new Cell( row, col, lvalue ); 
				
				buffer.addLast( tmp );
				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE ) //periodic flush
				{
					appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
					buffer.clear();
				}
			}
			
			//final flush
			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
 
Developer: apache, Project: systemml, Lines of code: 61, Source file: ResultMergeLocalFile.java

Example 9: createTextCellStagingFile

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
public void createTextCellStagingFile( String fnameOld, String stagingDir ) 
	throws IOException, DMLRuntimeException
{	
	//prepare input
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());	
	Path path = new Path(fnameOld);
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
	if( !fs.exists(path) )	
		throw new IOException("File "+fnameOld+" does not exist on HDFS.");
	FileInputFormat.addInputPath(job, path); 
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);

	LinkedList<Cell> buffer = new LinkedList<>();
	
	LongWritable key = new LongWritable();
	Text value = new Text();
	FastStringTokenizer st = new FastStringTokenizer(' ');		
	
	for(InputSplit split: splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);				
		try
		{
			while( reader.next(key, value) )
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = st.nextDouble();
				
				buffer.add(new Cell(row,col,lvalue));
				
				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE )
				{
					appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
					buffer.clear();
				}
			}
			
			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
 
Developer: apache, Project: systemml, Lines of code: 53, Source file: ParameterizedBuiltinCPFileInstruction.java

Example 10: readMatrixFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen,
		int brlen, int bclen, long estnnz) 
	throws IOException, DMLRuntimeException 
{
	// prepare file access
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(fname);
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
	
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	InputSplit[] splits = informat.getSplits(job, _numThreads);
	splits = IOUtilFunctions.sortInputSplits(splits);

	// check existence and non-empty file
	checkValidInputFile(fs, path);

	// allocate output matrix block
	// First Read Pass (count rows/cols, determine offsets, allocate matrix block)
	MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits,
			path, job, _props.hasHeader(), _props.getDelim(), estnnz);
	rlen = ret.getNumRows();
	clen = ret.getNumColumns();

	// Second Read Pass (read, parse strings, append to matrix block)
	readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen,
			_props.hasHeader(), _props.getDelim(), _props.isFill(),
			_props.getFillValue());
	
	//post-processing (representation-specific, change of sparse/dense block representation)
	// - no sorting required for CSV because it is read in sorted order per row
	// - nnz explicitly maintained in parallel for the individual splits
	ret.examSparsity();

	// sanity check for parallel row count (since determined internally)
	if (rlen > 0 && rlen != ret.getNumRows())
		throw new DMLRuntimeException("Read matrix inconsistent with given meta data: "
				+ "expected nrow="+ rlen + ", real nrow=" + ret.getNumRows());

	return ret;
}
 
Developer: apache, Project: systemml, Lines of code: 45, Source file: ReaderTextCSVParallel.java

Example 11: readCSVFrameFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
@Override
protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs, 
		FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) 
	throws IOException
{
	int numThreads = OptimizerUtils.getParallelTextReadParallelism();
	
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, numThreads); 
	splits = IOUtilFunctions.sortInputSplits(splits);

	try 
	{
		ExecutorService pool = Executors.newFixedThreadPool(
			Math.min(numThreads, splits.length));
		
		//compute num rows per split
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0));
		List<Future<Long>> cret = pool.invokeAll(tasks);

		//compute row offset per split via cumsum on row counts
		long offset = 0;
		List<Long> offsets = new ArrayList<>();
		for( Future<Long> count : cret ) {
			offsets.add(offset);
			offset += count.get();
		}
		
		//read individual splits
		ArrayList<ReadRowsTask> tasks2 = new ArrayList<>();
		for( int i=0; i<splits.length; i++ )
			tasks2.add( new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i==0));
		List<Future<Object>> rret = pool.invokeAll(tasks2);
		pool.shutdown();
		
		//error handling
		for( Future<Object> read : rret )
			read.get();
	} 
	catch (Exception e) {
		throw new IOException("Failed parallel read of text csv input.", e);
	}
}
 
Developer: apache, Project: systemml, Lines of code: 47, Source file: FrameReaderTextCSVParallel.java

Example 12: readTextCellMatrixFromHDFS

import org.apache.hadoop.mapred.TextInputFormat; // import the package/class the method depends on
private void readTextCellMatrixFromHDFS( Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
	throws IOException
{
	int par = _numThreads;
	
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	
	//check for min file size for matrix market (adjust num splits if necessary)
	if( _isMMFile ){
		long len = MapReduceTool.getFilesizeOnHDFS(path);
		par = ( len < MIN_FILESIZE_MM ) ? 1: par; 
	}	
	
	try 
	{
		//create read tasks for all splits
		ExecutorService pool = Executors.newFixedThreadPool(par);
		InputSplit[] splits = informat.getSplits(job, par);
		ArrayList<ReadTask> tasks = new ArrayList<>();
		for( InputSplit split : splits ){
			ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
			tasks.add(t);
		}
		
		//wait until all tasks have been executed
		List<Future<Long>> rt = pool.invokeAll(tasks);	
		
		//check for exceptions and aggregate nnz
		long lnnz = 0;
		for( Future<Long> task : rt )
			lnnz += task.get();
			
		//post-processing
		dest.setNonZeros( lnnz );
		if( dest.isInSparseFormat() ) 
			sortSparseRowsParallel(dest, rlen, _numThreads, pool);
		
		pool.shutdown();
	} 
	catch (Exception e) {
		throw new IOException("Threadpool issue, while parallel read.", e);
	}
}
 
Developer: apache, Project: systemml, Lines of code: 46, Source file: ReaderTextCellParallel.java


Note: The org.apache.hadoop.mapred.TextInputFormat.getSplits examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, and copyright remains with the original authors. When redistributing or using the code, please follow the License of the corresponding project; do not reproduce without permission.