

Java TextInputFormat.getRecordReader Method Code Examples

This article collects typical usage examples of the Java method org.apache.hadoop.mapred.TextInputFormat.getRecordReader. If you are wondering what TextInputFormat.getRecordReader does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the enclosing class, org.apache.hadoop.mapred.TextInputFormat.


The following presents 6 code examples of the TextInputFormat.getRecordReader method, sorted by popularity by default.
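For orientation before the examples, here is a minimal, self-contained sketch of the canonical read loop around TextInputFormat.getRecordReader using the old mapred API. The input path is a placeholder; adjust the JobConf to your environment.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatDemo {
	public static void main(String[] args) throws IOException {
		JobConf job = new JobConf();
		FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input.txt")); //placeholder path
		
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		
		for( InputSplit split : splits ) {
			//obtain a record reader for the split (keys are byte offsets, values are lines)
			RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
			LongWritable key = new LongWritable();
			Text value = new Text();
			try {
				while( reader.next(key, value) )
					System.out.println(key.get() + ": " + value);
			}
			finally {
				reader.close(); //always release the underlying stream
			}
		}
	}
}

The examples below all follow this pattern: configure the format, obtain the splits, then iterate each split with its own record reader and close the reader in a finally block.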

Example 1: computeCSVSize

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs) 
	throws IOException 
{	
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	splits = IOUtilFunctions.sortInputSplits(splits);
	
	//compute number of columns
	int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
	
	//compute number of rows
	int nrow = 0;
	for( int i=0; i<splits.length; i++ ) 
	{
		RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
		LongWritable key = new LongWritable();
		Text value = new Text();
		
		try
		{
			//ignore header of first split
			if( i==0 && _props.hasHeader() )
				reader.next(key, value);
			
			//count remaining number of rows, ignore meta data
			while ( reader.next(key, value) ) {
				String val = value.toString();
				nrow += ( val.startsWith(TfUtils.TXMTD_MVPREFIX)
					|| val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1; 
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
	return new Pair<>(nrow, ncol);
}
 
Developer: apache, Project: systemml, Lines: 39, Source: FrameReaderTextCSV.java

Example 2: mergeTextCellWithoutComp

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
private static void mergeTextCellWithoutComp( String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO ) 
	throws DMLRuntimeException
{
	try
	{
		//delete target file if already exists
		MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
		
		if( ALLOW_COPY_CELLFILES )
		{
			copyAllFiles(fnameNew, inMO);
			return; //we're done
		}
		
		//actual merge
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		Path path = new Path( fnameNew );
		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
		BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));		
		
		String valueStr = null;
		
		try
		{
			for( MatrixObject in : inMO ) //read/write all inputs
			{
				if( LOG.isTraceEnabled() )
					LOG.trace("ResultMerge (local, file): Merge input "+in.hashCode()+" (fname="
						+in.getFileName()+") via stream merge");
				
				JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
				Path tmpPath = new Path(in.getFileName());
				FileInputFormat.addInputPath(tmpJob, tmpPath);
				TextInputFormat informat = new TextInputFormat();
				informat.configure(tmpJob);
				InputSplit[] splits = informat.getSplits(tmpJob, 1);
				
				LongWritable key = new LongWritable();
				Text value = new Text();
	
				for(InputSplit split: splits)
				{
					RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
					try
					{
						while(reader.next(key, value))
						{
							valueStr = value.toString().trim();	
							out.write( valueStr+"\n" );
						}
					}
					finally {
						IOUtilFunctions.closeSilently(reader);
					}
				}
			}
		}
		finally {
			IOUtilFunctions.closeSilently(out);
		}
	}
	catch(Exception ex)
	{
		throw new DMLRuntimeException("Unable to merge text cell results.", ex);
	}
}
 
Developer: apache, Project: systemml, Lines: 67, Source: ResultMergeLocalFile.java

Example 3: createTextCellStagingFile

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
private static void createTextCellStagingFile( String fnameStaging, MatrixObject mo, long ID ) 
	throws IOException, DMLRuntimeException
{		
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(mo.getFileName());
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);
	
	LinkedList<Cell> buffer = new LinkedList<>();
	LongWritable key = new LongWritable();
	Text value = new Text();

	MatrixCharacteristics mc = mo.getMatrixCharacteristics();
	int brlen = mc.getRowsPerBlock(); 
	int bclen = mc.getColsPerBlock();
	//long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
	//NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
	// errors at runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
	// It works fine with int row, col but we require long for larger matrices.
	// Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
	// we simply propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
	
	FastStringTokenizer st = new FastStringTokenizer(' ');
	
	for(InputSplit split : splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		try
		{
			while(reader.next(key, value))
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = Double.parseDouble( st.nextToken() );
				
				Cell tmp = new Cell( row, col, lvalue ); 
				
				buffer.addLast( tmp );
				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE ) //periodic flush
				{
					appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
					buffer.clear();
				}
			}
			
			//final flush
			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
 
Developer: apache, Project: systemml, Lines: 61, Source: ResultMergeLocalFile.java

Example 4: createTextCellStagingFile

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
public void createTextCellStagingFile( String fnameOld, String stagingDir ) 
	throws IOException, DMLRuntimeException
{	
	//prepare input
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());	
	Path path = new Path(fnameOld);
	FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
	if( !fs.exists(path) )	
		throw new IOException("File "+fnameOld+" does not exist on HDFS.");
	FileInputFormat.addInputPath(job, path); 
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);
	InputSplit[] splits = informat.getSplits(job, 1);

	LinkedList<Cell> buffer = new LinkedList<>();
	
	LongWritable key = new LongWritable();
	Text value = new Text();
	FastStringTokenizer st = new FastStringTokenizer(' ');		
	
	for(InputSplit split: splits)
	{
		RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);				
		try
		{
			while( reader.next(key, value) )
			{
				st.reset( value.toString() ); //reset tokenizer
				long row = st.nextLong();
				long col = st.nextLong();
				double lvalue = st.nextDouble();
				
				buffer.add(new Cell(row,col,lvalue));
				
				if( buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE )
				{
					appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
					buffer.clear();
				}
			}
			
			if( !buffer.isEmpty() )
			{
				appendCellBufferToStagingArea(stagingDir, buffer, ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize());
				buffer.clear();
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
	}
}
 
Developer: apache, Project: systemml, Lines: 53, Source: ParameterizedBuiltinCPFileInstruction.java

Example 5: computeCSVSizeAndCreateOutputMatrixBlock

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(
		InputSplit[] splits, Path path, JobConf job, boolean hasHeader,
		String delim, long estnnz) throws IOException, DMLRuntimeException 
{
	int nrow = 0;
	int ncol = 0;
	
	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	// count the number of columns from the first row of the first split
	LongWritable key = new LongWritable();
	Text oneLine = new Text();
	RecordReader<LongWritable, Text> reader = informat
			.getRecordReader(splits[0], job, Reporter.NULL);
	try {
		if (reader.next(key, oneLine)) {
			String cellStr = oneLine.toString().trim();
			ncol = StringUtils.countMatches(cellStr, delim) + 1;
		}
	} 
	finally {
		IOUtilFunctions.closeSilently(reader);
	}

	// count rows in parallel per split
	try 
	{
		ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
		ArrayList<CountRowsTask> tasks = new ArrayList<>();
		for (InputSplit split : splits) {
			tasks.add(new CountRowsTask(split, informat, job, hasHeader));
			hasHeader = false;
		}
		pool.invokeAll(tasks);
		pool.shutdown();

		// collect row counts for offset computation
		// early error notify in case not all tasks successful
		_offsets = new SplitOffsetInfos(tasks.size());
		for (CountRowsTask rt : tasks) {
			if (!rt.getReturnCode())
				throw new IOException("Count task for csv input failed: "+ rt.getErrMsg());
			_offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
			_offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
			nrow = nrow + rt.getRowCount();
		}
	} 
	catch (Exception e) {
		throw new IOException("Threadpool Error " + e.getMessage(), e);
	}
	
	// allocate target matrix block based on given size; 
	// need to allocate sparse as well since lock-free insert into target
	long estnnz2 = (estnnz < 0) ? (long)nrow * ncol : estnnz;
	return createOutputMatrixBlock(nrow, ncol, nrow, ncol, estnnz2, true, true);
}
 
Developer: apache, Project: systemml, Lines: 59, Source: ReaderTextCSVParallel.java
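Note: the CountRowsTask helper used in Example 5 is not included in the excerpt above. A hypothetical sketch of what such a task might look like follows, assuming a Callable that mirrors the getReturnCode/getErrMsg/getRowCount accessors the example calls, and reusing the Hadoop mapred types and the SystemML IOUtilFunctions helper from the surrounding examples; the actual SystemML implementation may differ.

//hypothetical sketch, not the actual SystemML class
private static class CountRowsTask implements java.util.concurrent.Callable<Object> {
	private final InputSplit _split;
	private final TextInputFormat _informat;
	private final JobConf _job;
	private final boolean _hasHeader;
	private int _nrows = 0;
	private boolean _rc = true;
	private String _errMsg = null;
	
	public CountRowsTask(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader) {
		_split = split;
		_informat = informat;
		_job = job;
		_hasHeader = hasHeader;
	}
	
	@Override
	public Object call() throws Exception {
		RecordReader<LongWritable, Text> reader =
			_informat.getRecordReader(_split, _job, Reporter.NULL);
		LongWritable key = new LongWritable();
		Text value = new Text();
		try {
			//skip the header row of the first split (hasHeader is reset
			//to false for all subsequent splits by the caller)
			if( _hasHeader )
				reader.next(key, value);
			//count the remaining rows in this split
			while( reader.next(key, value) )
				_nrows++;
		}
		catch(Exception ex) {
			_rc = false;
			_errMsg = ex.getMessage();
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
		return null;
	}
	
	public boolean getReturnCode() { return _rc; }
	public String getErrMsg() { return _errMsg; }
	public int getRowCount() { return _nrows; }
}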

Example 6: readTextCellFrameFromInputSplit

import org.apache.hadoop.mapred.TextInputFormat; //import the package/class this method depends on
protected static void readTextCellFrameFromInputSplit( InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest)
	throws IOException
{
	ValueType[] schema = dest.getSchema();
	int rlen = dest.getNumRows();
	int clen = dest.getNumColumns();
	
	//create record reader
	RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
	
	LongWritable key = new LongWritable();
	Text value = new Text();
	FastStringTokenizer st = new FastStringTokenizer(' ');
	int row = -1;
	int col = -1;
	
	try
	{
		while( reader.next(key, value) ) {
			st.reset( value.toString() ); //reinit tokenizer
			row = st.nextInt()-1;
			col = st.nextInt()-1;
			if( row == -3 )
				dest.getColumnMetadata(col).setMvValue(st.nextToken());
			else if( row == -2 )
				dest.getColumnMetadata(col).setNumDistinct(st.nextLong());
			else
				dest.set(row, col, UtilFunctions.stringToObject(schema[col], st.nextToken()));
		}
	}
	catch(Exception ex) 
	{
		//post-mortem error handling and bounds checking
		if( row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen ) {
			throw new IOException("Frame cell ["+(row+1)+","+(col+1)+"] " +
								  "out of overall frame range [1:"+rlen+",1:"+clen+"].");
		}
		else {
			throw new IOException( "Unable to read frame in text cell format.", ex );
		}
	}
	finally {
		IOUtilFunctions.closeSilently(reader);
	}		
}
 
Developer: apache, Project: systemml, Lines: 46, Source: FrameReaderTextCell.java


Note: The org.apache.hadoop.mapred.TextInputFormat.getRecordReader examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. Please consult each project's License before distributing or using the code; do not reproduce this article without permission.