当前位置: 首页>>代码示例>>Java>>正文


Java InputFormat.getRecordReader方法代码示例

本文整理汇总了Java中org.apache.hadoop.mapred.InputFormat.getRecordReader方法的典型用法代码示例。如果您正苦于以下问题:Java InputFormat.getRecordReader方法的具体用法?Java InputFormat.getRecordReader怎么用?Java InputFormat.getRecordReader使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.hadoop.mapred.InputFormat的用法示例。


在下文中一共展示了InputFormat.getRecordReader方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getSample

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
开发者ID:naver,项目名称:hadoop,代码行数:29,代码来源:InputSampler.java

示例2: getSample

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
开发者ID:hopshadoop,项目名称:hops,代码行数:31,代码来源:InputSampler.java

示例3: getRecordReader

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
 
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
     FileSplit fileSplit = (FileSplit) inputSplit;
     conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
     conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
     conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
   }
  
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
开发者ID:apache,项目名称:systemml,代码行数:22,代码来源:DelegatingInputFormat.java

示例4: getRecordReader

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.

  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
开发者ID:naver,项目名称:hadoop,代码行数:14,代码来源:DelegatingInputFormat.java

示例5: getRecordReader

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
  final InputFormat<BytesWritable, BytesWritable> inputFormat,
  final JobConf jobConf) throws ExecutionSetupException {
  try {
    return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
  } catch (IOException e) {
    throw new ExecutionSetupException(
        String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
            split.getPath(), split.getStart(), split.getLength()), e);
  }
}
 
开发者ID:dremio,项目名称:dremio-oss,代码行数:12,代码来源:SequenceFileRecordReader.java

示例6: countNumColumnsCSV

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
/**
 * Counts the number of columns in a given collection of csv file splits. This primitive aborts 
 * if a row with more than 0 columns is found and hence is robust against empty file splits etc.
 * 
 * @param splits input splits
 * @param informat input format
 * @param job job configruation
 * @param delim delimiter
 * @return the number of columns in the collection of csv file splits
 * @throws IOException if IOException occurs
 */
@SuppressWarnings({ "rawtypes" })
public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat, JobConf job, String delim ) 
	throws IOException 
{
	LongWritable key = new LongWritable();
	Text value = new Text();
	int ncol = -1; 
	for( int i=0; i<splits.length && ncol<=0; i++ ) {
		RecordReader<LongWritable, Text> reader = 
				informat.getRecordReader(splits[i], job, Reporter.NULL);
		try {
			if( reader.next(key, value) ) {
				boolean hasValue = true;
				if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
					hasValue = reader.next(key, value);
				if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
					hasValue = reader.next(key, value);
				String row = value.toString().trim();
				if( hasValue && !row.isEmpty() ) {
					ncol = IOUtilFunctions.countTokensCSV(row, delim);
				}
			}
		}
		finally {
			closeSilently(reader);	
		}
	}
	return ncol;
}
 
开发者ID:apache,项目名称:systemml,代码行数:41,代码来源:IOUtilFunctions.java

示例7: getSample

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.length; ++i) {
    InputSplit tmp = splits[i];
    int j = r.nextInt(splits.length);
    splits[i] = splits[j];
    splits[j] = tmp;
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.length && samples.size() < numSamples); ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
        Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(key);
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, key);
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
开发者ID:Nextzero,项目名称:hadoop-2.6.0-cdh5.4.3,代码行数:55,代码来源:InputSampler.java

示例8: createPushRuntime

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
                throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                try {
                    parser.open(writer);
                    InputFormat inputFormat = conf.getInputFormat();
                    for (int i = 0; i < inputSplits.length; i++) {
                        /**
                         * read all the partitions scheduled to the current node
                         */
                        if (scheduledLocations[i].equals(nodeName)) {
                            /**
                             * pick an unread split to read
                             * synchronize among simultaneous partitions in the same machine
                             */
                            synchronized (executed) {
                                if (executed[i] == false) {
                                    executed[i] = true;
                                } else {
                                    continue;
                                }
                            }

                            /**
                             * read the split
                             */
                            RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                            Object key = reader.createKey();
                            Object value = reader.createValue();
                            while (reader.next(key, value) == true) {
                                parser.parse(key, value, writer, inputSplits[i].toString());
                            }
                        }
                    }
                } finally {
                    parser.close(writer);
                }
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
 
开发者ID:apache,项目名称:incubator-asterixdb-hyracks,代码行数:64,代码来源:HDFSReadOperatorDescriptor.java

示例9: readCSVFrameFromInputSplit

import org.apache.hadoop.mapred.InputFormat; //导入方法依赖的package包/类
protected final int readCSVFrameFromInputSplit( InputSplit split, InputFormat<LongWritable,Text> informat, JobConf job, 
		FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first)
	throws IOException
{
	boolean hasHeader = _props.hasHeader();
	boolean isFill = _props.isFill();
	double dfillValue = _props.getFillValue();
	String sfillValue = String.valueOf(_props.getFillValue());
	String delim = _props.getDelim();
	
	//create record reader
	RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
	LongWritable key = new LongWritable();
	Text value = new Text();
	int row = rl;
	int col = -1;
	
	//handle header if existing
	if(first && hasHeader ) {
		reader.next(key, value); //read header
		dest.setColumnNames(value.toString().split(delim));
	}
		
	// Read the data
	boolean emptyValuesFound = false;
	try
	{
		while( reader.next(key, value) ) //foreach line
		{
			String cellStr = value.toString().trim();
			emptyValuesFound = false; col = 0;
			String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);
			
			//parse frame meta data (missing values / num distinct)
			if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) || parts[0].equals(TfUtils.TXMTD_NDPREFIX) ) {
				if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) )
					for( int j=0; j<dest.getNumColumns(); j++ )
						dest.getColumnMetadata(j).setMvValue(parts[j+1]);
				else if( parts[0].equals(TfUtils.TXMTD_NDPREFIX) )
					for( int j=0; j<dest.getNumColumns(); j++ )
						dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j+1]));
				continue;
			}
			
			for( String part : parts ) //foreach cell
			{
				part = part.trim();
				if ( part.isEmpty() ) {
					if( isFill && dfillValue!=0 )
						dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
					emptyValuesFound = true;
				}
				else {
					dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
				}
				col++;
			}
			
			//sanity checks for empty values and number of columns
			IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
			IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
			row++;
		}
	}
	finally {
		IOUtilFunctions.closeSilently(reader);
	}
	
	return row;
}
 
开发者ID:apache,项目名称:systemml,代码行数:71,代码来源:FrameReaderTextCSV.java


注:本文中的org.apache.hadoop.mapred.InputFormat.getRecordReader方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。