This article collects typical usage examples of the Java method org.apache.hadoop.mapred.InputFormat.getRecordReader. If you are wondering what InputFormat.getRecordReader does and how to use it in practice, the curated method examples below may help. You can also explore further usage examples of its enclosing class, org.apache.hadoop.mapred.InputFormat.
The following presents 9 code examples of InputFormat.getRecordReader, ordered by popularity by default.
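Before the examples, here is a minimal, self-contained sketch of the canonical read loop around getRecordReader using the old mapred API. The choice of TextInputFormat and the "inputDir" path are assumptions for illustration, not taken from the examples below.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class ReadLoopSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("inputDir")); // placeholder path
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    for (InputSplit split : informat.getSplits(job, 1)) {
      // One RecordReader per split; Reporter.NULL since we run outside a task.
      RecordReader<LongWritable, Text> reader =
          informat.getRecordReader(split, job, Reporter.NULL);
      try {
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}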
Example 1: getSample
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
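This sampler matches the shape of Hadoop's InputSampler.SplitSampler. Below is a hedged sketch of how such a sampler is typically wired into a total-order sort with the old mapred API; the paths and the Text key type are assumptions for illustration (throws Exception is used for brevity, since the checked exceptions of writePartitionFile vary across Hadoop versions).

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderSetupSketch {
  public static void configure(JobConf job) throws Exception {
    // Input format whose keys match the map output key type (Text here).
    job.setInputFormat(KeyValueTextInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("inputDir")); // placeholder
    job.setMapOutputKeyClass(Text.class);
    // Partition boundaries are written to this (placeholder) file.
    TotalOrderPartitioner.setPartitionFile(job, new Path("_partitions"));
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Draw up to 1000 keys from at most 10 splits, as in the getSample above.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.SplitSampler<Text, Text>(1000, 10);
    InputSampler.writePartitionFile(job, sampler);
  }
}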
Example 2: getSample
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Example 3: getRecordReader
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
  }
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
      reporter);
}
Example 4: getRecordReader
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
      reporter);
}
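The TaggedInputSplit consumed by Examples 3 and 4 is what a delegating input format produces when one job mixes several input formats, as typically configured through MultipleInputs in the old mapred API. A hedged configuration sketch (the input paths are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MixedInputsSketch {
  public static void configure(JobConf job) {
    // Each path gets its own InputFormat; the framework wraps the
    // per-format splits as TaggedInputSplits, which the delegating
    // record readers above unwrap at task time.
    MultipleInputs.addInputPath(job, new Path("textDir"), TextInputFormat.class);
    MultipleInputs.addInputPath(job, new Path("seqDir"), SequenceFileInputFormat.class);
  }
}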
Example 5: getRecordReader
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
    final InputFormat<BytesWritable, BytesWritable> inputFormat,
    final JobConf jobConf) throws ExecutionSetupException {
  try {
    return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
  } catch (IOException e) {
    throw new ExecutionSetupException(
        String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
            split.getPath(), split.getStart(), split.getLength()), e);
  }
}
Example 6: countNumColumnsCSV
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
/**
 * Counts the number of columns in a given collection of csv file splits. This primitive stops
 * as soon as a row with more than 0 columns is found and hence is robust against empty file splits etc.
 *
 * @param splits input splits
 * @param informat input format
 * @param job job configuration
 * @param delim delimiter
 * @return the number of columns in the collection of csv file splits
 * @throws IOException if IOException occurs
 */
@SuppressWarnings({ "rawtypes" })
public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat, JobConf job, String delim )
  throws IOException
{
  LongWritable key = new LongWritable();
  Text value = new Text();
  int ncol = -1;
  for( int i=0; i<splits.length && ncol<=0; i++ ) {
    RecordReader<LongWritable, Text> reader =
        informat.getRecordReader(splits[i], job, Reporter.NULL);
    try {
      if( reader.next(key, value) ) {
        //skip transform meta data rows, if present
        boolean hasValue = true;
        if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
          hasValue = reader.next(key, value);
        if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
          hasValue = reader.next(key, value);
        String row = value.toString().trim();
        if( hasValue && !row.isEmpty() ) {
          ncol = IOUtilFunctions.countTokensCSV(row, delim);
        }
      }
    }
    finally {
      closeSilently(reader);
    }
  }
  return ncol;
}
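A hedged sketch of how such a counting primitive might be invoked, assuming a configured TextInputFormat, a placeholder CSV path, and that the countNumColumnsCSV above is in scope (e.g. via static import):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class CountColumnsSketch {
  public static int countColumns(String csvPath, String delim) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path(csvPath)); // placeholder path
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    // Delegates to the countNumColumnsCSV shown above.
    return countNumColumnsCSV(splits, informat, job, delim);
  }
}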
Example 7: getSample
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.length; ++i) {
    InputSplit tmp = splits[i];
    int j = r.nextInt(splits.length);
    splits[i] = splits[j];
    splits[j] = tmp;
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
       (i < splits.length && samples.size() < numSamples); ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
        Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(key);
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, key);
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
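Two details in this sampler are worth unpacking. First, r.nextInt(numSamples) returns a value in [0, numSamples), so the guard ind != numSamples is always true and every accepted key past the quota replaces a random reservoir slot. Second, freq *= (numSamples - 1) / (double) numSamples shrinks the acceptance probability after each replacement so that late keys do not crowd out early ones; with numSamples = 1000, each replacement multiplies freq by 0.999, so the acceptance rate halves after roughly 693 replacements (0.999^693 ≈ 0.5).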
Example 8: createPushRuntime
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
    IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
    throws HyracksDataException {
  final InputSplit[] inputSplits = splitsFactory.getSplits();

  return new AbstractUnaryOutputSourceOperatorNodePushable() {
    private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();

    @SuppressWarnings("unchecked")
    @Override
    public void initialize() throws HyracksDataException {
      ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
      try {
        writer.open();
        Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
        JobConf conf = confFactory.getConf();
        conf.setClassLoader(ctx.getJobletContext().getClassLoader());
        IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
        try {
          parser.open(writer);
          InputFormat inputFormat = conf.getInputFormat();
          for (int i = 0; i < inputSplits.length; i++) {
            /**
             * read all the partitions scheduled to the current node
             */
            if (scheduledLocations[i].equals(nodeName)) {
              /**
               * pick an unread split to read;
               * synchronize among simultaneous partitions on the same machine
               */
              synchronized (executed) {
                if (executed[i] == false) {
                  executed[i] = true;
                } else {
                  continue;
                }
              }
              /**
               * read the split
               */
              RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
              Object key = reader.createKey();
              Object value = reader.createValue();
              while (reader.next(key, value) == true) {
                parser.parse(key, value, writer, inputSplits[i].toString());
              }
            }
          }
        } finally {
          parser.close(writer);
        }
      } catch (Throwable th) {
        writer.fail();
        throw new HyracksDataException(th);
      } finally {
        writer.close();
        Thread.currentThread().setContextClassLoader(ctxCL);
      }
    }
  };
}
Example 9: readCSVFrameFromInputSplit
import org.apache.hadoop.mapred.InputFormat; // import the package/class this method depends on
protected final int readCSVFrameFromInputSplit( InputSplit split, InputFormat<LongWritable,Text> informat, JobConf job,
    FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first)
  throws IOException
{
  boolean hasHeader = _props.hasHeader();
  boolean isFill = _props.isFill();
  double dfillValue = _props.getFillValue();
  String sfillValue = String.valueOf(_props.getFillValue());
  String delim = _props.getDelim();

  //create record reader
  RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
  LongWritable key = new LongWritable();
  Text value = new Text();
  int row = rl;
  int col = -1;

  //handle header if existing
  if( first && hasHeader ) {
    reader.next(key, value); //read header
    dest.setColumnNames(value.toString().split(delim));
  }

  // Read the data
  boolean emptyValuesFound = false;
  try
  {
    while( reader.next(key, value) ) //foreach line
    {
      String cellStr = value.toString().trim();
      emptyValuesFound = false; col = 0;
      String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);

      //parse frame meta data (missing values / num distinct)
      if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) || parts[0].equals(TfUtils.TXMTD_NDPREFIX) ) {
        if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) )
          for( int j=0; j<dest.getNumColumns(); j++ )
            dest.getColumnMetadata(j).setMvValue(parts[j+1]);
        else if( parts[0].equals(TfUtils.TXMTD_NDPREFIX) )
          for( int j=0; j<dest.getNumColumns(); j++ )
            dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j+1]));
        continue;
      }

      for( String part : parts ) //foreach cell
      {
        part = part.trim();
        if ( part.isEmpty() ) {
          if( isFill && dfillValue!=0 )
            dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
          emptyValuesFound = true;
        }
        else {
          dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
        }
        col++;
      }

      //sanity checks for empty values and number of columns
      IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
      IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
      row++;
    }
  }
  finally {
    IOUtilFunctions.closeSilently(reader);
  }

  return row;
}