当前位置: 首页>>代码示例>>Java>>正文


Java InputSplit类代码示例

本文整理汇总了Java中org.apache.hadoop.mapred.InputSplit的典型用法代码示例。如果您正苦于以下问题：Java InputSplit类的具体用法？Java InputSplit怎么用？Java InputSplit使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


InputSplit类属于org.apache.hadoop.mapred包,在下文中一共展示了InputSplit类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getRecordReader

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Creates the record reader for the file behind the given split.
 *
 * <p>The split is cast to {@link FileSplit}, its file is opened, and the stream is handed to
 * either the vectorized reader or the line reader depending on
 * {@code hiveConfig.isVectorMode()}. If anything fails after the stream has been opened, the
 * stream is closed before the exception is propagated so the file handle is not leaked.
 *
 * @param split the split to read; must be a {@link FileSplit}
 * @param job the job configuration used to resolve the file system and the reader settings
 * @param reporter the Hadoop reporter, wrapped in a {@code HadoopJobReporter}
 * @return a record reader over the split
 * @throws IOException if the file cannot be inspected or opened, or a reader cannot be created
 */
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  // getFileStatus().getLen() is the documented replacement for the deprecated getLength().
  long fileLength = fs.getFileStatus( path ).getLen();
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  final InputStream in = fs.open( path );
  try {
    IJobReporter jobReporter = new HadoopJobReporter( reporter );
    jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
    HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
    if ( hiveConfig.isVectorMode() ){
      IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
      return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
    }
    return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
  } catch ( IOException | RuntimeException e ) {
    // No reader took ownership of the stream, so release it here.
    try {
      in.close();
    } catch ( IOException closeFailure ) {
      e.addSuppressed( closeFailure );
    }
    throw e;
  }
}
 
开发者ID:yahoojapan,项目名称:multiple-dimension-spread,代码行数:21,代码来源:MDSHiveLineInputFormat.java

示例2: HiveTextRecordReader

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Builds a text record reader for one Hive input split.
 *
 * <p>The field delimiter is the first byte of the table's {@code field.delim} SerDe parameter.
 * When the parameter is missing or empty the delimiter falls back to byte value 1; an empty
 * value previously caused an {@code ArrayIndexOutOfBoundsException}. The parameter is encoded
 * as UTF-8 so the chosen byte does not depend on the platform default charset.
 *
 * <p>After the delimiter is set, the indexes of the table columns contained in
 * {@code selectedColumnNames} are collected into {@code columnIds}, and {@code numCols} is set
 * to the number of table columns.
 *
 * @param table the Hive table being read
 * @param partition the partition passed through to the superclass constructor
 * @param inputSplit the split passed through to the superclass constructor
 * @param projectedColumns the projected columns passed through to the superclass constructor
 * @param context the fragment context passed through to the superclass constructor
 * @throws ExecutionSetupException if the superclass constructor fails
 */
public HiveTextRecordReader(Table table, Partition partition, InputSplit inputSplit, List<SchemaPath> projectedColumns, FragmentContext context) throws ExecutionSetupException {
  super(table, partition, inputSplit, projectedColumns, context, null);
  final String delimiterText = table.getSd().getSerdeInfo().getParameters().get("field.delim");
  if (delimiterText != null && !delimiterText.isEmpty()) {
    delimiter = delimiterText.getBytes(java.nio.charset.StandardCharsets.UTF_8)[0];
  } else {
    delimiter = (byte) 1;
  }
  assert delimiter > 0;
  final List<Integer> ids = Lists.newArrayList();
  for (int i = 0; i < tableColumns.size(); i++) {
    if (selectedColumnNames.contains(tableColumns.get(i))) {
      ids.add(i);
    }
  }
  columnIds = ids;
  numCols = tableColumns.size();
}
 
开发者ID:skhalifa,项目名称:QDrill,代码行数:19,代码来源:HiveTextRecordReader.java

示例3: getSpecificScan

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Builds the sub-scan for one minor fragment from the input splits mapped to it.
 *
 * <p>Each split contributes its serialized form, its class name, and a {@code HivePartition}
 * built from {@code partitionMap} (or {@code null} when the map has no entry for the split).
 * If any split had no partition, the partition list handed to the read entry is {@code null}.
 *
 * @param minorFragmentId key into {@code mappings} selecting the splits to scan
 * @return a {@code HiveSubScan} over the selected splits
 * @throws ExecutionSetupException wrapping any I/O or reflection failure during setup
 */
@Override
public SubScan getSpecificScan(final int minorFragmentId) throws ExecutionSetupException {
  try {
    final List<InputSplit> assignedSplits = mappings.get(minorFragmentId);
    final List<HivePartition> partitions = Lists.newArrayList();
    final List<String> serializedSplits = Lists.newArrayList();
    final List<String> splitClassNames = Lists.newArrayList();

    for (final InputSplit current : assignedSplits) {
      partitions.add(
          partitionMap.get(current) != null ? new HivePartition(partitionMap.get(current)) : null);
      serializedSplits.add(serializeInputSplit(current));
      splitClassNames.add(current.getClass().getName());
    }

    // A single split without a partition means no partition list is passed on at all.
    final List<HivePartition> partitionsOrNull = partitions.contains(null) ? null : partitions;

    final HiveReadEntry readEntry =
        new HiveReadEntry(hiveReadEntry.table, partitionsOrNull, hiveReadEntry.hiveConfigOverride);
    return new HiveSubScan(getUserName(), serializedSplits, readEntry, splitClassNames, columns);
  } catch (IOException | ReflectiveOperationException e) {
    throw new ExecutionSetupException(e);
  }
}
 
开发者ID:skhalifa,项目名称:QDrill,代码行数:27,代码来源:HiveScan.java

示例4: getScanStats

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Estimates the scan statistics from the input splits.
 *
 * <p>The data size is the sum of all split lengths. The row count is {@code rowCount}, or
 * {@code dataSize / 1024} when {@code rowCount} is 0.
 *
 * @return scan statistics flagged as not having an exact row count
 * @throws DrillRuntimeException wrapping an {@link IOException} raised while reading split lengths
 */
@Override
public ScanStats getScanStats() {
  try {
    long totalBytes = 0;
    for (final InputSplit split : inputSplits) {
      totalBytes += split.getLength();
    }

    // A rowCount of 0 can mean the statistics were never computed, so estimate from the size.
    final long estimatedRows = (rowCount == 0) ? totalBytes / 1024 : rowCount;

    // Hive's native reader is neither memory efficient nor fast. The CPU cost is scaled
    // by a factor to let the planner choose HiveDrillNativeScan over HiveScan with SerDes.
    final float cpuCost = getSerDeOverheadFactor();

    logger.debug("estimated row count = {}, stats row count = {}", estimatedRows, rowCount);
    return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, estimatedRows, cpuCost, totalBytes);
  } catch (final IOException e) {
    throw new DrillRuntimeException(e);
  }
}
 
开发者ID:skhalifa,项目名称:QDrill,代码行数:25,代码来源:HiveScan.java

示例5: readUsingRecordReader

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Read from Geode, using MonarchRecordReader, all the records from the provided split.
 * The split contains the range of records to be read by the record reader. It
 * returns the total number of records read by this method.
 *
 * <p>Once the reader has been initialized it is always closed, even when reading fails
 * part-way. An {@link IOException} is printed and the number of records read so far is returned.
 *
 * @param conf       the reader configuration -- must have the region name
 * @param split      the input-split containing the records to be read
 * @param predicates the predicates to filter out unwanted results
 * @return the total number of records read
 */
private long readUsingRecordReader(final Configuration conf, final InputSplit split,
                                   final Filter... predicates) {
  MonarchRecordReader mrr = new MonarchRecordReader(conf);
  FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
  for (final Filter predicate : predicates) {
    filterList.addFilter(predicate);
  }
  mrr.pushDownfilters = filterList;
  long size = 0;
  try {
    mrr.initialize(split, conf);
    try {
      Writable key = mrr.createKey();
      Writable value = mrr.createValue();
      while (mrr.next(key, value)) {
        ++size;
      }
    } finally {
      // Release the reader even when next() throws.
      mrr.close();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return size;
}
 
开发者ID:ampool,项目名称:monarch,代码行数:33,代码来源:MonarchRecordReaderTest.java

示例6: readFields

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * @throws IOException If the child InputSplit cannot be read, typically
 *                     for failing access checks.
 */
@SuppressWarnings("unchecked")  // Generic array assignment
public void readFields(DataInput in) throws IOException {
  final int count = WritableUtils.readVInt(in);
  if (splits == null || splits.length != count) {
    splits = new InputSplit[count];
  }

  // All class names are read first; the per-split payloads follow in the same order.
  final Class<? extends InputSplit>[] splitTypes = new Class[count];
  try {
    for (int idx = 0; idx < count; ++idx) {
      final String typeName = Text.readString(in);
      splitTypes[idx] = Class.forName(typeName).asSubclass(InputSplit.class);
    }
  } catch (ClassNotFoundException e) {
    throw new IOException("Failed split init", e);
  }

  for (int idx = 0; idx < count; ++idx) {
    splits[idx] = ReflectionUtils.newInstance(splitTypes[idx], null);
    splits[idx].readFields(in);
  }
}
 
开发者ID:naver,项目名称:hadoop,代码行数:26,代码来源:CompositeInputSplit.java

示例7: getSample

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * From each split sampled, take the first numSamples / numSplits records.
 *
 * <p>Returns an empty sample when there is nothing to sample (the input format produced no
 * splits, or {@code maxSplitsSampled} is not positive) instead of failing with a division by
 * zero. Each record reader is closed even when reading from it throws.
 *
 * @param inf the input format that supplies the splits and record readers
 * @param job the job configuration; its map-task count is passed as the split hint
 * @return the sampled keys, in the order they were read
 * @throws IOException if the splits cannot be computed or a record reader fails
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  if (splitsToSample <= 0) {
    // Nothing to sample; both divisions below would otherwise divide by zero.
    return (K[])samples.toArray();
  }
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    try {
      K key = reader.createKey();
      V value = reader.createValue();
      while (reader.next(key, value)) {
        samples.add(key);
        // A fresh key object is needed because the previous one is now stored in samples.
        key = reader.createKey();
        ++records;
        if ((i+1) * samplesPerSplit <= records) {
          break;
        }
      }
    } finally {
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
 
开发者ID:naver,项目名称:hadoop,代码行数:29,代码来源:InputSampler.java

示例8: testDBInputFormat

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Exercises DBInputFormat: asks for three splits and checks the split count and the length of
 * the first split, then checks the initial state of a record reader.
 *
 * @throws Exception if driver setup, split computation, or the reader fails
 */
@Test(timeout = 10000)
public void testDBInputFormat() throws Exception {
  final JobConf conf = new JobConf();
  setupDriver(conf);

  final DBInputFormat<NullDBWritable> inputFormat = new DBInputFormat<NullDBWritable>();
  // NOTE(review): setConf is invoked twice in a row; the second call looks redundant -- confirm.
  inputFormat.setConf(conf);
  inputFormat.setConf(conf);

  final DBInputFormat.DBInputSplit singleSplit = new DBInputFormat.DBInputSplit(1, 10);
  final Reporter mockReporter = mock(Reporter.class);
  final RecordReader<LongWritable, NullDBWritable> recordReader =
      inputFormat.getRecordReader(singleSplit, conf, mockReporter);

  // Split generation: three map tasks requested.
  conf.setInt(MRJobConfig.NUM_MAPS, 3);
  final InputSplit[] generatedSplits = inputFormat.getSplits(conf, 3);
  assertEquals(5, generatedSplits[0].getLength());
  assertEquals(3, generatedSplits.length);

  // Reader: a few simple checks on a reader that has not read anything yet.
  assertEquals(LongWritable.class, recordReader.createKey().getClass());
  assertEquals(0, recordReader.getPos());
  assertEquals(0, recordReader.getProgress(), 0.001);
  recordReader.close();
}
 
开发者ID:naver,项目名称:hadoop,代码行数:29,代码来源:TestDBInputFormat.java

示例9: getRecordReader

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Picks a record reader by sniffing the first three bytes of the split's file.
 *
 * <p>A file that starts with the bytes {@code 'S','E','Q'} is read with
 * {@code seqFileInputFormat}; everything else, including files shorter than three bytes, is
 * read with {@code textInputFormat}. Exactly one reader is created per call. Previously a
 * short file created a text reader in the {@code EOFException} handler and then a second one
 * afterwards, leaving the first one unclosed.
 *
 * @param split the split to read; must be a {@link FileSplit}
 * @param job the job configuration
 * @param reporter the reporter passed on to the delegate input format
 * @return the record reader of the delegate input format
 * @throws IOException if the file cannot be opened or read, or the delegate fails
 */
public RecordReader getRecordReader(InputSplit split, JobConf job,
  Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  byte[] header = new byte[3];
  boolean headerComplete = false;
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
    headerComplete = true;
  } catch (EOFException ignored) {
    // Fewer than three bytes in the file: it cannot carry the SEQ magic, treat it as text.
  } finally {
    is.close();
  }
  if (headerComplete && header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}
 
开发者ID:naver,项目名称:hadoop,代码行数:22,代码来源:AutoInputFormat.java

示例10: readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  final JobConf job = new JobConf(defaultConf);

  // Locate the encrypted workbook on the test classpath and register it as the job input.
  final String workbookName = "excel2013encrypt.xlsx";
  final String workbookLocation = getClass().getClassLoader().getResource(workbookName).getFile();
  FileInputFormat.setInputPaths(job, new Path(workbookLocation));

  // Locale of the test data.
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // Low-footprint reading mode.
  job.set("hadoopoffice.read.lowFootprint", "true");
  // Decryption password; the final assertion expects it to be rejected.
  job.set("hadoopoffice.read.security.crypt.password", "test2");

  final ExcelFileInputFormat excelFormat = new ExcelFileInputFormat();
  excelFormat.configure(job);

  final InputSplit[] splits = excelFormat.getSplits(job, 1);
  assertEquals(1, splits.length, "Only one split generated for Excel file");

  final RecordReader<Text, ArrayWritable> recordReader =
      excelFormat.getRecordReader(splits[0], job, reporter);
  assertNull(recordReader, "Null record reader implies invalid password");
}
 
开发者ID:ZuInnoTe,项目名称:hadoopoffice,代码行数:22,代码来源:OfficeFormatHadoopExcelTest.java

示例11: testOpenClose

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
@Test
public void testOpenClose() throws Exception {
	// The mocked format hands out the mocked reader for any split/conf/reporter combination.
	final DummyInputFormat mockedFormat = mock(DummyInputFormat.class);
	final DummyRecordReader mockedReader = mock(DummyRecordReader.class);
	when(mockedFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
		.thenReturn(mockedReader);

	final HadoopInputFormat<String, Long> wrapper =
		new HadoopInputFormat<>(mockedFormat, String.class, Long.class, new JobConf());

	// open() must obtain exactly one reader and create one key and one value from it.
	wrapper.open(getHadoopInputSplit());
	verify(mockedFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
	verify(mockedReader, times(1)).createKey();
	verify(mockedReader, times(1)).createValue();

	assertThat(wrapper.fetched, is(false));

	// close() must close the underlying reader exactly once.
	wrapper.close();
	verify(mockedReader, times(1)).close();
}
 
开发者ID:axbaretto,项目名称:flink,代码行数:19,代码来源:HadoopInputFormatTest.java

示例12: readExcelInputFormatExcel2013SingleSheetEncryptedNegative

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException {
  final JobConf job = new JobConf(defaultConf);

  // Locate the encrypted workbook on the test classpath and register it as the job input.
  final String workbookName = "excel2013encrypt.xlsx";
  final String workbookLocation = getClass().getClassLoader().getResource(workbookName).getFile();
  FileInputFormat.setInputPaths(job, new Path(workbookLocation));

  // Locale of the test data.
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // Decryption password; the final assertion expects it to be rejected.
  job.set("hadoopoffice.read.security.crypt.password", "test2");

  final ExcelFileInputFormat excelFormat = new ExcelFileInputFormat();
  excelFormat.configure(job);

  final InputSplit[] splits = excelFormat.getSplits(job, 1);
  assertEquals(1, splits.length, "Only one split generated for Excel file");

  final RecordReader<Text, ArrayWritable> recordReader =
      excelFormat.getRecordReader(splits[0], job, reporter);
  assertNull(recordReader, "Null record reader implies invalid password");
}
 
开发者ID:ZuInnoTe,项目名称:hadoopoffice,代码行数:20,代码来源:OfficeFormatHadoopExcelTest.java

示例13: readEthereumBlockInputFormatBlock1346406Bzip2Compressed

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
@Test
public void readEthereumBlockInputFormatBlock1346406Bzip2Compressed() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  final JobConf job = new JobConf(defaultConf);

  // Register the test file (testdata/eth1346406.bin.bz2 on the test classpath) as job input.
  final String blockFileName = "eth1346406.bin.bz2";
  final String blockFileLocation =
      getClass().getClassLoader().getResource("testdata/" + blockFileName).getFile();
  FileInputFormat.setInputPaths(job, new Path(blockFileLocation));

  final EthereumBlockFileInputFormat blockFormat = new EthereumBlockFileInputFormat();
  blockFormat.configure(job);

  final InputSplit[] splits = blockFormat.getSplits(job, 1);
  assertEquals(1, splits.length, "Only one split generated for genesis block");

  final RecordReader<BytesWritable, EthereumBlock> blockReader =
      blockFormat.getRecordReader(splits[0], job, reporter);
  assertNotNull(blockReader, "Format returned  null RecordReader");

  // Exactly one block with six transactions is expected, then end of input.
  final BytesWritable key = new BytesWritable();
  final EthereumBlock block = new EthereumBlock();
  assertTrue(blockReader.next(key, block), "Input Split for block 1346406 contains at least one block");
  assertEquals(6, block.getEthereumTransactions().size(), "Block 1346406 must have 6 transactions");
  assertFalse(blockReader.next(key, block), "No further blocks in block 1346406");
  blockReader.close();
}
 
开发者ID:ZuInnoTe,项目名称:hadoopcryptoledger,代码行数:25,代码来源:EthereumFormatHadoopTest.java

示例14: getRecordReader

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
/**
 * Chooses the import record reader that matches the type of the input split.
 *
 * @param inputSplit the split to read; a {@code CombineFileSplit} or a {@code FileSplit}
 * @param job the job configuration; supplies the export format version for combined splits
 * @param reporter the reporter handed to the combined-file reader
 * @return a reader for the split
 * @throws IOException if the export format version does not match the expected one, or the
 *     split is of neither supported type
 */
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    final int exportVersion = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (exportVersion == ExportManifestRecordWriter.FORMAT_VERSION) {
      return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
    }
    throw new IOException("Unknown version: " + job.get(DynamoDBConstants
        .EXPORT_FORMAT_VERSION));
  }

  // FileSplit indicates the old data pipeline format which doesn't include a manifest file
  if (inputSplit instanceof FileSplit) {
    final Path exportPath = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, exportPath);
  }

  throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
      + " " + inputSplit.getClass());
}
 
开发者ID:awslabs,项目名称:emr-dynamodb-connector,代码行数:20,代码来源:ImportRecordReaderFactory.java

示例15: readEthereumBlockInputFormatBlock1

import org.apache.hadoop.mapred.InputSplit; //导入依赖的package包/类
@Test
public void readEthereumBlockInputFormatBlock1() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  final JobConf job = new JobConf(defaultConf);

  // Register the test file (testdata/eth1.bin on the test classpath) as job input.
  final String blockFileName = "eth1.bin";
  final String blockFileLocation =
      getClass().getClassLoader().getResource("testdata/" + blockFileName).getFile();
  FileInputFormat.setInputPaths(job, new Path(blockFileLocation));

  final EthereumBlockFileInputFormat blockFormat = new EthereumBlockFileInputFormat();
  blockFormat.configure(job);

  final InputSplit[] splits = blockFormat.getSplits(job, 1);
  assertEquals(1, splits.length, "Only one split generated for genesis block");

  final RecordReader<BytesWritable, EthereumBlock> blockReader =
      blockFormat.getRecordReader(splits[0], job, reporter);
  assertNotNull(blockReader, "Format returned  null RecordReader");

  // Exactly one block without transactions is expected, then end of input.
  final BytesWritable key = new BytesWritable();
  final EthereumBlock block = new EthereumBlock();
  assertTrue(blockReader.next(key, block), "Input Split for block 1 contains at least one block");
  assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
  assertFalse(blockReader.next(key, block), "No further blocks in block 1");
  blockReader.close();
}
 
开发者ID:ZuInnoTe,项目名称:hadoopcryptoledger,代码行数:23,代码来源:EthereumFormatHadoopTest.java


注:本文中的org.apache.hadoop.mapred.InputSplit类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。