Java FileSplit類代碼示例

本文整理匯總了Java中org.apache.hadoop.mapred.FileSplit類的典型用法代碼示例。如果您正苦於以下問題：Java FileSplit類的具體用法？Java FileSplit怎麼用？Java FileSplit使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。

FileSplit類屬於org.apache.hadoop.mapred包，在下文中一共展示了FileSplit類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Java代碼示例。

示例1: getRowGroupNumbersFromFileSplit

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Collects the indexes of the Parquet row groups that fall inside the given file split.
 *
 * <p>A row group is selected when the first data page offset of its first column lies in the
 * half-open range {@code [split.getStart(), split.getStart() + split.getLength())}. The indexes
 * are positions in {@code footer.getBlocks()} and are returned in ascending order. This is
 * intended to match how Hive's Parquet input format assigns row groups to an input split.
 *
 * @param split  the file split whose byte range is tested
 * @param footer the Parquet footer that supplies the row group metadata
 * @return the indexes of the row groups that start inside the split; empty when none do
 * @throws IOException declared by the signature for callers
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> rowGroups = footer.getBlocks();

  final long lowerBound = split.getStart();
  final long upperBound = lowerBound + split.getLength();

  final List<Integer> selected = Lists.newArrayList();
  for (int index = 0; index < rowGroups.size(); index++) {
    final long offset = rowGroups.get(index).getColumns().get(0).getFirstDataPageOffset();
    if (offset >= lowerBound && offset < upperBound) {
      selected.add(index);
    }
  }

  return selected;
}

開發者ID:skhalifa，項目名稱:QDrill，代碼行數:25，代碼來源:HiveDrillNativeScanBatchCreator.java

示例2: HiveVectorizedReaderSetting

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
  this.hiveReaderConfig = hiveReaderConfig;

  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }

  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );

  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}

開發者ID:yahoojapan，項目名稱:multiple-dimension-spread，代碼行數:21，代碼來源:HiveVectorizedReaderSetting.java

示例3: getRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Opens the file behind the given split and returns a record reader over the split's byte range.
 *
 * <p>A vectorized reader is returned when the Hive reader setting reports vector mode,
 * otherwise a line reader. The returned reader receives the opened stream. If anything fails
 * before a reader has been created, the stream is closed here so it does not leak.
 *
 * @param split    the input split; must be a {@link FileSplit}
 * @param job      the job configuration used to resolve the file system and reader settings
 * @param reporter the Hadoop reporter, wrapped for status updates
 * @return a record reader over the split
 * @throws IOException if the file cannot be inspected or opened, or reader setup fails
 */
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  // getFileStatus().getLen() is the documented replacement for the deprecated FileSystem#getLength(Path).
  long fileLength = fs.getFileStatus( path ).getLen();
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );
  // Stays false until a reader has been constructed around the stream.
  boolean handedOff = false;
  try{
    IJobReporter jobReporter = new HadoopJobReporter( reporter );
    jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
    HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
    RecordReader<NullWritable,ColumnAndIndex> reader;
    if ( hiveConfig.isVectorMode() ){
      IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
      reader = (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
    }
    else{
      reader = new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
    }
    handedOff = true;
    return reader;
  }
  finally{
    if( ! handedOff ){
      try{
        in.close();
      }
      catch( IOException ignored ){
        // The failure that brought us here is the one the caller needs to see.
      }
    }
  }
}

開發者ID:yahoojapan，項目名稱:multiple-dimension-spread，代碼行數:21，代碼來源:MDSHiveLineInputFormat.java

示例4: DelimitedAndFixedWidthRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split)
		throws IOException {
	lengthsAndDelimiters = DelimitedAndFixedWidthHelper
			.modifyIdentifier(conf.get("lengthsAndDelimiters").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR));
	lengthsAndDelimitersType = conf.get("lengthsAndDelimitersType").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
	quote = conf.get("quote");
	charsetName = conf.get("charsetName");
	start = split.getStart();
	pos = start;
	end = start + split.getLength();
	file = split.getPath();
	fs = file.getFileSystem(conf);
	fileIn = fs.open(split.getPath());
	fileIn.seek(start);
	inputStreamReader = new InputStreamReader(fileIn, charsetName);
	singleChar = new char[1];
	stringBuilder = new StringBuilder();
	isQuotePresent = isQuotePresent(quote);
}

開發者ID:capitalone，項目名稱:Hydrograph，代碼行數:20，代碼來源:DelimitedAndFixedWidthRecordReader.java

示例5: StreamXmlRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
                             JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);

  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");

  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;

  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}

開發者ID:naver，項目名稱:hadoop，代碼行數:19，代碼來源:StreamXmlRecordReader.java

示例6: getRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Picks a record reader by sniffing the first three bytes of the split's file.
 *
 * <p>Files that begin with the bytes {@code 'S','E','Q'} are read with the sequence file input
 * format. Everything else, including files shorter than three bytes, is read as text. Exactly
 * one reader is created per call. The previous version created a text reader on
 * {@link EOFException} and then replaced it with a second one, leaving the first unclosed.
 *
 * @param split    the input split; must be a {@link FileSplit}
 * @param job      the job configuration
 * @param reporter the reporter passed to the delegate input format
 * @return the reader produced by the sequence file or text input format
 * @throws IOException if the file cannot be opened or read, or the delegate fails
 */
public RecordReader getRecordReader(InputSplit split, JobConf job,
  Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  byte[] header = new byte[3];
  boolean isSequenceFile = false;
  FSDataInputStream is = fs.open(fileSplit.getPath());
  try {
    is.readFully(header);
    isSequenceFile = header[0] == 'S' && header[1] == 'E' && header[2] == 'Q';
  } catch (EOFException eof) {
    // Fewer than three bytes available: cannot carry the magic, so treat the file as text.
  } finally {
    is.close();
  }
  if (isSequenceFile) {
    return seqFileInputFormat.getRecordReader(split, job, reporter);
  }
  return textInputFormat.getRecordReader(split, job, reporter);
}

開發者ID:naver，項目名稱:hadoop，代碼行數:22，代碼來源:AutoInputFormat.java

示例7: FileSplitParquetRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Creates a record reader for one file split of a Parquet file.
 *
 * <p>The constructor only stores its arguments in the corresponding fields.
 *
 * @param oContext              the operator context
 * @param readerFactory         the factory kept for creating Parquet readers
 * @param columnsToRead         the columns to read
 * @param groupScanColumns      the columns of the group scan
 * @param conditions            the filter conditions
 * @param fileSplit             the file split this reader covers
 * @param footer                the Parquet footer of the file
 * @param jobConf               the job configuration
 * @param vectorize             the vectorize flag
 * @param enableDetailedTracing the detailed tracing flag
 */
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  // Execution environment.
  this.oContext = oContext;
  this.readerFactory = readerFactory;
  this.jobConf = jobConf;

  // What to read.
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;

  // Flags.
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}

開發者ID:dremio，項目名稱:dremio-oss，代碼行數:24，代碼來源:FileSplitParquetRecordReader.java

示例8: getRowGroupNumbersFromFileSplit

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Returns the positions of the row groups that start inside the given file split.
 *
 * <p>For every block in {@code footer.getBlocks()}, the first data page offset of the block's
 * first column is compared with the split's byte range. Blocks whose offset is at or after the
 * split start and before the split end (start + length) are reported by their position in the
 * block list. This is intended to match how Hive's Parquet input format maps row groups to an
 * input split.
 *
 * @param split  the file split that defines the byte range
 * @param footer the Parquet footer listing the row groups
 * @return the matching row group positions in ascending order; empty when none match
 * @throws IOException declared by the signature for callers
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> rowGroups = footer.getBlocks();

  final long rangeStart = split.getStart();
  final long rangeEnd = rangeStart + split.getLength();

  final List<Integer> matches = Lists.newArrayList();

  int position = -1;
  for (final BlockMetaData rowGroup : rowGroups) {
    position++;
    final long offset = rowGroup.getColumns().get(0).getFirstDataPageOffset();
    if (offset < rangeStart || offset >= rangeEnd) {
      continue;
    }
    matches.add(position);
  }

  return matches;
}

開發者ID:dremio，項目名稱:dremio-oss，代碼行數:25，代碼來源:FileSplitParquetRecordReader.java

示例9: getRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Creates the import record reader that matches the type of the input split.
 *
 * <p>A {@code CombineFileSplit} indicates the new export format, which includes a manifest
 * file. It is accepted only when the configured export format version equals
 * {@code ExportManifestRecordWriter.FORMAT_VERSION}. A {@code FileSplit} indicates the old data
 * pipeline format without a manifest and is read by path.
 *
 * @param inputSplit the split to read
 * @param job        the job configuration
 * @param reporter   the reporter passed to the combine file reader
 * @return a reader for the split
 * @throws IOException if the export format version does not match or the split type is
 *     neither {@code CombineFileSplit} nor {@code FileSplit}
 */
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  if (!(inputSplit instanceof CombineFileSplit)) {
    if (inputSplit instanceof FileSplit) {
      // Old data pipeline format: no manifest file, read straight from the split's path.
      final FileSplit legacySplit = (FileSplit) inputSplit;
      return new ImportRecordReader(job, legacySplit.getPath());
    }
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }

  // New export format with a manifest file: the declared format version must match.
  final int declaredVersion = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
  if (declaredVersion == ExportManifestRecordWriter.FORMAT_VERSION) {
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  }
  throw new IOException("Unknown version: " + job.get(DynamoDBConstants
      .EXPORT_FORMAT_VERSION));
}

開發者ID:awslabs，項目名稱:emr-dynamodb-connector，代碼行數:20，代碼來源:ImportRecordReaderFactory.java

示例10: IndexRRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Opens the segment file referenced by the given split for row traversal.
 *
 * <p>The split must start at offset 0. Paths rejected by
 * {@code SegmentHelper.checkSegmentByPath} are logged and skipped. When no segment descriptor
 * can be created for the path, the reader is also left without a segment.
 *
 * @param inputSplit    the split to read; must be a {@link FileSplit} starting at offset 0
 * @param configuration the configuration used to resolve the file system and included columns
 * @throws IOException declared by the signature for callers
 * @throws IllegalStateException if the split does not start at offset 0
 */
public IndexRRecordReader(InputSplit inputSplit, Configuration configuration) throws IOException {
    FileSplit split = (FileSplit) inputSplit;
    Preconditions.checkState(split.getStart() == 0, "Segment should not splited");

    Path segmentPath = split.getPath();
    // Hive may hand over a file on the local file system, so the file system is
    // resolved from the path's URI rather than taken from the default.
    FileSystem fs = FileSystem.get(segmentPath.toUri(), FileSystem.get(configuration).getConf());

    if (!SegmentHelper.checkSegmentByPath(segmentPath)) {
        LOG.warn("ignore " + segmentPath);
        return;
    }

    ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fs, segmentPath);
    IntegratedSegment.Fd descriptor = IntegratedSegment.Fd.create(segmentPath.toString(), opener);
    if (descriptor == null) {
        return;
    }

    segment = descriptor.open();
    offset = 0L;
    rowIterator = segment.rowTraversal().iterator();
    getIncludeColumns(configuration, segment);
}

開發者ID:shunfei，項目名稱:indexr，代碼行數:23，代碼來源:IndexRRecordReader.java

示例11: getSplits

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Computes the splits of the parent input format and wraps each one as a {@link FileSplit}.
 *
 * <p>Before delegating, the settings loaded from the job are initialised for Hive: the value
 * reader is set if absent, the target fields are derived from the column aliases, the read
 * resource is set, and {@code HiveUtils.init} is applied. Each parent split is then wrapped in
 * an {@code EsHiveSplit} together with the table location path.
 *
 * @param job       the job configuration
 * @param numSplits the split count hint passed to the parent
 * @return one {@code EsHiveSplit} per parent split, in the parent's order
 * @throws IOException if the parent fails to compute splits
 */
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // first, merge input table properties (since there's no access to them ...)
    final Settings settings = HadoopSettingsManager.loadFrom(job);
    //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

    final Log log = LogFactory.getLog(getClass());

    // initialise the Hive-specific parts of the settings
    InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
    final String targetFields = StringUtils.concatenateAndUriEncode(HiveUtils.columnToAlias(settings), ",");
    settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS, targetFields);
    // set read resource
    settings.setResourceRead(settings.getResourceRead());
    HiveUtils.init(settings, log);

    // wrap every parent split as a FileSplit tied to the table location
    final InputSplit[] delegateSplits = super.getSplits(job, numSplits);
    final FileSplit[] hiveSplits = new FileSplit[delegateSplits.length];
    final Path tableLocation = new Path(job.get(HiveConstants.TABLE_LOCATION));
    int index = 0;
    while (index < hiveSplits.length) {
        hiveSplits[index] = new EsHiveSplit(delegateSplits[index], tableLocation);
        index++;
    }
    return hiveSplits;
}

開發者ID:xushjie1987，項目名稱:es-hadoop-v2.2.0，代碼行數:24，代碼來源:EsHiveInputFormat.java

示例12: testOrcDataStream

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Writes a test file with the ORC output format and reads it back through
 * {@code OrcPageSourceFactory}.
 *
 * <p>The temporary file is reserved and deleted up front so that only its path is handed to
 * {@code createTestFile}. It is deleted again when the test finishes.
 */
@Test
public void testOrcDataStream()
        throws Exception
{
    HiveOutputFormat<?, ?> orcOutput = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
    InputFormat<?, ?> orcInput = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
    @SuppressWarnings("deprecation")
    SerDe orcSerde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();

    File tempFile = File.createTempFile("presto_test", "orc");
    tempFile.delete();
    try {
        String tempPath = tempFile.getAbsolutePath();
        FileSplit testSplit = createTestFile(tempPath, orcOutput, orcSerde, null, TEST_COLUMNS, NUM_ROWS);
        testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), testSplit, orcInput, orcSerde, TEST_COLUMNS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
    }
}

開發者ID:y-lan，項目名稱:presto，代碼行數:20，代碼來源:TestHiveFileFormats.java

示例13: testRcTextPageSource

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Writes a test file with the RCFile output format and {@code ColumnarSerDe}, then reads it
 * back through {@code RcFilePageSourceFactory}. The test is currently disabled.
 *
 * <p>The temporary file is reserved and deleted up front so that only its path is handed to
 * {@code createTestFile}. It is deleted again when the test finishes.
 */
@Test(enabled = false)
public void testRcTextPageSource()
        throws Exception
{
    HiveOutputFormat<?, ?> rcOutput = new RCFileOutputFormat();
    InputFormat<?, ?> rcInput = new RCFileInputFormat<>();
    @SuppressWarnings("deprecation")
    SerDe columnarSerde = new ColumnarSerDe();

    // NOTE(review): the suffix is "rc-binary" although this test is named RcText - confirm it is intended.
    File tempFile = File.createTempFile("presto_test", "rc-binary");
    tempFile.delete();
    try {
        String tempPath = tempFile.getAbsolutePath();
        FileSplit testSplit = createTestFile(tempPath, rcOutput, columnarSerde, null, TEST_COLUMNS, NUM_ROWS);
        testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), testSplit, rcInput, columnarSerde, TEST_COLUMNS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        tempFile.delete();
    }
}

開發者ID:y-lan，項目名稱:presto，代碼行數:20，代碼來源:TestHiveFileFormats.java

示例14: getSplits

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Computes the parent's splits and wraps every {@link FileSplit} among them in a
 * {@code RemoteParForColocatedFileSplit}.
 *
 * <p>Each wrapper receives the partitioning file name and the block length read from the job.
 * The block length is the number of rows for row-wise partition formats and the number of
 * columns otherwise. Splits that are not {@code FileSplit} instances are passed through
 * unchanged.
 *
 * @param job       the job configuration holding the partitioning information
 * @param numSplits the split count hint passed to the parent
 * @return the wrapped splits, in the parent's order
 * @throws IOException if the parent fails to compute splits
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException
{
	final InputSplit[] baseSplits = super.getSplits(job, numSplits);

	//get partitioning information
	final MatrixCharacteristics matrixSize = MRJobConfiguration.getPartitionedMatrixSize(job);
	final PDataPartitionFormat dataFormat = MRJobConfiguration.getPartitioningFormat(job);
	final PartitionFormat partitionFormat = new PartitionFormat(dataFormat, -1);
	final int blockLength = (int) (partitionFormat.isRowwise()
		? partitionFormat.getNumRows(matrixSize) : partitionFormat.getNumColumns(matrixSize));
	final String partitionFile = MRJobConfiguration.getPartitioningFilename(job);

	//create wrapper splits; anything that is not a FileSplit is passed through as-is
	final InputSplit[] wrapped = new InputSplit[ baseSplits.length ];
	for( int pos = 0; pos < baseSplits.length; pos++ ) {
		final InputSplit current = baseSplits[pos];
		wrapped[pos] = ( current instanceof FileSplit )
			? new RemoteParForColocatedFileSplit( (FileSplit) current, partitionFile, blockLength )
			: current;
	}
	return wrapped;
}

開發者ID:apache，項目名稱:systemml，代碼行數:24，代碼來源:RemoteParForColocatedNLineInputFormat.java

示例15: LineDocRecordReader

import org.apache.hadoop.mapred.FileSplit; //導入依賴的package包/類
/**
 * Creates a reader over the byte range described by {@code split}.
 *
 * <p>The split's file is opened and wrapped in a buffered stream. When the split does not start
 * at offset 0, the stream is positioned one byte before the split start and everything up to
 * and including the next {@code EOL} is discarded. The effective start is then advanced by the
 * amount {@code readData} reports. If positioning or skipping fails, the opened stream is
 * closed before the exception propagates.
 *
 * @param job   the configuration used to resolve the file system of the split's path
 * @param split the file split that supplies the path, start offset and length
 * @throws IOException if the file cannot be opened or positioned, or the first line cannot be skipped
 */
public LineDocRecordReader(Configuration job, FileSplit split)
    throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  // Stays false until the stream is positioned and the partial first line is skipped.
  boolean initialized = false;
  try {
    boolean skipFirstLine = false;
    if (start != 0) {
      skipFirstLine = true; // wait till BufferedInputStream to skip
      --start;
      fileIn.seek(start);
    }

    this.in = new BufferedInputStream(fileIn);
    if (skipFirstLine) { // skip first line and re-establish "start".
      start += LineDocRecordReader.readData(this.in, null, EOL);
    }
    initialized = true;
  } finally {
    if (!initialized) {
      try {
        fileIn.close();
      } catch (IOException ignored) {
        // The failure that brought us here is the one the caller needs to see.
      }
    }
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}

開發者ID:Nextzero，項目名稱:hadoop-2.6.0-cdh5.4.3，代碼行數:32，代碼來源:LineDocRecordReader.java

注：本文中的org.apache.hadoop.mapred.FileSplit類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。