This article collects typical usage examples of the Java class org.apache.hadoop.mapred.InputFormat. If you are wondering what the InputFormat class is for, how to use it, or where to find examples of it in practice, the curated class code examples below may help.
The InputFormat class belongs to the org.apache.hadoop.mapred package. Fifteen code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: testInputFormat
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
  final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
  job.setInputFormat(clazz);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);
  LOG.debug("submitting job.");
  final RunningJob run = JobClient.runJob(job);
  assertTrue("job failed!", run.isSuccessful());
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
  assertEquals("Saw any instances of the filtered out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
  assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
  assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
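The assertions above read job counters that the test's mapper (ExampleVerifier, not shown here) is expected to increment while scanning the table. As a rough, hypothetical illustration of that pattern in the old mapred API — a stand-in, not HBase's actual ExampleVerifier — a mapper can bump arbitrarily named counters through Reporter.incrCounter, and the driver reads them back exactly as testInputFormat does:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical stand-in for a verifying mapper: it emits nothing and only
// increments a counter per record, which the driver later reads via
// run.getCounters().findCounter(group, name).getCounter().
public class CountingVerifier extends MapReduceBase
    implements Mapper<LongWritable, Text, NullWritable, NullWritable> {

  @Override
  public void map(LongWritable offset, Text line,
      OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
      throws IOException {
    // Group and counter names are free-form strings; the test above uses
    // TestTableInputFormat.class.getName() + ":row" as the group name.
    reporter.incrCounter(CountingVerifier.class.getName() + ":row", "seen", 1);
  }
}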
Example 2: testAddInputPathWithMapper
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
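MapClass and MapClass2 are the test's own mappers; the point of MultipleInputs is that each input path carries its own InputFormat and Mapper. For context, a minimal hypothetical driver showing how such a configuration typically feeds a real job (paths and mapper bodies are made up for illustration):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

// Hypothetical driver: each input directory gets its own InputFormat and Mapper;
// MultipleInputs wires them together behind a delegating mapper/input format.
public class MultipleInputsDriver {

  // Mapper for plain text files read by TextInputFormat (offset/line pairs).
  public static class PlainMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable offset, Text line,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(new Text("plain"), line);
    }
  }

  // Mapper for tab-separated files read by KeyValueTextInputFormat (key/value pairs).
  public static class KvMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(key, value);
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(MultipleInputsDriver.class);
    conf.setJobName("multiple-inputs-demo");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setNumReduceTasks(0); // map-only, for brevity

    MultipleInputs.addInputPath(conf, new Path("/data/plain"), TextInputFormat.class, PlainMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/kv"), KeyValueTextInputFormat.class, KvMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

    JobClient.runJob(conf);
  }
}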
Example 3: getSample
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
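This is the SplitSampler strategy of the old-API InputSampler: take the first numSamples / splitsToSample keys from a subset of splits. Such a sampler is usually consumed by total-order sorting, where the sampled keys become the cut points for TotalOrderPartitioner. A hedged sketch of that wiring, with made-up paths and sample counts:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

// Hypothetical wiring: sample keys with a SplitSampler, write the partition file,
// and let TotalOrderPartitioner route contiguous key ranges to each reducer.
public class SamplerSetup {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SamplerSetup.class);
    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(4);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    FileInputFormat.setInputPaths(job, new Path("/data/in"));   // made-up path

    Path partitionFile = new Path("/tmp/_partitions");          // made-up path
    TotalOrderPartitioner.setPartitionFile(job, partitionFile);

    // 1000 samples drawn from at most 10 splits, matching the getSample() logic above.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.SplitSampler<Text, Text>(1000, 10);
    InputSampler.writePartitionFile(job, sampler);
    // ...then configure mapper/reducer classes and submit with JobClient.runJob(job).
  }
}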
Example 4: addDependencyJars
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
      job,
      // when making changes here, consider also mapreduce.TableMapReduceUtil
      // pull job classes
      job.getMapOutputKeyClass(),
      job.getMapOutputValueClass(),
      job.getOutputKeyClass(),
      job.getOutputValueClass(),
      job.getPartitionerClass(),
      job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
      job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
      job.getCombinerClass());
}
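The @see tag suggests this is the mapred-API counterpart of HBase's TableMapReduceUtil.addDependencyJars. It has to be called on a fully configured JobConf, because it reads the key/value, format, partitioner and combiner classes back out of the job. A heavily hedged sketch of a call site (the identity-mapper job is invented; only the addDependencyJars call mirrors the snippet above, which is assumed to live in org.apache.hadoop.hbase.mapred.TableMapReduceUtil):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical driver around the helper above; the job itself is a trivial
// identity pass-through whose only purpose is to show where the call goes.
public class DependencyJarsDriver {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DependencyJarsDriver.class);
    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    FileInputFormat.setInputPaths(job, new Path("/data/in"));
    FileOutputFormat.setOutputPath(job, new Path("/data/out"));

    // Must run after the classes above are set, since it reads them back from the job.
    org.apache.hadoop.hbase.mapred.TableMapReduceUtil.addDependencyJars(job);
    JobClient.runJob(job);
  }
}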
Example 5: getInputFormatFromCache
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
public static InputFormat<WritableComparable, Writable> getInputFormatFromCache(
    Class inputFormatClass, JobConf job) throws IOException {
  InputFormat<WritableComparable, Writable> instance = inputFormats.get(inputFormatClass);
  if (instance == null) {
    try {
      instance = (InputFormat<WritableComparable, Writable>) ReflectionUtil
          .newInstance(inputFormatClass, job);
      // HBase input formats are not thread safe today. See HIVE-8808.
      String inputFormatName = inputFormatClass.getName().toLowerCase();
      if (!inputFormatName.contains("hbase")) {
        inputFormats.put(inputFormatClass, instance);
      }
    } catch (Exception e) {
      throw new IOException("Cannot create an instance of InputFormat class "
          + inputFormatClass.getName() + " as specified in mapredWork!", e);
    }
  }
  return instance;
}
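The snippet relies on an inputFormats cache field and a ReflectionUtil helper that are not shown. A minimal sketch of what that surrounding state could look like, substituting Hadoop's standard ReflectionUtils for Hive's ReflectionUtil (an assumption, not Hive's exact code):

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical re-sketch of the caching pattern above: one InputFormat instance
// per class, except for HBase formats, which are instantiated fresh each time.
public final class InputFormatCache {
  private static final Map<Class<?>, InputFormat<WritableComparable, Writable>> inputFormats =
      new ConcurrentHashMap<>();

  @SuppressWarnings("unchecked")
  public static InputFormat<WritableComparable, Writable> get(Class<?> clazz, JobConf job)
      throws IOException {
    InputFormat<WritableComparable, Writable> instance = inputFormats.get(clazz);
    if (instance == null) {
      // ReflectionUtils also configures the instance against the JobConf.
      instance = (InputFormat<WritableComparable, Writable>) ReflectionUtils.newInstance(clazz, job);
      if (!clazz.getName().toLowerCase().contains("hbase")) {
        inputFormats.put(clazz, instance);
      }
    }
    return instance;
  }
}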
Example 6: getRecordReader
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
    final InputFormat<BytesWritable, BytesWritable> inputFormat,
    final JobConf jobConf) throws ExecutionSetupException {
  try {
    final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
    return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
      @Override
      public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
        return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
      }
    });
  } catch (IOException | InterruptedException e) {
    throw new ExecutionSetupException(
        String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
            split.getPath(), split.getStart(), split.getLength()), e);
  }
}
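Once the reader is obtained, it is consumed with the usual old-API pattern: create reusable key/value holders with createKey()/createValue() and iterate with next(). A small sketch of that consumption loop for the BytesWritable reader returned above:

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.RecordReader;

// Generic consumption loop for an old-API RecordReader, e.g. the one returned
// by getRecordReader() above. The key/value objects are reused across next() calls.
static long countRecords(RecordReader<BytesWritable, BytesWritable> reader) throws IOException {
  long count = 0;
  try {
    BytesWritable key = reader.createKey();
    BytesWritable value = reader.createValue();
    while (reader.next(key, value)) {
      count++; // inspect key/value here; their contents are overwritten on the next call
    }
  } finally {
    reader.close();
  }
  return count;
}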
Example 7: getInputFormatFromSD
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
/**
 * Get the input format from the given {@link StorageDescriptor}.
 * @param properties
 * @param hiveReadEntry
 * @param sd
 * @return {@link InputFormat} class, or null if a failure has occurred. Failures are logged as warnings.
 */
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
    final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
  final Table hiveTable = hiveReadEntry.getTable();
  try {
    final String inputFormatName = sd.getInputFormat();
    if (!Strings.isNullOrEmpty(inputFormatName)) {
      return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
    }
    final JobConf job = new JobConf(hiveConf);
    HiveUtilities.addConfToJob(job, properties);
    return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
  } catch (final Exception e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
Example 8: getInputFormatClass
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
/**
 * Utility method which gets the table or partition {@link InputFormat} class. It first
 * tries to get the class name from the given StorageDescriptor object. If that is not set, it tries to get it from
 * the StorageHandler class configured in the table properties. If neither is found, an exception is thrown.
 * @param job {@link JobConf} instance, needed in case the table is a StorageHandler-based table.
 * @param sd {@link StorageDescriptor} instance of the partition currently being read, or of the table (for non-partitioned tables).
 * @param table Table object
 * @throws Exception
 */
public static Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf job, final StorageDescriptor sd,
    final Table table) throws Exception {
  final String inputFormatName = sd.getInputFormat();
  if (Strings.isNullOrEmpty(inputFormatName)) {
    final String storageHandlerClass = table.getParameters().get(META_TABLE_STORAGE);
    if (Strings.isNullOrEmpty(storageHandlerClass)) {
      throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither " +
          "InputFormat class explicitly specified nor StorageHandler class");
    }
    final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(job, storageHandlerClass);
    return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
  } else {
    return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
  }
}
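A caller typically instantiates the returned class through Hadoop's ReflectionUtils and then asks it for splits. A hedged sketch of that follow-up step (the helper name and input path are illustrative only):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical follow-up: instantiate the resolved class and enumerate its splits.
static InputSplit[] listSplits(Class<? extends InputFormat<?, ?>> formatClass,
    JobConf job, String inputDir) throws Exception {
  FileInputFormat.setInputPaths(job, new Path(inputDir));
  InputFormat<?, ?> format = ReflectionUtils.newInstance(formatClass, job);
  // The second argument is only a hint; most formats derive splits from block layout.
  return format.getSplits(job, 1);
}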
Example 9: runImportRCFile
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
private void runImportRCFile(ExaIterator ctx, List<HCatTableColumn> columns, List<HCatTableColumn> partitionColumns, List<OutputColumnSpec> outputColumns, String file) throws Exception {
  List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
  serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));
  String inputFormatClassName = "org.apache.hadoop.hive.ql.io.RCFileInputFormat";
  String serDeClassName = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
  String hdfsUser = "hdfs";
  boolean useKerberos = false;
  List<String> hdfsServers = new ArrayList<>();
  hdfsServers.add("file:///");
  final Configuration conf = new Configuration();
  FileSystem fs = HdfsService.getFileSystem(hdfsServers, conf);
  InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatClassName);
  SerDe serDe = (SerDe) UdfUtils.getInstanceByName(serDeClassName);
  HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters, hdfsServers, hdfsUser, columns, outputColumns, useKerberos, ctx);
}
Example 10: HiveFileIterator
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
public HiveFileIterator(
        Path path,
        FileSystem fileSystem,
        DirectoryLister directoryLister,
        NamenodeStats namenodeStats,
        String partitionName,
        InputFormat<?, ?> inputFormat,
        Properties schema,
        List<HivePartitionKey> partitionKeys,
        TupleDomain<HiveColumnHandle> effectivePredicate)
{
    this.partitionName = requireNonNull(partitionName, "partitionName is null");
    this.inputFormat = requireNonNull(inputFormat, "inputFormat is null");
    this.schema = requireNonNull(schema, "schema is null");
    this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null");
    this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
    this.path = requireNonNull(path, "path is null");
    this.fileSystem = requireNonNull(fileSystem, "fileSystem is null");
    this.directoryLister = requireNonNull(directoryLister, "directoryLister is null");
    this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null");
}
Example 11: getSample
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
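Unlike the SplitSampler above, this IntervalSampler keeps a record whenever the running ratio kept/records would otherwise fall below freq, so it emits roughly freq of all records (freq = 0.01 keeps about 1 in 100 keys). Reusing the job setup from the SplitSampler sketch after Example 3, the only change is the sampler construction (an illustrative fragment, not standalone code):

// Hypothetical: keep roughly 1% of keys, sampling from at most 20 splits.
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.IntervalSampler<Text, Text>(0.01, 20);
InputSampler.writePartitionFile(job, sampler); // same job wiring as in the earlier sketch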
Example 12: testAddInputPathWithMapper
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
@Test
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Example 13: testParquet
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
@Test
public void testParquet()
        throws Exception
{
    List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
    HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
    InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
    @SuppressWarnings("deprecation")
    SerDe serde = new ParquetHiveSerDe();
    File file = File.createTempFile("presto_test", "parquet");
    file.delete();
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
        HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
        testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
Example 14: createJobConf
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
private static JobConf createJobConf(Configuration conf, boolean useFastCopy) {
  Class<? extends InputFormat> inputFormat =
      (useFastCopy) ? FastCopyInputFormat.class : CopyInputFormat.class;
  JobConf jobconf = new JobConf(conf, DistCp.class);
  jobconf.setJobName(NAME);
  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setReduceSpeculativeExecution(false);
  jobconf.setMapOutputKeyClass(FilePairComparable.class);
  jobconf.setMapOutputValueClass(Text.class);
  jobconf.setOutputKeyClass(FilePairComparable.class);
  jobconf.setOutputValueClass(Text.class);
  jobconf.setInputFormat(inputFormat);
  jobconf.setMapperClass(CopyFilesTask.class);
  jobconf.setReducerClass(CopyFilesTask.class);
  // Prevent the reducer from starting until all maps are done.
  jobconf.setInt("mapred.job.rushreduce.reduce.threshold", 0);
  jobconf.setFloat("mapred.reduce.slowstart.completed.maps", 1.0f);
  return jobconf;
}
Example 15: testRCBinary
import org.apache.hadoop.mapred.InputFormat; // import the required package/class
@Test
public void testRCBinary()
        throws Exception
{
    List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
        // RC file does not support complex type as key of a map
        return !testColumn.getName().equals("t_map_null_key_complex_key_value");
    }));
    HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
    InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
    @SuppressWarnings("deprecation")
    SerDe serde = new LazyBinaryColumnarSerDe();
    File file = File.createTempFile("presto_test", "rc-binary");
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
        testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
        testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}