This article collects typical usage examples of the Java class org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat. If you are wondering exactly what MapFileOutputFormat does, how to use it, or what it looks like in real code, the curated class examples below should help.
MapFileOutputFormat belongs to the org.apache.hadoop.mapreduce.lib.output package. Fifteen code examples of the class are shown below, sorted by popularity by default.
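Before the individual examples, here is a minimal reader-side sketch (not taken from any of the projects below) showing the two helpers most of the examples rely on: MapFileOutputFormat.getReaders and MapFileOutputFormat.getEntry. The output path and the Text key/value types are assumptions; match them to the actual job that produced the MapFiles.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MapFileLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path outputDir = new Path(args[0]);   // job output dir containing the part-r-NNNNN MapFiles
        Text key = new Text(args[1]);         // key type must match the job's output key class
        Text value = new Text();
        // One MapFile.Reader per part-r-NNNNN directory, in partition order.
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(outputDir, conf);
        // The same partitioner the job used routes the lookup to the right reader.
        Partitioner<Text, Text> partitioner = new HashPartitioner<Text, Text>();
        Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        System.out.println(entry == null ? "Key not found: " + key : key + "\t" + value);
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }
}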
Example 1: main
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: OutLinks <input path> <output path>");
        System.exit(-1);
    }
    Job job = new Job();
    job.setJarByClass(OutLinks.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setMapperClass(OutLinkNumMapper.class);
    job.setReducerClass(OutLinkNumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.waitForCompletion(true);
}
Example 2: main
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: UrlModulus <input path> <output path>");
        System.exit(-1);
    }
    Configuration conf = new Configuration();
    Job job = new Job(conf, "UrlModulus");
    job.setJarByClass(UrlModulus.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setMapperClass(UrlModulusMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setReducerClass(UrlModulusReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setNumReduceTasks(1);
    job.waitForCompletion(true);
}
Example 3: main
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: PageRank <input path> <output path>");
        System.exit(-1);
    }
    Job job = new Job();
    job.setJarByClass(PageRank.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setMapperClass(PageRankMapper.class);
    job.setReducerClass(PageRankReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    PageRankRead = new MapFileRead("/input/PageRankMap");
    //OutLinksRead = new MapFileRead("/input/OutLinksMap");
    job.waitForCompletion(true);
}
Example 4: getNumPages
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
private int getNumPages(Configuration conf, Path titlesDir)
        throws Exception {
    int numPages = 0;
    IntWritable pageNumber = new IntWritable();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    for (int i = 0; i < readers.length; i++) {
        readers[i].finalKey(pageNumber);
        if (pageNumber.get() > numPages) {
            numPages = pageNumber.get();
        }
    }
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
    return numPages;
}
Example 5: run
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));
    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();
    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    return 0;
}
Example 6: run
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    return job.waitForCompletion(true) ? 0 : 1;
}
Example 7: createSubmittableJob
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public Job createSubmittableJob(String[] args) throws IOException {
    Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
    generatePartitions(partitionsPath);
    Job job = Job.getInstance(getConf(),
            getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
    Configuration jobConf = job.getConfiguration();
    jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
    job.setJarByClass(HashTable.class);
    TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
            HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
    // use a TotalOrderPartitioner and reducers to group region output into hash files
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
    job.setReducerClass(Reducer.class); // identity reducer
    job.setNumReduceTasks(tableHash.numHashFiles);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
    return job;
}
Example 8: createSubmittableJob
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public Job createSubmittableJob(String[] args) throws IOException {
    Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
    generatePartitions(partitionsPath);
    Job job = Job.getInstance(getConf(),
            getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
    Configuration jobConf = job.getConfiguration();
    jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
    job.setJarByClass(HashTable.class);
    TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
            HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
    // use a TotalOrderPartitioner and reducers to group region output into hash files
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
    job.setReducerClass(Reducer.class); // identity reducer
    job.setNumReduceTasks(tableHash.numHashFiles);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
    return job;
}
Example 9: run
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }
    Path titlesFile = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    Configuration conf = getConf();
    // Do not create _SUCCESS files: MapFileOutputFormat.getReaders would
    // otherwise try to read _SUCCESS as another MapFile directory.
    conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");
    // This job creates a MapFile of the titles indexed by the page id.
    // UnsplittableTextInputFormat is used to ensure that the same map task
    // gets all the lines in the titlesFile and it can count the line
    // numbers. The number of reduce tasks is set to 0.
    Job job = Job.getInstance(conf, "TitleIndex");
    job.setJarByClass(InLinks.class);
    job.setInputFormatClass(UnsplittableTextInputFormat.class);
    job.setMapperClass(TitleIndexMapper.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, titlesFile);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    return 0;
}
Example 10: cleanup
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
protected void cleanup(Context context) throws IOException,
        InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("pagerank.titles_dir"));
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();
    float[] pageRanks = new float[topN.size()];
    String[] titles = new String[topN.size()];
    // The order of the entries is reversed. The priority queue is in
    // non-decreasing order and we want the highest PageRank first.
    for (int i = pageRanks.length - 1; i >= 0; i--) {
        Map.Entry<Float, Integer> entry = topN.poll();
        // Get the title of the page from the title index.
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        pageRanks[i] = entry.getKey();
        titles[i] = title.toString();
    }
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
    for (int i = 0; i < pageRanks.length; i++) {
        context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
    }
}
Example 11: pageRankIteration
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
private void pageRankIteration(int iter, Configuration conf, Path outputDir)
        throws Exception {
    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.
    Job job = Job.getInstance(conf, "PageRank:Iteration");
    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));
    job.waitForCompletion(true);
}
Example 12: cleanup
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
protected void cleanup(Context context) throws IOException,
        InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("inlinks.titles_dir"));
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();
    int[] inLinks = new int[topN.size()];
    String[] titles = new String[topN.size()];
    for (int i = inLinks.length - 1; i >= 0; i--) {
        Map.Entry<Integer, Integer> entry = topN.poll();
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        inLinks[i] = entry.getKey();
        titles[i] = title.toString();
    }
    for (MapFile.Reader reader : readers) {
        reader.close();
    }
    for (int i = 0; i < inLinks.length; i++) {
        context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
    }
}
Example 13: run
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));
    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();
    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
Example 14: map
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue,
        Context context) throws IOException, InterruptedException {
    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.
    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));
    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();
    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];
    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner =
                new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }
    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }
    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }
    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
Example 15: saveMapFile
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
        @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); // Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class,
            c);
}
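A hypothetical caller for this method is sketched below. It assumes the method is the static SparkStorageUtils.saveMapFile(...) from DataVec's Spark module, and that Writable, Text and IntWritable here are DataVec's org.datavec.api.writable classes rather than Hadoop's; adjust the class and package names if the method lives elsewhere.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;
import org.datavec.spark.storage.SparkStorageUtils; // assumed location of saveMapFile

public class SaveMapFileExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("saveMapFile"));
        // Two toy records; each record is a List<Writable>.
        List<List<Writable>> data = Arrays.asList(
                Arrays.<Writable>asList(new Text("alpha"), new IntWritable(1)),
                Arrays.<Writable>asList(new Text("beta"), new IntWritable(2)));
        JavaRDD<List<Writable>> records = sc.parallelize(data);
        // null maxOutputFiles: keep the RDD's current partitioning (no coalesce).
        SparkStorageUtils.saveMapFile("/tmp/datavec-mapfile", records,
                new org.apache.hadoop.conf.Configuration(), null);
        sc.stop();
    }
}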