

Java MapFileOutputFormat Class Code Examples

This article collects typical usage examples of the Java class org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat. If you are wondering what the MapFileOutputFormat class is for, how to use it, or what it looks like in practice, the selected class code examples below should help.


The MapFileOutputFormat class belongs to the org.apache.hadoop.mapreduce.lib.output package. Fifteen code examples of the class are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
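
Before the individual examples, here is a minimal, self-contained sketch of the two operations most of them combine: writing job output as MapFiles by setting MapFileOutputFormat as the output format, and looking a key back up with MapFileOutputFormat.getReaders and MapFileOutputFormat.getEntry. The class name MapFileRoundTrip, the identity Mapper/Reducer setup, and the IntWritable/Text key-value types are illustrative assumptions, not taken from the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MapFileRoundTrip {

  /** Writes the job output as one MapFile per reducer under outputDir. */
  static void writeMapFile(Configuration conf, Path input, Path outputDir) throws Exception {
    Job job = Job.getInstance(conf, "MapFileRoundTrip");
    job.setJarByClass(MapFileRoundTrip.class);
    // Identity mapper and reducer: the input SequenceFile is assumed to hold
    // IntWritable keys and Text values, and MapFiles need their keys sorted,
    // which the shuffle already guarantees per reducer.
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.waitForCompletion(true);
  }

  /** Looks up a single key across all MapFile partitions in outputDir. */
  static String lookup(Configuration conf, Path outputDir, int id) throws IOException {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(outputDir, conf);
    try {
      // The same partitioner the job used routes the key to the right reader.
      Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
      Text value = new Text();
      Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, new IntWritable(id), value);
      return entry == null ? null : value.toString();
    } finally {
      for (MapFile.Reader reader : readers) {
        reader.close();
      }
    }
  }
}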

Example 1: main

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: OutLinks <input path> <output path>");
    System.exit(-1);
  }
  
  Job job = new Job();
  job.setJarByClass(OutLinks.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  
  job.setOutputFormatClass(MapFileOutputFormat.class);
  job.setMapperClass(OutLinkNumMapper.class);
  job.setReducerClass(OutLinkNumReducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.waitForCompletion(true);
}
 
Developer: ifuding, Project: search-1047, Lines: 24, Source: OutLinkNum.java

Example 2: main

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
  if(args.length != 2) {
    System.err.println("Usage: UrlModulus <input path> <output path>");
    System.exit(-1);
  }

  Configuration conf = new Configuration();
  Job job = new Job(conf, "UrlModulus");
  job.setJarByClass(UrlModulus.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setOutputFormatClass(MapFileOutputFormat.class);

  job.setMapperClass(UrlModulusMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setReducerClass(UrlModulusReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setNumReduceTasks(1);
  job.waitForCompletion(true);
}
 
Developer: ifuding, Project: search-1047, Lines: 26, Source: UrlModulus.java

Example 3: main

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: OutLinks <input path> <output path>");
    System.exit(-1);
  }
  
  Job job = new Job();
  job.setJarByClass(PageRank.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  
  job.setOutputFormatClass(MapFileOutputFormat.class);
  job.setMapperClass(PageRankMapper.class);
  job.setReducerClass(PageRankReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  PageRankRead = new MapFileRead("/input/PageRankMap");
  //OutLinksRead = new MapFileRead("/input/OutLinksMap");
  
  job.waitForCompletion(true);
}
 
Developer: ifuding, Project: search-1047, Lines: 24, Source: PageRank.java

Example 4: getNumPages

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
private int getNumPages(Configuration conf, Path titlesDir)
		throws Exception {

	int numPages = 0;

	IntWritable pageNumber = new IntWritable();
	MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
	for (int i = 0; i < readers.length; i++) {
		readers[i].finalKey(pageNumber);
		if (pageNumber.get() > numPages) {
			numPages = pageNumber.get();
		}
	}
	for (MapFile.Reader reader : readers) {
		reader.close();
	}

	return numPages;
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 20, Source: PageRank.java

Example 5: run

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
	if (args.length != 2) {
		JobBuilder.printUsage(this, "<path> <key>");
		return -1;
	}
	Path path = new Path(args[0]);
	IntWritable key = new IntWritable(Integer.parseInt(args[1]));

	Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
	Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
	Text val = new Text();
	Writable entry = MapFileOutputFormat.getEntry(readers, partitioner,
			key, val);
	if (entry == null) {
		System.err.println("Key not found: " + key);
		return -1;
	}
	NcdcRecordParser parser = new NcdcRecordParser();
	parser.parse(val.toString());
	System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
	return 0;
}
 
Developer: lhfei, Project: hadoop-in-action, Lines: 24, Source: LookupRecordByTemperature.java

Example 6: run

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
	Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
	if (job == null) {
		return -1;
	}

	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputFormatClass(MapFileOutputFormat.class);
	SequenceFileOutputFormat.setCompressOutput(job, true);
	SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
	SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

	return job.waitForCompletion(true) ? 0 : 1;
}
 
Developer: lhfei, Project: hadoop-in-action, Lines: 17, Source: SortByTemperatureToMapFile.java

Example 7: createSubmittableJob

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);

  Job job = Job.getInstance(getConf(),
        getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  job.setJarByClass(HashTable.class);

  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class);  // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));

  return job;
}
 
Developer: apache, Project: hbase, Lines: 26, Source: HashTable.java

Example 8: createSubmittableJob

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);
  
  Job job = Job.getInstance(getConf(),
        getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  job.setJarByClass(HashTable.class);

  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  
  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class);  // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
  
  return job;
}
 
Developer: fengchen8086, Project: ditb, Lines: 26, Source: HashTable.java

Example 9: run

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
public int run(String[] args) throws Exception {
	if (args.length != 2) {
		System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>");
		ToolRunner.printGenericCommandUsage(System.out);
		return 2;
	}

	Path titlesFile = new Path(args[0]);
	Path outputDir = new Path(args[1]);

	Configuration conf = getConf();

	// Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls
	// try to read the _SUCCESS as another MapFile dir.
	conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");

	// This job creates a MapFile of the titles indexed by the page id.
	// UnsplittableTextInputFormat is used to ensure that the same map task
	// gets all the lines in the titlesFile and it can count the line
	// numbers. The number of reduce tasks is set to 0.

	Job job = Job.getInstance(conf, "TitleIndex");

	job.setJarByClass(InLinks.class);
	job.setInputFormatClass(UnsplittableTextInputFormat.class);
	job.setMapperClass(TitleIndexMapper.class);
	job.setOutputFormatClass(MapFileOutputFormat.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(Text.class);
	FileInputFormat.addInputPath(job, titlesFile);
	FileOutputFormat.setOutputPath(job, outputDir);

	job.setNumReduceTasks(0);
	job.waitForCompletion(true);

	return 0;
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 38, Source: TitleIndex.java

Example 10: cleanup

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
protected void cleanup(Context context) throws IOException,
		InterruptedException {

	Configuration conf = context.getConfiguration();
	Path titlesDir = new Path(conf.get("pagerank.titles_dir"));

	MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
	Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
	IntWritable page = new IntWritable();
	Text title = new Text();

	float[] pageRanks = new float[topN.size()];
	String[] titles = new String[topN.size()];

	// The order of the entries is reversed. The priority queue is in
	// non-decreasing order and we want the highest PageRank first.
	for (int i = pageRanks.length - 1; i >= 0; i--) {
		Map.Entry<Float, Integer> entry = topN.poll();
		// Get the title of the page from the title index.
		page.set(entry.getValue());
		MapFileOutputFormat.getEntry(readers, partitioner, page, title);
		pageRanks[i] = entry.getKey();
		titles[i] = title.toString();
	}

	for (MapFile.Reader reader : readers) {
		reader.close();
	}

	for (int i = 0; i < pageRanks.length; i++) {
		context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
	}
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 35, Source: PageRankTopNReducer.java

Example 11: pageRankIteration

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
private void pageRankIteration(int iter, Configuration conf, Path outputDir)
		throws Exception {

	// This job performs an iteration of the power iteration method to
	// compute PageRank. The map task processes each block M_{i,j}, loads
	// the corresponding stripe j of the vector v_{k-1} and produces the
	// partial result of the stripe i of the vector v_k. The reduce task
	// sums all the partial results of v_k and adds the teleportation factor
	// (the combiner only sums all the partial results). See Section 5.2
	// (and 5.2.3 in particular) of Mining of Massive Datasets
	// (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
	// output is written in a "vk" subdir of the output dir, where k is the
	// iteration number. MapFileOutputFormat is used to keep an array of the
	// stripes of v.

	Job job = Job.getInstance(conf, "PageRank:Iteration");

	job.setJarByClass(PageRank.class);
	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setMapperClass(PageRankIterationMapper.class);
	job.setMapOutputKeyClass(ShortWritable.class);
	job.setMapOutputValueClass(FloatArrayWritable.class);
	job.setCombinerClass(PageRankIterationCombiner.class);
	job.setReducerClass(PageRankIterationReducer.class);
	job.setOutputFormatClass(MapFileOutputFormat.class);
	job.setOutputKeyClass(ShortWritable.class);
	job.setOutputValueClass(FloatArrayWritable.class);
	FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
	FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

	job.waitForCompletion(true);
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 33, Source: PageRank.java

Example 12: cleanup

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
protected void cleanup(Context context) throws IOException,
		InterruptedException {

	Configuration conf = context.getConfiguration();
	Path titlesDir = new Path(conf.get("inlinks.titles_dir"));

	MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
	Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
	IntWritable page = new IntWritable();
	Text title = new Text();

	int[] inLinks = new int[topN.size()];
	String[] titles = new String[topN.size()];

	for (int i = inLinks.length - 1; i >= 0; i--) {
		Map.Entry<Integer, Integer> entry = topN.poll();
		page.set(entry.getValue());
		MapFileOutputFormat.getEntry(readers, partitioner, page, title);
		inLinks[i] = entry.getKey();
		titles[i] = title.toString();
	}

	for (MapFile.Reader reader : readers) {
		reader.close();
	}

	for (int i = 0; i < inLinks.length; i++) {
		context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
	}
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 32, Source: InLinksTopNReducer.java

Example 13: run

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public int run(String[] args) throws Exception {
	if (args.length != 2) {
		JobBuilder.printUsage(this, "<path> <key>");
		return -1;
	}
	Path path = new Path(args[0]);
	IntWritable key = new IntWritable(Integer.parseInt(args[1]));

	Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
	Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
	Text val = new Text();

	Reader reader = readers[partitioner.getPartition(key, val,
			readers.length)];
	Writable entry = reader.get(key, val);
	if (entry == null) {
		System.err.println("Key not found: " + key);
		return -1;
	}
	NcdcRecordParser parser = new NcdcRecordParser();
	IntWritable nextKey = new IntWritable();
	do {
		parser.parse(val.toString());
		System.out.printf("%s\t%s\n", parser.getStationId(),
				parser.getYear());
	} while (reader.next(nextKey, val) && key.equals(nextKey));
	return 0;
}
 
Developer: lhfei, Project: hadoop-in-action, Lines: 30, Source: LookupRecordsByTemperature.java

Example 14: map

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue,
		Context context) throws IOException, InterruptedException {

	// This task gets each block M_{i,j}, loads the corresponding stripe j
	// of the vector v_{k-1} and produces the partial result of the stripe i
	// of the vector v_k.

	Configuration conf = context.getConfiguration();
	int iter = Integer.parseInt(conf.get("pagerank.iteration"));
	int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
	short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

	Writable[] blockIndexes = inKey.get();
	short i = ((ShortWritable) blockIndexes[0]).get();
	short j = ((ShortWritable) blockIndexes[1]).get();

	int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
	FloatWritable[] vj = new FloatWritable[vjSize];

	if (iter == 1) {
		// Initial PageRank vector with 1/n for all pages.
		for (int k = 0; k < vj.length; k++) {
			vj[k] = new FloatWritable(1.0f / numPages);
		}
	} else {
		// Load the stripe j of the vector v_{k-1} from the MapFiles.
		Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
		Path vjDir = new Path(outputDir, "v" + (iter - 1));
		MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
		Partitioner<ShortWritable, FloatArrayWritable> partitioner =
				new HashPartitioner<ShortWritable, FloatArrayWritable>();
		ShortWritable key = new ShortWritable(j);
		FloatArrayWritable value = new FloatArrayWritable();
		MapFileOutputFormat.getEntry(readers, partitioner, key, value);
		Writable[] writables = value.get();
		for (int k = 0; k < vj.length; k++) {
			vj[k] = (FloatWritable) writables[k];
		}
		for (MapFile.Reader reader : readers) {
			reader.close();
		}
	}

	// Initialize the partial result i of the vector v_k.
	int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
	FloatWritable[] vi = new FloatWritable[viSize];
	for (int k = 0; k < vi.length; k++) {
		vi[k] = new FloatWritable(0);
	}

	// Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
	// partial result i of the vector v_k.
	Writable[][] blockColumns = inValue.get();
	for (int k = 0; k < blockColumns.length; k++) {
		Writable[] blockColumn = blockColumns[k];
		if (blockColumn.length > 0) {
			int vDegree = ((ShortWritable) blockColumn[0]).get();
			for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
				int l = ((ShortWritable) blockColumn[columnIndex]).get();
				vi[l].set(vi[l].get() +  (1.0f / vDegree) * vj[k].get());
			}
		}
	}

	context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
 
Developer: yasserglez, Project: pagerank-hadoop, Lines: 68, Source: PageRankIterationMapper.java

Example 15: saveMapFile

import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; // import the required package/class
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class,
                    c);
}
 
Developer: deeplearning4j, Project: DataVec, Lines: 32, Source: SparkStorageUtils.java
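
A short usage sketch for the method above. Only the method signature comes from the snippet; the enclosing class package (org.datavec.spark.storage.SparkStorageUtils), the HDFS path, and the maxOutputFiles value of 4 are assumptions for illustration.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaRDD;
import org.datavec.api.writable.Writable;
import org.datavec.spark.storage.SparkStorageUtils; // assumed package path

public class SaveMapFileExample {
    // records: an RDD of DataVec records; how it is built is
    // application-specific and omitted here.
    public static void save(JavaRDD<List<Writable>> records) {
        Configuration hadoopConf = new Configuration(); // extra MapFile options could be set here
        // Coalesce to at most 4 partitions so at most 4 MapFiles are written;
        // pass null to keep the RDD's existing partitioning.
        SparkStorageUtils.saveMapFile("hdfs:///data/records-mapfile", records, hadoopConf, 4);
    }
}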


Note: The org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms; the snippets were selected from open-source projects contributed by various developers. The source code is copyrighted by its original authors; consult the corresponding project's license before redistributing or reusing it. Do not repost without permission.