

Java SequenceFileOutputFormat.setOutputPath Method Code Examples

This article collects typical usage examples of the Java method org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath. If you are unsure what SequenceFileOutputFormat.setOutputPath does or how to use it, the curated examples below should help. You can also explore other usage of the enclosing class, org.apache.hadoop.mapred.SequenceFileOutputFormat.


Eleven code examples of SequenceFileOutputFormat.setOutputPath are shown below, ordered by popularity.
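Before the collected examples, here is a minimal self-contained sketch showing the method in context. It is an illustration only, not taken from any project below: the class name SeqOutputExample, the identity map/reduce wiring, and the command-line arguments used as input/output paths are all assumptions. The job copies text records into a SequenceFile, with setOutputPath naming the job's output directory (which must not already exist):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SeqOutputExample {
	public static void main(String[] args) throws Exception {
		JobConf job = new JobConf(SeqOutputExample.class);
		job.setJobName("text-to-sequencefile");

		// Identity map/reduce: pass (offset, line) records straight through.
		job.setMapperClass(IdentityMapper.class);
		job.setReducerClass(IdentityReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(Text.class);

		// Read plain text from the first argument.
		job.setInputFormat(TextInputFormat.class);
		TextInputFormat.addInputPath(job, new Path(args[0]));

		// Write a SequenceFile to the second argument; setOutputPath is the
		// method this page documents (inherited from FileOutputFormat).
		job.setOutputFormat(SequenceFileOutputFormat.class);
		SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

		JobClient.runJob(job);
	}
}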

Example 1: run

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
@Override
public void run(String[] args) throws Exception {
  Flags flags = new Flags();
  flags.addWithDefaultValue(
      "tag_subject_data", "/media/work/datasets(secret)/douban/raw/tag_subject.dat", "");
  flags.addWithDefaultValue(
      "subject_data", "/media/work/datasets(secret)/douban/raw/subject.dat", "");
  flags.add("output");
  flags.parseAndCheck(args);
  
  JobConf job = new JobConf(this.getClass());
  job.setJobName("convert-douban-raw-to-posts");
  MapReduceHelper.setAllOutputTypes(job, Text.class);
  MapReduceHelper.setMR(
      job, DoubanRawMapper.class, DoubanToPostReducer.class);
  job.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(
      job, new Path(flags.getString("tag_subject_data")));
  TextInputFormat.addInputPath(
      job, new Path(flags.getString("subject_data")));
  job.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(
      job, new Path(flags.getString("output")));
  JobClient.runJob(job);
}
 
Developer ID: thunlp, Project: THUTag, Lines: 26, Source file: ImportDouban.java

Example 2: IDMappingJob

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public static void IDMappingJob(String[] args) throws IOException {
	JobConf job = new JobConf();
	new GenericOptionsParser(job, args);
	job.setJarByClass(HybridDriver.class);
	job.setJobName("Converting binary similarity scores to text");
	job.setMapperClass(IDMapper.class);
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(Text.class);
	job.setNumReduceTasks(0);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(Text.class);

	Path inputPath = new Path(OUTPUT_DIR);
	job.setInputFormat(SequenceFileInputFormat.class);
	SequenceFileInputFormat.setInputPaths(job, inputPath);
	Path outputPath = new Path("SimilarityScores");
	job.setOutputFormat(TextOutputFormat.class);
	// setOutputPath is inherited from FileOutputFormat, so it applies even
	// though the configured output format is TextOutputFormat.
	SequenceFileOutputFormat.setOutputPath(job, outputPath);
	FileSystem.get(job).delete(outputPath, true);
	HashPagesDriver.prepareDistribCache(job, HashPagesDriver.IDS_FILE2); // not sure if this is needed
	JobSubmitter.run(job, "BINARY TO TEXT", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}
 
Developer ID: mahaucsb, Project: pss, Lines: 24, Source file: HybridDriver.java

Example 3: writeSequence

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
/**
 * Runs a map-only MR job that converts an input directory of numeric-valued
 * records to Hadoop sequence-file format. The input is assumed to be text
 * records of the form [id feature weight ...].
 */
public static void writeSequence() throws IOException {

	JobConf job = new JobConf();
	job.setJobName("Convert text vectors to hadoop seqeunce ");
	job.setJarByClass(SeqWriter.class);

	job.setMapperClass(SeqMapper.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(FeatureWeightArrayWritable.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(FeatureWeightArrayWritable.class);

	job.setInputFormat(TextInputFormat.class);
	TextInputFormat.addInputPath(job, new Path(INPUT_DIR));
	FileSystem.get(job).delete(new Path(HashPagesDriver.IDS_FILE2), true);
	Path outputPath = new Path(OUTPUT_DIR);
	FileSystem.get(job).delete(outputPath, true);
	job.setOutputFormat(SequenceFileOutputFormat.class);
	SequenceFileOutputFormat.setOutputPath(job, outputPath);

	JobSubmitter.run(job,"PREPROCESS",-1);
}
 
Developer ID: mahaucsb, Project: pss, Lines: 29, Source file: SeqWriter.java

Example 4: SetSeqFileInputOutput

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public static void SetSeqFileInputOutput(JobConf job, String inputPaths, Path output) throws IOException {
	job.setInputFormat(SequenceFileInputFormat.class);
	job.setOutputFormat(SequenceFileOutputFormat.class);
	SequenceFileOutputFormat.setOutputPath(job, output);

	// Expand input pattern.
	FileSystem fs = FileSystem.get(job);
	String[] paths = inputPaths.split(",");
	for (String p : paths) {
		int lastslash = p.lastIndexOf("/");
		if (lastslash < 0) {
			p = "./" + p;
			lastslash = 1;
		}
		String parent = p.substring(0, lastslash);
		p = p.substring(lastslash + 1);
		// Each path is treated as a pattern.
		p = p.replace("\\", "\\\\");
		p = p.replace(".", "\\.");
		p = p.replace("*", ".*");
		p = p.replace("?", ".");
		LOG.info("Use pattern:" + p);
		Pattern re = Pattern.compile(p);
		// List all files.
		FileStatus[] files = fs.listStatus(new Path(parent));
		for (FileStatus f : files) {
			if (re.matcher(f.getPath().getName()).matches()) {
				SequenceFileInputFormat.addInputPath(job, f.getPath());
				LOG.info("Adding input:" + f.getPath());
			}
		}
	}
}
 
Developer ID: thunlp, Project: THUTag, Lines: 34, Source file: MapReduceHelper.java
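For context, a hypothetical call site (the paths are invented) shows how the comma-separated pattern argument is expanded:

// Hypothetical usage: each comma-separated entry is matched as a file-name
// pattern ("*" and "?" wildcards) against the files in its parent directory.
MapReduceHelper.SetSeqFileInputOutput(job, "data/part-*,archive/2014-??.seq", new Path("seq-out"));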

Example 5: main

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
@SuppressWarnings("unused")
public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(EdgelistPartitioner.class);

    if (conf == null) {
        return;
    }
    String dir1 = "/user/miyuru/merged";
    String dir2 = "/user/miyuru/merged-out";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());
    // only delete dir2 because dir1 is uploaded externally.
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    conf.setInputFormat(WholeFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    WholeFileInputFormat.setInputPaths(conf, new Path(dir1));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(dir2));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(SequenceFileMapper.class);
    conf.setReducerClass(MultipleOutputsInvertedReducer.class);
    // Note: this overrides the TextOutputFormat set earlier; the job writes
    // through NullOutputFormat and emits records via MultipleOutputs instead.
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setJobName("EdgelistPartitioner");

    MultipleOutputs.addMultiNamedOutput(conf, "partition",
            TextOutputFormat.class, NullWritable.class, Text.class);

    JobClient.runJob(conf);
}
 
Developer ID: miyurud, Project: Acacia, Lines: 38, Source file: EdgelistPartitioner.java

Example 6: setSeqFileInputOutput

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public static void setSeqFileInputOutput(JobConf job, Path input, Path output) {
	job.setInputFormat(SequenceFileInputFormat.class);
	job.setOutputFormat(SequenceFileOutputFormat.class);
	SequenceFileInputFormat.addInputPath(job, input);
	SequenceFileOutputFormat.setOutputPath(job, output);
}
 
Developer ID: thunlp, Project: THUTag, Lines: 7, Source file: MapReduceHelper.java
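A hypothetical call site (paths invented) wires both formats in one call:

// Hypothetical usage: read sequence-file vectors, write sequence-file scores.
MapReduceHelper.setSeqFileInputOutput(job, new Path("vectors"), new Path("scores"));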

Example 7: run

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public int run(String[] args) throws Exception {
	// Get current configuration.
	Configuration conf = getConf();
	
	// Parse command line arguments.
	String inputPath = args[0];
	String outputPath = args[1];
	
	String maxArcFiles = "";
	if (args.length == 3)
		maxArcFiles = args[2];
	
	// Set the maximum number of arc files to process.
	conf.set(MAX_FILES_KEY, maxArcFiles);
			
	JobConf job = new JobConf(conf);
	
	// Set input path.
	if (inputPath.length() > 0) {
		LOG.info("Setting input path to " + inputPath);
	    FileInputFormat.addInputPath(job, new Path(inputPath));
	    FileInputFormat.setInputPathFilter(job, FileCountFilter.class);
	} else {
		System.err.println("No input path found.");
		return 1;	
	}
	
	// Set output path.									
	if (outputPath.length() > 0) {		
		LOG.info("Setting output path to " + outputPath);
		SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
		// Compress output to boost performance.
		SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
		SequenceFileOutputFormat.setCompressOutput(job, true);
	} else {
		System.err.println("No output path found.");
		return 1;	
	}
	
	// Load other classes from same jar as this class.
	job.setJarByClass(WikiReverse.class);
	
    // Input is in WARC file format.
    job.setInputFormat(WarcFileInputFormat.class);

    // Output is Hadoop sequence file format.
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkArrayWritable.class);

    // Use custom mapper class.
    job.setMapRunnerClass(WikiReverseMapper.class);
    
    // Use custom reducer class.
    job.setReducerClass(LinkArrayReducer.class);
    
    // Allow 5 percent of map tasks to fail.
    job.setMaxMapTaskFailuresPercent(MAX_MAP_TASK_FAILURES_PERCENT);
    
	if (JobClient.runJob(job).isSuccessful())
		return 0;
	else
		return 1;
}
 
Developer ID: rossf7, Project: wikireverse, Lines: 67, Source file: WikiReverse.java

Example 8: run

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public int run(String[] args) throws Exception {
	// Get current configuration.
	Configuration conf = getConf();

	// Parse command line arguments.
	String inputPaths = args[0];
	String outputPath = args[1];

	JobConf job = new JobConf(conf);

	// Set input path.
	if (inputPaths.length() > 0) {
		List<String> segmentPaths = Lists.newArrayList(Splitter.on(",")
				.split(inputPaths));

		for (String segmentPath : segmentPaths) {
			LOG.info("Adding input path " + segmentPath);
			FileInputFormat.addInputPath(job, new Path(segmentPath));
		}
	} else {
		System.err.println("No input path found.");
		return 1;
	}

	// Set output path.
	if (outputPath.length() > 0) {
		LOG.info("Setting output path to " + outputPath);
		SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
		// Compress output to boost performance.
		SequenceFileOutputFormat.setCompressOutput(job, true);
		SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
	} else {
		System.err.println("No output path found.");
		return 1;
	}

	// Load other classes from same jar as this class.
	job.setJarByClass(SegmentCombiner.class);

	// Input is Hadoop sequence file format.
	job.setInputFormat(SequenceFileInputFormat.class);

	// Output is Hadoop sequence file format.
	job.setOutputFormat(SequenceFileOutputFormat.class);

	// Set the output data types.
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LinkArrayWritable.class);

	// Use custom mapper class.
	job.setMapperClass(SegmentCombinerMapper.class);

	// Use custom reducer class.
	job.setReducerClass(LinkArrayReducer.class);

	if (JobClient.runJob(job).isSuccessful())
		return 0;
	else
		return 1;
}
 
Developer ID: rossf7, Project: wikireverse, Lines: 61, Source file: SegmentCombiner.java

Example 9: main

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public static void main(String args[]) throws ParseException, IOException {

		// job.set("mapred.job.tracker", "local");
		// job.set("fs.default.name", "file:///");

		JobConf job = new JobConf();
		job.setJarByClass(HybridDriver.class);
		new GenericOptionsParser(job, args);
		setMapperAndRunner(job);
		job.setMapOutputKeyClass(DocDocWritable.class);
		job.setMapOutputValueClass(FloatWritable.class);
		job.setNumReduceTasks(0);
		job.setOutputKeyClass(DocDocWritable.class);
		job.setOutputValueClass(FloatWritable.class);

		Path inputPath = new Path(INPUT_DIR);
		CustomSequenceFileInputFormat.addInputPath(job, inputPath);
		Path outputPath = new Path(OUTPUT_DIR);
		job.setOutputFormat(SequenceFileOutputFormat.class);
		SequenceFileOutputFormat.setOutputPath(job, outputPath);
		FileSystem.get(job).delete(outputPath, true);

		job.setBoolean("fs.hdfs.impl.disable.cache", true); //xun not sure if needed

		if (job.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE)) {
			job.setInputFormat(CustomSequenceFileInputFormat.class);
			Long splitMB = job.getLong(Config.SPLIT_MB_PROPERTY, Config.SPLIT_MB_VALUE) * 1024 * 1024;
			job.setLong("mapred.min.split.size", splitMB);
			job.setLong("mapred.max.split.size", splitMB);
			job.setLong("dfs.block.size", splitMB);
		} else {
			// For the www experiments, comment out the splitter below: it assumes
			// partitions are not split for load balancing, which should be fixed.
			Splitter.configure(job, inputPath);
			job.setInputFormat(NonSplitableSequenceInputFormat.class);
		}
		// SIGIR'14 two-stage load balancing (not yet fully incorporated).
		if (job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE) != 0) {
			TwoStageLoadbalancing.main(job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE),
					new Path(PartDriver.OUTPUT_DIR), job);
		}
		JobSubmitter.run(job,"SIMILARITY",job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE)); 
		if(job.getBoolean(Config.CONVERT_TEXT_PROPERTY, Config.CONVERT_TEXT_VALUE))
			IDMappingJob(args);
	}
 
Developer ID: mahaucsb, Project: pss, Lines: 46, Source file: HybridDriver.java

Example 10: main

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	JobConf job = new JobConf();
	job.setJobName("InvertedIndexDriver-BuildII");
	job.setJarByClass(InvertedIndexDriver.class);
	GenericOptionsParser gop = new GenericOptionsParser(job, args);
	args = gop.getRemainingArgs();

	if (args.length != 2)
		printUsage();
	//
	// Job1
	//

	job.setMapperClass(InvertedMapper.class);
	job.setReducerClass(InvertedReducer.class);
	job.setNumReduceTasks(4);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(DocWeight.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(DocWeightArrayWritable.class);

	job.setInputFormat(SequenceFileInputFormat.class);
	SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
	job.setOutputFormat(SequenceFileOutputFormat.class);
	Path interPath = new Path("inverted");
	FileSystem.get(job).delete(interPath, true);
	SequenceFileOutputFormat.setOutputPath(job, interPath);

	HybridDriver.run(job);

	//
	// Collect statistics
	//

	//
	// Job2
	//
	job = new JobConf(new Configuration());
	job.setJarByClass(InvertedIndexDriver.class);
	job.setJobName("InvertedIndexDriver-Similarity (SII)");
	job.setMapperClass(InvertedSimMapper.class);
	job.setReducerClass(InvertedSimReducer.class);
	job.setNumReduceTasks(5);
	job.setInputFormat(SequenceFileInputFormat.class);
	SequenceFileInputFormat.addInputPath(job, new Path("inverted"));

	job.setOutputFormat(SequenceFileOutputFormat.class);
	Path outputPath = new Path(args[1]);
	FileSystem.get(job).delete(outputPath, true);
	SequenceFileOutputFormat.setOutputPath(job, outputPath);

	job.setOutputKeyClass(DocDocWritable.class);
	job.setOutputValueClass(FloatWritable.class);
	long t = System.currentTimeMillis();
	HybridDriver.run(job);
	System.out.println("Job took " + (System.currentTimeMillis() - t) + " millisec.");

}
 
Developer ID: mahaucsb, Project: pss, Lines: 59, Source file: InvertedIndexDriver.java

Example 11: run

import org.apache.hadoop.mapred.SequenceFileOutputFormat; // import the package/class this method depends on
public int run(String[] args) throws Exception {
	// Get current configuration.
	Configuration conf = getConf();

	// Parse command line arguments.
	String inputPath = args[0];
	String outputPath = args[1];

	// Set max files if specified; otherwise parse the entire segment.
	String maxFiles = "";
	if (args.length == 3)
		maxFiles = args[2];

	conf.set(MAX_FILES_KEY, maxFiles);
	
	JobConf job = new JobConf(conf);

	// Set input path.
	if (inputPath.length() > 0) {
		LOG.info("Setting input path to " + inputPath);
	    FileInputFormat.addInputPath(job, new Path(inputPath));
	    FileInputFormat.setInputPathFilter(job, FileCountFilter.class);
	} else {
		System.err.println("No input path found.");
		return 1;	
	}

	// Set output path.
	if (outputPath.length() > 0) {
		LOG.info("Setting output path to " + outputPath);
		SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
		// Compress output to boost performance.
		SequenceFileOutputFormat.setCompressOutput(job, true);
		SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
	} else {
		System.err.println("No output path found.");
		return 1;
	}

	// Load other classes from same jar as this class.
	job.setJarByClass(WordCount.class);

	// Input is in WARC file format.
	job.setInputFormat(WarcFileInputFormat.class);

	// Output to Hadoop sequence file format.
	job.setOutputFormat(SequenceFileOutputFormat.class);

	// Set the output data types.
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);

	// Use custom mapper class.
	job.setMapperClass(WordCountMapper.class);

	// Use standard reducer class.
	job.setReducerClass(LongSumReducer.class);
	
    // Allow 5 percent of map tasks to fail.
    job.setMaxMapTaskFailuresPercent(MAX_MAP_TASK_FAILURES_PERCENT);

	if (JobClient.runJob(job).isSuccessful())
		return 0;
	else
		return 1;
}
 
Developer ID: rossf7, Project: elasticrawl-examples, Lines: 67, Source file: WordCount.java


Note: The org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their authors; copyright remains with the original authors, and any use or distribution must follow the corresponding project's license. Please do not reproduce without permission.