

Java FileInputFormat.addInputPath Method Code Examples

This article collects typical usage examples of the Java method org.apache.hadoop.mapred.FileInputFormat.addInputPath. If you are wondering how FileInputFormat.addInputPath is used in practice, or looking for concrete examples of calling it, the curated code samples here may help. You can also browse further usage examples of the enclosing class, org.apache.hadoop.mapred.FileInputFormat.


The following presents 15 code examples of the FileInputFormat.addInputPath method, sorted by popularity by default.
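Before the collected examples, here is a minimal, self-contained sketch of the typical call pattern with the old mapred API. The class name, job name, input/output paths, and the identity map-only configuration are placeholders for illustration and are not taken from any of the examples below; the point is that each call to FileInputFormat.addInputPath appends one more path to the job's input list.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class AddInputPathSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(AddInputPathSketch.class);
    job.setJobName("addInputPath sketch");

    // addInputPath can be called repeatedly; each call appends another
    // file or directory to the job's list of input paths.
    FileInputFormat.addInputPath(job, new Path("/data/input-a")); // placeholder path
    FileInputFormat.addInputPath(job, new Path("/data/input-b")); // placeholder path

    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(IdentityMapper.class); // identity map-only job, for illustration
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, new Path("/data/output")); // placeholder path
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    JobClient.runJob(job);
  }
}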

Example 1: merge

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
    throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("LinkDb merge: starting at " + sdf.format(start));

  JobConf job = createMergeJob(getConf(), output, normalize, filter);
  for (int i = 0; i < dbs.length; i++) {
    FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
  }
  JobClient.runJob(job);
  FileSystem fs = FileSystem.get(getConf());
  fs.mkdirs(output);
  fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
      LinkDb.CURRENT_NAME));

  long end = System.currentTimeMillis();
  LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
 
Developer: jorcox, Project: GeoCrawler, Lines: 21, Source: LinkDbMerger.java

Example 2: splitInput

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  for (final Object obj : properties.keySet()) {
    job.set((String) obj, (String) properties.get(obj));
  }
  for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }
  InputFormat<?, ?> format = (InputFormat<?, ?>)
      Class.forName(sd.getInputFormat()).getConstructor().newInstance();
  job.setInputFormat(format.getClass());
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = path.getFileSystem(job);

  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }
  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp != null) {
    final long numRows = Long.valueOf(numRowsProp);
    // starting from hive-0.13, when no statistics are available, this property is set to -1
    // it's important to note that the value returned by hive may not be up to date
    if (numRows > 0) {
      rowCount += numRows;
    }
  }
}
 
Developer: skhalifa, Project: QDrill, Lines: 35, Source: HiveScan.java

Example 3: addInputPath

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
private static boolean addInputPath(StorageDescriptor sd, JobConf job) throws IOException {
  final Path path = new Path(sd.getLocation());
  final FileSystem fs = FileSystemWrapper.get(path, job);

  if (fs.exists(path)) {
    FileInputFormat.addInputPath(job, path);
    return true;
  }

  return false;
}
 
Developer: dremio, Project: dremio-oss, Lines: 12, Source: DatasetBuilder.java

Example 4: createSegments

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * <p>
 * Creates the arc files to segments job.
 * </p>
 * 
 * @param arcFiles
 *          The path to the directory holding the arc files
 * @param segmentsOutDir
 *          The output directory for writing the segments
 * 
 * @throws IOException
 *           If an IO error occurs while running the job.
 */
public void createSegments(Path arcFiles, Path segmentsOutDir)
    throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
    LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
  }

  JobConf job = new NutchJob(getConf());
  job.setJobName("ArcSegmentCreator " + arcFiles);
  String segName = generateSegmentName();
  job.set(Nutch.SEGMENT_NAME_KEY, segName);
  FileInputFormat.addInputPath(job, arcFiles);
  job.setInputFormat(ArcInputFormat.class);
  job.setMapperClass(ArcSegmentCreator.class);
  FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
  job.setOutputFormat(FetcherOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("ArcSegmentCreator: finished at " + sdf.format(end)
      + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
 
Developer: jorcox, Project: GeoCrawler, Lines: 42, Source: ArcSegmentCreator.java

Example 5: task3

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
	LOG.info("Extracting anchor text (phase 3)...");
	LOG.info(" - input:   " + inputPath);
	LOG.info(" - output:  " + outputPath);
	LOG.info(" - mapping: " + mapPath);

	JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
	conf.setJobName(String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

	conf.setNumReduceTasks(1);
	String location = "map.dat";

	try {
		DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
		//DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
		DistributedCache.createSymlink(conf);
	} catch (URISyntaxException e) {
		e.printStackTrace();
	}

	FileInputFormat.addInputPath(conf, new Path(inputPath));
	FileOutputFormat.setOutputPath(conf, new Path(outputPath));

	conf.setInputFormat(SequenceFileInputFormat.class);
	conf.setOutputFormat(MapFileOutputFormat.class);
	// conf.setOutputFormat(TextOutputFormat.class);

	conf.setMapOutputKeyClass(Text.class);
	conf.setMapOutputValueClass(IntWritable.class);

	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(IntWritable.class);

	conf.setMapperClass(MyMapper3.class);
	conf.setCombinerClass(MyReducer3.class);
	conf.setReducerClass(MyReducer3.class);

	JobClient.runJob(conf);
}
 
Developer: yahoo, Project: FEL, Lines: 48, Source: ExtractWikipediaAnchorText.java

Example 6: runInitializer

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 * 
 * @param nodeDb
 *          The node database to use.
 * @param output
 *          The job output directory.
 * 
 * @throws IOException
 *           If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
      false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
 
Developer: jorcox, Project: GeoCrawler, Lines: 40, Source: LinkRank.java

Example 7: runInverter

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * @param nodeDb
 *          The node database to use.
 * @param outlinkDb
 *          The outlink database to use.
 * @param output
 *          The output directory.
 * 
 * @throws IOException
 *           If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path output)
    throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
      false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
 
Developer: jorcox, Project: GeoCrawler, Lines: 45, Source: LinkRank.java

Example 8: task4

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
	LOG.info("Extracting anchor text (phase 4)...");
	LOG.info(" - input:   " + inputPath);
	LOG.info(" - output:  " + outputPath);

	JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
	conf.setJobName(String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

	conf.setNumReduceTasks(1);

	//FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
	FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
	FileOutputFormat.setOutputPath(conf, new Path(outputPath));

	conf.setInputFormat(SequenceFileInputFormat.class);
	conf.setOutputFormat(MapFileOutputFormat.class);

	conf.setMapOutputKeyClass(Text.class);
	conf.setMapOutputValueClass(HMapSIW.class);

	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(HMapSIW.class);

	conf.setMapperClass(MyMapper4.class);
	conf.setReducerClass(MyReducer4.class);

	JobClient.runJob(conf);
}
 
Developer: yahoo, Project: FEL, Lines: 36, Source: ExtractWikipediaAnchorText.java

Example 9: delete

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void delete(String crawldb, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("CleaningJob: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(DeleterReducer.class);

  job.setJobName("CleaningJob");

  // need to explicitly allow deletions
  job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
 
Developer: jorcox, Project: GeoCrawler, Lines: 28, Source: CleaningJob.java

Example 10: task0

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
	LOG.info("Extracting redirects (phase 0)...");
	LOG.info(" - input: " + inputPath);
	LOG.info(" - output: " + outputPath);

	JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
	conf.setJobName(String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

	conf.setNumReduceTasks(1);

	FileInputFormat.addInputPath(conf, new Path(inputPath));
	FileOutputFormat.setOutputPath(conf, new Path(outputPath));

	conf.setInputFormat(SequenceFileInputFormat.class);
	conf.setOutputFormat(SequenceFileOutputFormat.class);

	conf.setMapOutputKeyClass(Text.class);
	conf.setMapOutputValueClass(Text.class);

	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(Text.class);

	conf.setMapperClass(MyMapper0.class);
	conf.setReducerClass(IdentityReducer.class);

	JobClient.runJob(conf);
}
 
Developer: yahoo, Project: FEL, Lines: 35, Source: ExtractWikipediaAnchorText.java

Example 11: runParseTest

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void runParseTest(String fieldTerminator, String lineTerminator,
    String encloser, String escape, boolean encloseRequired)
    throws IOException {

  ClassLoader prevClassLoader = null;

  String[] argv = getArgv(true, fieldTerminator, lineTerminator,
      encloser, escape, encloseRequired);
  runImport(argv);
  try {
    String tableClassName = getTableName();

    argv = getArgv(false, fieldTerminator, lineTerminator, encloser,
        escape, encloseRequired);
    SqoopOptions opts = new ImportTool().parseArguments(argv, null,
        null, true);

    CompilationManager compileMgr = new CompilationManager(opts);
    String jarFileName = compileMgr.getJarFilename();

    // Make sure the user's class is loaded into our address space.
    prevClassLoader = ClassLoaderStack.addJarFile(jarFileName,
        tableClassName);

    JobConf job = new JobConf();
    job.setJar(jarFileName);

    // Tell the job what class we're testing.
    job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

    // use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    job.set("fs.default.name", "file:///");

    String warehouseDir = getWarehouseDir();
    Path warehousePath = new Path(warehouseDir);
    Path inputPath = new Path(warehousePath, getTableName());
    Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ReparseMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    LOG.error(StringUtils.stringifyException(ioe));
    fail(ioe.toString());
  } catch (ParseException pe) {
    LOG.error(StringUtils.stringifyException(pe));
    fail(pe.toString());
  } finally {
    if (null != prevClassLoader) {
      ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
    }
  }
}
 
Developer: aliyun, Project: aliyun-maxcompute-data-collectors, Lines: 61, Source: SQLServerParseMethodsManualTest.java

Example 12: runParseTest

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void runParseTest(String fieldTerminator, String lineTerminator,
    String encloser, String escape, boolean encloseRequired)
    throws IOException {

  ClassLoader prevClassLoader = null;

  String [] argv = getArgv(true, fieldTerminator, lineTerminator,
      encloser, escape, encloseRequired);
  runImport(argv);
  try {
    String tableClassName = getTableName();

    argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape,
        encloseRequired);
    SqoopOptions opts = new ImportTool().parseArguments(argv, null, null,
        true);

    CompilationManager compileMgr = new CompilationManager(opts);
    String jarFileName = compileMgr.getJarFilename();

    // Make sure the user's class is loaded into our address space.
    prevClassLoader = ClassLoaderStack.addJarFile(jarFileName,
        tableClassName);

    JobConf job = new JobConf();
    job.setJar(jarFileName);

    // Tell the job what class we're testing.
    job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

    // use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
      job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    String warehouseDir = getWarehouseDir();
    Path warehousePath = new Path(warehouseDir);
    Path inputPath = new Path(warehousePath, getTableName());
    Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ReparseMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    fail(ioe.toString());
  } catch (ParseException pe) {
    fail(pe.toString());
  } finally {
    if (null != prevClassLoader) {
      ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
    }
  }
}
 
Developer: aliyun, Project: aliyun-maxcompute-data-collectors, Lines: 60, Source: TestParseMethods.java

Example 13: testFieldSetter

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void testFieldSetter() throws IOException {
  ClassLoader prevClassLoader = null;

  String [] types = { "VARCHAR(32)", "VARCHAR(32)" };
  String [] vals = { "'meep'", "'foo'" };
  createTableWithColTypes(types, vals);

  String [] argv = getArgv(true, ",", "\\n", "\\\'", "\\", false);
  runImport(argv);
  try {
    String tableClassName = getTableName();

    argv = getArgv(false, ",", "\\n", "\\\'", "\\", false);
    SqoopOptions opts = new ImportTool().parseArguments(argv, null, null,
        true);

    CompilationManager compileMgr = new CompilationManager(opts);
    String jarFileName = compileMgr.getJarFilename();

    // Make sure the user's class is loaded into our address space.
    prevClassLoader = ClassLoaderStack.addJarFile(jarFileName,
        tableClassName);

    JobConf job = new JobConf();
    job.setJar(jarFileName);

    // Tell the job what class we're testing.
    job.set(ExplicitSetMapper.USER_TYPE_NAME_KEY, tableClassName);
    job.set(ExplicitSetMapper.SET_COL_KEY, BASE_COL_NAME + "0");
    job.set(ExplicitSetMapper.SET_VAL_KEY, "this-is-a-test");

    // use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
      job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    String warehouseDir = getWarehouseDir();
    Path warehousePath = new Path(warehouseDir);
    Path inputPath = new Path(warehousePath, getTableName());
    Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ExplicitSetMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    fail(ioe.toString());
  } catch (ParseException pe) {
    fail(pe.toString());
  } finally {
    if (null != prevClassLoader) {
      ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
    }
  }
}
 
Developer: aliyun, Project: aliyun-maxcompute-data-collectors, Lines: 61, Source: TestParseMethods.java

Example 14: testInputPath

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void testInputPath() throws Exception {
  JobConf jobConf = new JobConf();
  Path workingDir = jobConf.getWorkingDirectory();
  
  Path path = new Path(workingDir, 
      "xx{y"+StringUtils.COMMA_STR+"z}");
  FileInputFormat.setInputPaths(jobConf, path);
  Path[] paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(1, paths.length);
  assertEquals(path.toString(), paths[0].toString());
   
  StringBuilder pathStr = new StringBuilder();
  pathStr.append(StringUtils.ESCAPE_CHAR);
  pathStr.append(StringUtils.ESCAPE_CHAR);
  pathStr.append(StringUtils.COMMA);
  pathStr.append(StringUtils.COMMA);
  pathStr.append('a');
  path = new Path(workingDir, pathStr.toString());
  FileInputFormat.setInputPaths(jobConf, path);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(1, paths.length);
  assertEquals(path.toString(), paths[0].toString());
    
  pathStr.setLength(0);
  pathStr.append(StringUtils.ESCAPE_CHAR);
  pathStr.append("xx");
  pathStr.append(StringUtils.ESCAPE_CHAR);
  path = new Path(workingDir, pathStr.toString());
  Path path1 = new Path(workingDir,
      "yy"+StringUtils.COMMA_STR+"zz");
  FileInputFormat.setInputPaths(jobConf, path);
  FileInputFormat.addInputPath(jobConf, path1);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());

  FileInputFormat.setInputPaths(jobConf, path, path1);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());

  Path[] input = new Path[] {path, path1};
  FileInputFormat.setInputPaths(jobConf, input);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());
  
  pathStr.setLength(0);
  String str1 = "{a{b,c},de}";
  String str2 = "xyz";
  String str3 = "x{y,z}";
  pathStr.append(str1);
  pathStr.append(StringUtils.COMMA);
  pathStr.append(str2);
  pathStr.append(StringUtils.COMMA);
  pathStr.append(str3);
  FileInputFormat.setInputPaths(jobConf, pathStr.toString());
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(3, paths.length);
  assertEquals(new Path(workingDir, str1).toString(), paths[0].toString());
  assertEquals(new Path(workingDir, str2).toString(), paths[1].toString());
  assertEquals(new Path(workingDir, str3).toString(), paths[2].toString());

  pathStr.setLength(0);
  String str4 = "abc";
  String str5 = "pq{r,s}";
  pathStr.append(str4);
  pathStr.append(StringUtils.COMMA);
  pathStr.append(str5);
  FileInputFormat.addInputPaths(jobConf, pathStr.toString());
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(5, paths.length);
  assertEquals(new Path(workingDir, str1).toString(), paths[0].toString());
  assertEquals(new Path(workingDir, str2).toString(), paths[1].toString());
  assertEquals(new Path(workingDir, str3).toString(), paths[2].toString());
  assertEquals(new Path(workingDir, str4).toString(), paths[3].toString());
  assertEquals(new Path(workingDir, str5).toString(), paths[4].toString());
}
 
Developer: aliyun-beta, Project: aliyun-oss-hadoop-fs, Lines: 82, Source: TestInputPath.java

Example 15: dump

import org.apache.hadoop.mapred.FileInputFormat; // import the package/class this method depends on
public void dump(Path segment, Path output) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: dump segment: " + segment);
    }

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    if (ge)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.GENERATE_DIR_NAME));
    if (fe)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.FETCH_DIR_NAME));
    if (pa)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.PARSE_DIR_NAME));
    if (co)
      FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    if (pd)
      FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    if (pt)
      FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);

    Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-"
        + new java.util.Random().nextInt());
    fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    // concatenate the output
    Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

    // remove the old file
    fs.delete(dumpFile, true);
    FileStatus[] fstats = fs.listStatus(tempDir,
        HadoopFSUtil.getPassAllFilter());
    Path[] files = HadoopFSUtil.getPaths(fstats);

    PrintWriter writer = null;
    int currentRecordNumber = 0;
    if (files.length > 0) {
      writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
          fs.create(dumpFile))));
      try {
        for (int i = 0; i < files.length; i++) {
          Path partFile = files[i];
          try {
            currentRecordNumber = append(fs, job, partFile, writer,
                currentRecordNumber);
          } catch (IOException exception) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't copy the content of " + partFile.toString()
                  + " into " + dumpFile.toString());
              LOG.warn(exception.getMessage());
            }
          }
        }
      } finally {
        writer.close();
      }
    }
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: done");
    }
  }
 
Developer: jorcox, Project: GeoCrawler, Lines: 78, Source: SegmentReader.java


Note: The org.apache.hadoop.mapred.FileInputFormat.addInputPath examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors. For distribution and use, please follow the license of the corresponding project. Do not reproduce without permission.