本文整理匯總了Java中org.apache.hadoop.mapred.FileInputFormat.addInputPath方法的典型用法代碼示例。如果您正苦於以下問題:Java FileInputFormat.addInputPath方法的具體用法?Java FileInputFormat.addInputPath怎麼用?Java FileInputFormat.addInputPath使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.hadoop.mapred.FileInputFormat的用法示例。
在下文中一共展示了FileInputFormat.addInputPath方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: merge
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Merges several link databases into one output location.
 *
 * <p>The job is built by {@code createMergeJob}; the {@code LinkDb.CURRENT_NAME}
 * child of every entry in {@code dbs} is added as an input path. After the job
 * has run, {@code output} is created and the job's output path is renamed to
 * {@code output/LinkDb.CURRENT_NAME}. Start and finish times are logged.
 *
 * @param output destination directory
 * @param dbs link database directories to read
 * @param normalize forwarded to {@code createMergeJob}
 * @param filter forwarded to {@code createMergeJob}
 * @throws Exception if job creation, job execution or a file system call fails
 */
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
    throws Exception {
  final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  final long startMillis = System.currentTimeMillis();
  LOG.info("LinkDb merge: starting at " + timestampFormat.format(startMillis));

  final JobConf mergeJob = createMergeJob(getConf(), output, normalize, filter);
  for (Path db : dbs) {
    FileInputFormat.addInputPath(mergeJob, new Path(db, LinkDb.CURRENT_NAME));
  }
  JobClient.runJob(mergeJob);

  // Move the job result into its final place below the requested output directory.
  final FileSystem fileSystem = FileSystem.get(getConf());
  fileSystem.mkdirs(output);
  final Path jobOutput = FileOutputFormat.getOutputPath(mergeJob);
  final Path finalLocation = new Path(output, LinkDb.CURRENT_NAME);
  fileSystem.rename(jobOutput, finalLocation);

  final long endMillis = System.currentTimeMillis();
  LOG.info("LinkDb merge: finished at " + timestampFormat.format(endMillis) + ", elapsed: "
      + TimingUtil.elapsedTime(startMillis, endMillis));
}
示例2: splitInput
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Computes the input splits for one storage descriptor and records them.
 *
 * <p>A fresh {@code JobConf} is populated from {@code properties} and then from
 * {@code hiveReadEntry.hiveConfigOverride} (so overrides win on equal keys).
 * The input format class named by the descriptor is instantiated reflectively.
 * If the descriptor's location exists, its splits are appended to
 * {@code inputSplits} and mapped to {@code partition} in {@code partitionMap}.
 * Finally a positive {@code numRows} property is added to {@code rowCount}.
 *
 * @param properties configuration entries copied into the job; keys and values are cast to {@code String}
 * @param sd storage descriptor supplying the input format class name and the location
 * @param partition value stored in {@code partitionMap} for every split found
 * @throws ReflectiveOperationException if the input format class cannot be loaded or instantiated
 * @throws IOException if a file system or split computation call fails
 */
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
    throws ReflectiveOperationException, IOException {
  final JobConf job = new JobConf();
  for (final Map.Entry<Object, Object> property : properties.entrySet()) {
    job.set((String) property.getKey(), (String) property.getValue());
  }
  for (final Map.Entry<String, String> override : hiveReadEntry.hiveConfigOverride.entrySet()) {
    job.set(override.getKey(), override.getValue());
  }

  final Class<?> formatClass = Class.forName(sd.getInputFormat());
  InputFormat<?, ?> format = (InputFormat<?, ?>) formatClass.getConstructor().newInstance();
  job.setInputFormat(format.getClass());

  final Path location = new Path(sd.getLocation());
  final FileSystem fileSystem = location.getFileSystem(job);
  if (fileSystem.exists(location)) {
    FileInputFormat.addInputPath(job, location);
    // Re-read the format from the job so the instance used for splitting is the job's own.
    format = job.getInputFormat();
    for (final InputSplit split : format.getSplits(job, 1)) {
      inputSplits.add(split);
      partitionMap.put(split, partition);
    }
  }

  final String numRowsProp = properties.getProperty("numRows");
  logger.trace("HiveScan num rows property = {}", numRowsProp);
  if (numRowsProp == null) {
    return;
  }
  // Starting from hive-0.13 this property is set to -1 when no statistics are
  // available, and the value reported by hive may not be up to date; only
  // positive counts are accumulated.
  final long numRows = Long.parseLong(numRowsProp);
  if (numRows > 0) {
    rowCount += numRows;
  }
}
示例3: addInputPath
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Adds the descriptor's location to the job's input paths when it exists.
 *
 * @param sd storage descriptor whose location is checked
 * @param job job configuration that receives the input path
 * @return {@code true} if the location exists and was added, {@code false} otherwise
 * @throws IOException if obtaining the file system or the existence check fails
 */
private static boolean addInputPath(StorageDescriptor sd, JobConf job) throws IOException {
  final Path location = new Path(sd.getLocation());
  final boolean present = FileSystemWrapper.get(location, job).exists(location);
  if (present) {
    FileInputFormat.addInputPath(job, location);
  }
  return present;
}
示例4: createSegments
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Configures and runs the job that turns arc files into a segment.
 *
 * <p>
 * A segment name is obtained from {@code generateSegmentName()}, stored in the
 * job under {@code Nutch.SEGMENT_NAME_KEY} and used as the child name of the
 * output path below {@code segmentsOutDir}.
 * </p>
 *
 * @param arcFiles
 *          The path to the directory holding the arc files; added as the job
 *          input path
 * @param segmentsOutDir
 *          The parent directory of the segment output path
 *
 * @throws IOException
 *           If an IO error occurs while running the job.
 */
public void createSegments(Path arcFiles, Path segmentsOutDir)
    throws IOException {
  SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long startedAt = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("ArcSegmentCreator: starting at " + timestampFormat.format(startedAt));
    LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
  }

  JobConf segmentJob = new NutchJob(getConf());
  segmentJob.setJobName("ArcSegmentCreator " + arcFiles);
  String segmentName = generateSegmentName();
  segmentJob.set(Nutch.SEGMENT_NAME_KEY, segmentName);

  // Input side: arc files read through ArcInputFormat and mapped by this class.
  segmentJob.setInputFormat(ArcInputFormat.class);
  FileInputFormat.addInputPath(segmentJob, arcFiles);
  segmentJob.setMapperClass(ArcSegmentCreator.class);

  // Output side: one directory per generated segment name.
  segmentJob.setOutputFormat(FetcherOutputFormat.class);
  FileOutputFormat.setOutputPath(segmentJob, new Path(segmentsOutDir, segmentName));
  segmentJob.setOutputKeyClass(Text.class);
  segmentJob.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(segmentJob);

  long finishedAt = System.currentTimeMillis();
  LOG.info("ArcSegmentCreator: finished at " + timestampFormat.format(finishedAt)
      + ", elapsed: " + TimingUtil.elapsedTime(startedAt, finishedAt));
}
示例5: task3
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Extracts CF for each found anchor.
 *
 * <p>Runs a single-reducer job that reads sequence files from {@code inputPath}
 * and writes a map file of {@code Text} keys and {@code IntWritable} values to
 * {@code outputPath}. The file {@code part-00000/data} below {@code mapPath} is
 * registered in the distributed cache under the symlink name {@code map.dat}.
 *
 * @param inputPath path added as the job input
 * @param mapPath directory whose {@code part-00000/data} file is added to the distributed cache
 * @param outputPath path set as the job output
 * @throws IOException if the job fails, or if {@code mapPath} does not yield a
 *           valid cache file URI (the {@link URISyntaxException} is attached as the cause)
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
  LOG.info("Extracting anchor text (phase 3)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);
  LOG.info(" - mapping: " + mapPath);

  JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  conf.setJobName(String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));
  conf.setNumReduceTasks(1);

  String location = "map.dat";
  try {
    DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
    DistributedCache.createSymlink(conf);
  } catch (URISyntaxException e) {
    // Do not launch the job without its cache file: fail here with the cause
    // attached instead of printing a stack trace and carrying on.
    throw new IOException("Invalid distributed cache URI built from mapping path: " + mapPath, e);
  }

  FileInputFormat.addInputPath(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(MapFileOutputFormat.class);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  // The reducer class is also used as the combiner.
  conf.setMapperClass(MyMapper3.class);
  conf.setCombinerClass(MyReducer3.class);
  conf.setReducerClass(MyReducer3.class);

  JobClient.runJob(conf);
}
示例6: runInitializer
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Configures and runs the link analysis initializer job.
 *
 * <p>
 * The job reads sequence files from {@code nodeDb}, maps them with
 * {@code Initializer} and writes a map file of {@code Text} keys and
 * {@code Node} values to {@code output}. Marking successful jobs is switched
 * off through {@code mapreduce.fileoutputcommitter.marksuccessfuljobs}.
 * </p>
 *
 * @param nodeDb
 *          The node database used as job input.
 * @param output
 *          The job output directory.
 *
 * @throws IOException
 *           If an error occurs while running the initializer job; the
 *           exception is logged before being rethrown.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  JobConf job = new NutchJob(getConf());
  job.setJobName("LinkAnalysis Initializer");

  // input and output locations
  FileInputFormat.addInputPath(job, nodeDb);
  FileOutputFormat.setOutputPath(job, output);

  // formats
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(MapFileOutputFormat.class);

  // map-only processing with identical intermediate and final types
  job.setMapperClass(Initializer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Node.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Node.class);

  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(job);
  } catch (IOException failure) {
    LOG.error(StringUtils.stringifyException(failure));
    throw failure;
  }
  LOG.info("Finished initialization job.");
}
示例7: runInverter
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Configures and runs the link analysis inverter job.
 *
 * <p>
 * Both {@code nodeDb} and {@code outlinkDb} are added as sequence file
 * inputs. {@code Inverter} serves as mapper and reducer; the result is
 * written as sequence files of {@code Text} keys and {@code LinkDatum}
 * values to {@code output}. Marking successful jobs is switched off through
 * {@code mapreduce.fileoutputcommitter.marksuccessfuljobs}.
 * </p>
 *
 * @param nodeDb
 *          The node database used as the first job input.
 * @param outlinkDb
 *          The outlink database used as the second job input.
 * @param output
 *          The output directory.
 *
 * @throws IOException
 *           If an error occurs while running the inverter job; the exception
 *           is logged before being rethrown.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path output)
    throws IOException {
  JobConf job = new NutchJob(getConf());
  job.setJobName("LinkAnalysis Inverter");

  // input and output locations
  FileInputFormat.addInputPath(job, nodeDb);
  FileInputFormat.addInputPath(job, outlinkDb);
  FileOutputFormat.setOutputPath(job, output);

  // formats
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  // Inverter acts as both the map and the reduce implementation
  job.setMapperClass(Inverter.class);
  job.setReducerClass(Inverter.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(ObjectWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LinkDatum.class);

  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(job);
  } catch (IOException failure) {
    LOG.error(StringUtils.stringifyException(failure));
    throw failure;
  }
  LOG.info("Finished inverter job.");
}
示例8: task4
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * <p>Runs a single-reducer job over the {@code data} file of every
 * {@code part-*} directory below {@code inputPath} and writes a map file of
 * {@code Text} keys and {@code HMapSIW} values to {@code outputPath}.
 *
 * @param inputPath directory containing the {@code part-*} directories to read
 * @param outputPath path set as the job output
 * @throws IOException if running the job fails
 */
private void task4(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting anchor text (phase 4)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  final JobConf job = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  final String jobName =
      String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath);
  job.setJobName(jobName);
  job.setNumReduceTasks(1);

  // Glob pattern: reads the data file of every part-* directory, not only part-00000.
  final Path inputGlob = new Path(inputPath + "/part-*/data");
  FileInputFormat.addInputPath(job, inputGlob);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(MapFileOutputFormat.class);

  job.setMapperClass(MyMapper4.class);
  job.setReducerClass(MyReducer4.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(HMapSIW.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(HMapSIW.class);

  JobClient.runJob(job);
}
示例9: delete
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Runs the cleaning job over the current crawl database.
 *
 * <p>The job reads sequence files from {@code crawldb/CrawlDb.CURRENT_NAME},
 * uses {@code DBFilter} as mapper and {@code DeleterReducer} as reducer, and
 * produces no file output ({@code NullOutputFormat}). Start and finish times
 * are logged.
 *
 * @param crawldb crawl database directory
 * @param noCommit stored in the job configuration under the key {@code noCommit}
 * @throws IOException if running the job fails
 */
public void delete(String crawldb, boolean noCommit) throws IOException {
  final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  final long startMillis = System.currentTimeMillis();
  LOG.info("CleaningJob: starting at " + timestampFormat.format(startMillis));

  final JobConf cleaningJob = new NutchJob(getConf());
  cleaningJob.setJobName("CleaningJob");

  final Path currentDb = new Path(crawldb, CrawlDb.CURRENT_NAME);
  FileInputFormat.addInputPath(cleaningJob, currentDb);
  cleaningJob.setInputFormat(SequenceFileInputFormat.class);
  cleaningJob.setOutputFormat(NullOutputFormat.class);

  cleaningJob.setMapperClass(DBFilter.class);
  cleaningJob.setReducerClass(DeleterReducer.class);
  cleaningJob.setMapOutputKeyClass(ByteWritable.class);
  cleaningJob.setMapOutputValueClass(Text.class);

  cleaningJob.setBoolean("noCommit", noCommit);
  // need to explicitly allow deletions
  cleaningJob.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

  JobClient.runJob(cleaningJob);

  final long endMillis = System.currentTimeMillis();
  LOG.info("CleaningJob: finished at " + timestampFormat.format(endMillis) + ", elapsed: "
      + TimingUtil.elapsedTime(startMillis, endMillis));
}
示例10: task0
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Extracts redirects and the target for each.
 *
 * <p>Runs a single-reducer job that reads sequence files from
 * {@code inputPath}, maps them with {@code MyMapper0}, passes them through
 * {@code IdentityReducer} and writes sequence files of {@code Text} keys and
 * values to {@code outputPath}.
 *
 * @param inputPath path added as the job input
 * @param outputPath path set as the job output
 * @throws IOException if running the job fails
 */
private void task0(String inputPath, String outputPath) throws IOException {
  LOG.info("Extracting redirects (phase 0)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  final JobConf job = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
  final String jobName =
      String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath);
  job.setJobName(jobName);
  job.setNumReduceTasks(1);

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  job.setMapperClass(MyMapper0.class);
  job.setReducerClass(IdentityReducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  JobClient.runJob(job);
}
示例11: runParseTest
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Runs an import and then a local, map-only job using {@code ReparseMapper}
 * over the imported files.
 *
 * <p>The jar reported by {@code CompilationManager} is pushed onto the class
 * loader stack for the duration of the job; the previous class loader is
 * restored in the {@code finally} block. Option and parse errors are logged
 * and turned into test failures.
 *
 * @param fieldTerminator forwarded to {@code getArgv}
 * @param lineTerminator forwarded to {@code getArgv}
 * @param encloser forwarded to {@code getArgv}
 * @param escape forwarded to {@code getArgv}
 * @param encloseRequired forwarded to {@code getArgv}
 * @throws IOException if the import or the job fails with an IO error
 */
public void runParseTest(String fieldTerminator, String lineTerminator,
    String encloser, String escape, boolean encloseRequired)
    throws IOException {
  ClassLoader savedClassLoader = null;

  final String[] importArgv = getArgv(true, fieldTerminator, lineTerminator,
      encloser, escape, encloseRequired);
  runImport(importArgv);

  try {
    final String tableClassName = getTableName();
    final String[] parseArgv = getArgv(false, fieldTerminator, lineTerminator,
        encloser, escape, encloseRequired);
    final SqoopOptions opts = new ImportTool().parseArguments(parseArgv, null,
        null, true);
    final String jarFileName = new CompilationManager(opts).getJarFilename();

    // Make sure the user's class is loaded into our address space.
    savedClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

    final JobConf job = new JobConf();
    job.setJar(jarFileName);
    // Tell the job what class we're testing.
    job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);
    // Use local mode in the same JVM, on the local file system.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    job.set("fs.default.name", "file:///");

    final Path warehousePath = new Path(getWarehouseDir());
    final Path inputPath = new Path(warehousePath, getTableName());
    final Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ReparseMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    LOG.error(StringUtils.stringifyException(ioe));
    fail(ioe.toString());
  } catch (ParseException pe) {
    LOG.error(StringUtils.stringifyException(pe));
    fail(pe.toString());
  } finally {
    if (savedClassLoader != null) {
      ClassLoaderStack.setCurrentClassLoader(savedClassLoader);
    }
  }
}
開發者ID:aliyun,項目名稱:aliyun-maxcompute-data-collectors,代碼行數:61,代碼來源:SQLServerParseMethodsManualTest.java
示例12: runParseTest
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Runs an import and then a map-only job using {@code ReparseMapper} over the
 * imported files.
 *
 * <p>The job tracker address is set to {@code local}; the default file system
 * is switched to {@code CommonArgs.LOCAL_FS} only when
 * {@code BaseSqoopTestCase.isOnPhysicalCluster()} is false. The jar reported
 * by {@code CompilationManager} is pushed onto the class loader stack and the
 * previous class loader is restored in the {@code finally} block. Option and
 * parse errors become test failures.
 *
 * @param fieldTerminator forwarded to {@code getArgv}
 * @param lineTerminator forwarded to {@code getArgv}
 * @param encloser forwarded to {@code getArgv}
 * @param escape forwarded to {@code getArgv}
 * @param encloseRequired forwarded to {@code getArgv}
 * @throws IOException if the import or the job fails with an IO error
 */
public void runParseTest(String fieldTerminator, String lineTerminator,
    String encloser, String escape, boolean encloseRequired)
    throws IOException {
  ClassLoader savedClassLoader = null;

  final String[] importArgv = getArgv(true, fieldTerminator, lineTerminator,
      encloser, escape, encloseRequired);
  runImport(importArgv);

  try {
    final String tableClassName = getTableName();
    final String[] parseArgv = getArgv(false, fieldTerminator, lineTerminator,
        encloser, escape, encloseRequired);
    final SqoopOptions opts = new ImportTool().parseArguments(parseArgv, null,
        null, true);
    final String jarFileName = new CompilationManager(opts).getJarFilename();

    // Make sure the user's class is loaded into our address space.
    savedClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

    final JobConf job = new JobConf();
    job.setJar(jarFileName);
    // Tell the job what class we're testing.
    job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);
    // Use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    final boolean onPhysicalCluster = BaseSqoopTestCase.isOnPhysicalCluster();
    if (!onPhysicalCluster) {
      job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }

    final Path warehousePath = new Path(getWarehouseDir());
    final Path inputPath = new Path(warehousePath, getTableName());
    final Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ReparseMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    fail(ioe.toString());
  } catch (ParseException pe) {
    fail(pe.toString());
  } finally {
    if (savedClassLoader != null) {
      ClassLoaderStack.setCurrentClassLoader(savedClassLoader);
    }
  }
}
示例13: testFieldSetter
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Imports a two-column VARCHAR table and runs a map-only job using
 * {@code ExplicitSetMapper} over the imported files.
 *
 * <p>The job is told to set column {@code BASE_COL_NAME + "0"} to the value
 * {@code this-is-a-test}. The job tracker address is set to {@code local};
 * the default file system is switched to {@code CommonArgs.LOCAL_FS} only
 * when {@code BaseSqoopTestCase.isOnPhysicalCluster()} is false. The previous
 * class loader is restored in the {@code finally} block, and option and parse
 * errors become test failures.
 *
 * @throws IOException if the import or the job fails with an IO error
 */
public void testFieldSetter() throws IOException {
  ClassLoader savedClassLoader = null;

  final String[] columnTypes = { "VARCHAR(32)", "VARCHAR(32)" };
  final String[] columnValues = { "'meep'", "'foo'" };
  createTableWithColTypes(columnTypes, columnValues);

  final String[] importArgv = getArgv(true, ",", "\\n", "\\\'", "\\", false);
  runImport(importArgv);

  try {
    final String tableClassName = getTableName();
    final String[] parseArgv = getArgv(false, ",", "\\n", "\\\'", "\\", false);
    final SqoopOptions opts = new ImportTool().parseArguments(parseArgv, null, null,
        true);
    final String jarFileName = new CompilationManager(opts).getJarFilename();

    // Make sure the user's class is loaded into our address space.
    savedClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

    final JobConf job = new JobConf();
    job.setJar(jarFileName);
    // Tell the job what class we're testing, and which column to set to what.
    job.set(ExplicitSetMapper.USER_TYPE_NAME_KEY, tableClassName);
    job.set(ExplicitSetMapper.SET_COL_KEY, BASE_COL_NAME + "0");
    job.set(ExplicitSetMapper.SET_VAL_KEY, "this-is-a-test");
    // Use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    final boolean onPhysicalCluster = BaseSqoopTestCase.isOnPhysicalCluster();
    if (!onPhysicalCluster) {
      job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }

    final Path warehousePath = new Path(getWarehouseDir());
    final Path inputPath = new Path(warehousePath, getTableName());
    final Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ExplicitSetMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    fail(ioe.toString());
  } catch (ParseException pe) {
    fail(pe.toString());
  } finally {
    if (savedClassLoader != null) {
      ClassLoaderStack.setCurrentClassLoader(savedClassLoader);
    }
  }
}
示例14: testInputPath
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Checks that input paths set or added on a {@code JobConf} are returned
 * unchanged by {@code FileInputFormat.getInputPaths}.
 *
 * <p>Covers path names containing commas, escape characters and curly-brace
 * groups, using the {@code Path}, varargs, array and comma-separated string
 * variants of {@code setInputPaths}, as well as {@code addInputPath} and
 * {@code addInputPaths}.
 *
 * @throws Exception if any of the exercised calls fails
 */
public void testInputPath() throws Exception {
  final JobConf jobConf = new JobConf();
  final Path workingDir = jobConf.getWorkingDirectory();

  // A single path whose name contains a comma inside a brace group.
  Path path = new Path(workingDir,
      "xx{y"+StringUtils.COMMA_STR+"z}");
  FileInputFormat.setInputPaths(jobConf, path);
  Path[] paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(1, paths.length);
  assertEquals(path.toString(), paths[0].toString());

  // A single path made of two escape characters, two commas and 'a'.
  StringBuilder pathStr = new StringBuilder()
      .append(StringUtils.ESCAPE_CHAR)
      .append(StringUtils.ESCAPE_CHAR)
      .append(StringUtils.COMMA)
      .append(StringUtils.COMMA)
      .append('a');
  path = new Path(workingDir, pathStr.toString());
  FileInputFormat.setInputPaths(jobConf, path);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(1, paths.length);
  assertEquals(path.toString(), paths[0].toString());

  // setInputPaths followed by addInputPath: two paths, in insertion order.
  pathStr = new StringBuilder()
      .append(StringUtils.ESCAPE_CHAR)
      .append("xx")
      .append(StringUtils.ESCAPE_CHAR);
  path = new Path(workingDir, pathStr.toString());
  final Path path1 = new Path(workingDir,
      "yy"+StringUtils.COMMA_STR+"zz");
  FileInputFormat.setInputPaths(jobConf, path);
  FileInputFormat.addInputPath(jobConf, path1);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());

  // The same two paths through the varargs variant.
  FileInputFormat.setInputPaths(jobConf, path, path1);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());

  // The same two paths through an explicit array.
  final Path[] input = new Path[] {path, path1};
  FileInputFormat.setInputPaths(jobConf, input);
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(2, paths.length);
  assertEquals(path.toString(), paths[0].toString());
  assertEquals(path1.toString(), paths[1].toString());

  // Comma-separated string: commas inside brace groups must not split a path.
  final String str1 = "{a{b,c},de}";
  final String str2 = "xyz";
  final String str3 = "x{y,z}";
  pathStr = new StringBuilder()
      .append(str1)
      .append(StringUtils.COMMA)
      .append(str2)
      .append(StringUtils.COMMA)
      .append(str3);
  FileInputFormat.setInputPaths(jobConf, pathStr.toString());
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(3, paths.length);
  assertEquals(new Path(workingDir, str1).toString(), paths[0].toString());
  assertEquals(new Path(workingDir, str2).toString(), paths[1].toString());
  assertEquals(new Path(workingDir, str3).toString(), paths[2].toString());

  // addInputPaths appends two more paths to the three already set.
  final String str4 = "abc";
  final String str5 = "pq{r,s}";
  pathStr = new StringBuilder()
      .append(str4)
      .append(StringUtils.COMMA)
      .append(str5);
  FileInputFormat.addInputPaths(jobConf, pathStr.toString());
  paths = FileInputFormat.getInputPaths(jobConf);
  assertEquals(5, paths.length);
  assertEquals(new Path(workingDir, str1).toString(), paths[0].toString());
  assertEquals(new Path(workingDir, str2).toString(), paths[1].toString());
  assertEquals(new Path(workingDir, str3).toString(), paths[2].toString());
  assertEquals(new Path(workingDir, str4).toString(), paths[3].toString());
  assertEquals(new Path(workingDir, str5).toString(), paths[4].toString());
}
示例15: dump
import org.apache.hadoop.mapred.FileInputFormat; //導入方法依賴的package包/類
/**
 * Dumps the selected parts of a segment into a single text file.
 *
 * <p>The flags {@code ge}, {@code fe}, {@code pa}, {@code co}, {@code pd} and
 * {@code pt} (defined outside this method) choose which segment
 * subdirectories become job inputs. The job writes text output to a
 * temporary directory named {@code segread-<random int>} under
 * {@code hadoop.tmp.dir} (default {@code /tmp}). Every file listed in that
 * directory is then passed to {@code append}, which writes into the dump
 * file {@code output/<segment.dump.dir>} (default name {@code dump}). An
 * existing dump file is deleted first; the temporary directory is deleted
 * before the job and again at the end.
 *
 * @param segment segment directory to read
 * @param output directory in which the dump file is created
 * @throws IOException if the job or a file system call fails; an
 *           {@code IOException} from {@code append} for a single part file is
 *           only logged as a warning
 */
public void dump(Path segment, Path output) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: dump segment: " + segment);
  }

  JobConf job = createJobConf();
  job.setJobName("read " + segment);

  // Add only the segment parts that were requested.
  if (ge) {
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  }
  if (fe) {
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
  }
  if (pa) {
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
  }
  if (co) {
    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
  }
  if (pd) {
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
  }
  if (pt) {
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
  }

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(InputCompatMapper.class);
  job.setReducerClass(SegmentReader.class);

  Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-"
      + new java.util.Random().nextInt());
  fs.delete(tempDir, true);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  // Concatenate the job output into one dump file, replacing any old one.
  Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));
  fs.delete(dumpFile, true);
  FileStatus[] partStatuses = fs.listStatus(tempDir,
      HadoopFSUtil.getPassAllFilter());
  Path[] partFiles = HadoopFSUtil.getPaths(partStatuses);

  if (partFiles.length > 0) {
    PrintWriter dumpWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
        fs.create(dumpFile))));
    try {
      int recordCount = 0;
      for (Path partFile : partFiles) {
        try {
          recordCount = append(fs, job, partFile, dumpWriter, recordCount);
        } catch (IOException exception) {
          // A failing part file is reported and skipped; the others are still copied.
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't copy the content of " + partFile.toString()
                + " into " + dumpFile.toString());
            LOG.warn(exception.getMessage());
          }
        }
      }
    } finally {
      dumpWriter.close();
    }
  }

  fs.delete(tempDir, true);
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: done");
  }
}