当前位置: 首页>>代码示例>>Java>>正文


Java WritableWarcRecord类代码示例

本文整理汇总了 Java 中 edu.cmu.lemurproject.WritableWarcRecord 的典型用法代码示例。如果您正苦于以下问题:Java WritableWarcRecord 类的具体用法是什么?Java WritableWarcRecord 怎么用?有哪些 Java WritableWarcRecord 的使用例子?那么,这里精选的类代码示例或许可以为您提供帮助。


WritableWarcRecord类属于edu.cmu.lemurproject包,在下文中一共展示了WritableWarcRecord类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: toDocuments

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Converts one WARC record into a single-element array of documents.
 *
 * <p>The document id is the record's {@code WARC-Record-ID} header, the document content is the
 * record content, and every header metadata entry is added as a field whose name is
 * {@code WARC_FIELD} followed by the header key.
 *
 * @param longWritable input key (not used here)
 * @param value wrapper holding the WARC record to convert
 * @param reporter Hadoop reporter (not used here)
 * @param conf job configuration (not used here)
 * @return an array containing exactly one document
 * @throws IOException declared by the method signature
 */
@Override
public LWDocument[] toDocuments(
  LongWritable longWritable,
  WritableWarcRecord value,
  Reporter reporter,
  Configuration conf) throws IOException {
  final WarcRecord warcRecord = value.getRecord();
  final LWDocument document =
    createDocument(warcRecord.getHeaderMetadataItem("WARC-Record-ID"), null);
  // The content type is deliberately left unset so that Tika can detect it.
  document.setContent(warcRecord.getContent());
  for (Map.Entry<String, String> header : warcRecord.getHeaderMetadata()) {
    final String fieldName = WARC_FIELD + header.getKey();
    document.addField(fieldName, header.getValue());
  }
  return new LWDocument[]{document};
}
 
开发者ID:lucidworks,项目名称:hadoop-solr,代码行数:17,代码来源:WarcIngestMapper.java

示例2: parseWikipediaPage

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Maps the record built from {@code WIKI_URL_1} / {@code NO_WIKI_ARTICLES} and checks that
 * nothing is collected and that the {@code SKIP_WIKIPEDIA_PAGE} counter is incremented by one.
 */
@Test
public void parseWikipediaPage() throws IOException, InterruptedException {
	// Hadoop collaborators are mocked so their interactions can be verified.
	Reporter mockReporter = mock(Reporter.class);
	OutputCollector<Text, LinkArrayWritable> mockOutput = mock(OutputCollector.class);

	// Input key/value pair handed to the mapper.
	LongWritable inputKey = new LongWritable(12345);
	WritableWarcRecord inputValue =
			TestHelper.getWritableWarcRecord(TestHelper.WIKI_URL_1, TestHelper.NO_WIKI_ARTICLES);

	// Run the mapper once over the single record.
	WikiMetadata metadata = new WikiMetadata();
	new WikiReverseMapper().map(inputKey, inputValue, mockOutput, mockReporter, metadata);

	// The page must be skipped: no output, one skip counter increment.
	verifyZeroInteractions(mockOutput);
	verify(mockReporter).incrCounter(COUNTER_GROUP, SKIP_WIKIPEDIA_PAGE, 1);
}
 
开发者ID:rossf7,项目名称:wikireverse,代码行数:21,代码来源:WikiReverseMapperTest.java

示例3: parse

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Runs {@code WikiReverseMapper.map} once over a record built by
 * {@code TestHelper.getWritableWarcRecord} and returns the mocked collector for verification.
 *
 * @param testUrl URL passed to the test helper when building the record
 * @param testFilename test file name passed to the test helper when building the record
 * @return the mocked output collector that was handed to the mapper
 */
private OutputCollector<Text, LinkArrayWritable> parse(String testUrl, String testFilename) throws IOException, InterruptedException {
	// Hadoop collaborators are mocked; only the collector is returned to the caller.
	OutputCollector<Text, LinkArrayWritable> mockOutput = mock(OutputCollector.class);
	Reporter mockReporter = mock(Reporter.class);

	// Input key/value pair handed to the mapper.
	LongWritable inputKey = new LongWritable(12345);
	WritableWarcRecord inputValue = TestHelper.getWritableWarcRecord(testUrl, testFilename);

	// Run the mapper once over the single record.
	WikiMetadata metadata = new WikiMetadata();
	new WikiReverseMapper().map(inputKey, inputValue, mockOutput, mockReporter, metadata);

	return mockOutput;
}
 
开发者ID:rossf7,项目名称:wikireverse,代码行数:18,代码来源:WikiReverseMapperTest.java

示例4: setUp

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Creates the map driver, applies the test configuration, installs a fresh
 * {@code WarcIngestMapper}, and initialises the mapper's fixture from the driver configuration.
 */
@Before
public void setUp() throws IOException {
  // Driver first, with the base test configuration.
  mapDriver = new MapDriver<LongWritable, WritableWarcRecord, Text, LWDocumentWritable>();
  mapDriver.setConfiguration(createConf());

  // Install the mapper under test.
  final WarcIngestMapper ingestMapper = new WarcIngestMapper();
  mapDriver.setMapper(ingestMapper);

  // Shared settings must be applied before the fixture is initialised from the configuration.
  setupCommonConf(mapDriver.getConfiguration());
  final JobConf jobConf = new JobConf(mapDriver.getConfiguration());
  ingestMapper.getFixture().init(jobConf);
}
 
开发者ID:lucidworks,项目名称:hadoop-solr,代码行数:11,代码来源:WarcIngestMapperTest.java

示例5: testWarc

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Feeds every record of the bundled {@code warc/at.warc} resource through the map driver and
 * checks that one document is produced per record and that the first document's id is one of
 * the expected WARC record ids.
 */
@Test
public void testWarc() throws Exception {

  // Expected record ids. The previous list contained a duplicate and a malformed entry that
  // was missing its closing '>'; both are dropped so the final assertion only accepts
  // well-formed ids.
  Set<String> ids = new HashSet<String>(Arrays.asList(
      "<urn:uuid:00fee1bb-1abc-45a6-af31-a164c2fdad88>",
      "<urn:uuid:6ee9accb-a284-47ef-8785-ed28aee2f79e>",
      "<urn:uuid:b328f1fe-b2ee-45c0-9139-908850810b52>",
      "<urn:uuid:ccea02fa-a954-4c19-ace2-72a73b04d95d>"));

  // Copy the classpath resource into a scratch directory on the driver's file system.
  Path path = new Path(
      WarcIngestMapperTest.class.getClassLoader().getResource("warc/at.warc").toURI());
  FileSystem fs = FileSystem.get(mapDriver.getConfiguration());
  Path dir = new Path(fs.getWorkingDirectory(), "build");
  Path sub = new Path(dir, "WIMT");
  Path tempDir = new Path(sub, "tmp-dir");
  fs.mkdirs(tempDir);
  Path dst = new Path(tempDir, "warc/at.warc");
  fs.copyFromLocalFile(path, dst);

  // Read the whole file as one split and queue every record as mapper input.
  InputSplit split = new FileSplit(dst, 0, fs.getFileStatus(dst).getLen(), (String[]) null);
  WarcFileRecordReader wfrr = new WarcFileRecordReader(mapDriver.getConfiguration(), split);
  int recordCount = 0;
  try {
    LongWritable key = new LongWritable(0);
    WritableWarcRecord value = new WritableWarcRecord();
    while (wfrr.next(key, value)) {
      mapDriver.withInput(key, value);
      recordCount++;
    }
  } finally {
    // Release the underlying stream even if reading fails.
    wfrr.close();
  }

  List<Pair<Text, LWDocumentWritable>> run = mapDriver.run();
  Assert.assertTrue(recordCount > 1);
  assertEquals(recordCount, run.size());
  //get one doc just to confirm:
  LWDocument doc = run.get(0).getSecond().getLWDocument();
  Assert.assertNotNull("document is null", doc);
  Assert.assertTrue(ids.contains(doc.getId()));
}
 
开发者ID:lucidworks,项目名称:hadoop-solr,代码行数:37,代码来源:WarcIngestMapperTest.java

示例6: run

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Reads every key/value pair from {@code input}, passes each to {@code map}, and increments the
 * {@code RECORDS_FETCHED} counter per record.
 *
 * <p>Exceptions raised while reading or mapping are counted and logged rather than propagated.
 * The reader is always closed. If the loop was interrupted, the thread's interrupt status is
 * restored after the reader has been closed so callers can still observe it.
 *
 * @param input record reader supplying the WARC records
 * @param output collector passed through to {@code map}
 * @param reporter used for counter updates
 * @throws IOException if closing {@code input} fails
 */
public void run(RecordReader<LongWritable, WritableWarcRecord> input,
				OutputCollector<Text, LinkArrayWritable> output, Reporter reporter)
				throws IOException {
	boolean interrupted = false;
	try {
		WikiMetadata wikiMetadata = new WikiMetadata();
		
		LongWritable key = input.createKey();
		WritableWarcRecord value = input.createValue();
		
		while (input.next(key, value)) {
			map(key, value, output, reporter, wikiMetadata);
			reporter.incrCounter(COUNTER_GROUP, RECORDS_FETCHED, 1);
		}
		
	} catch(InterruptedException ie) {
		// Remember the interruption; the flag is restored once the reader is closed.
		interrupted = true;
		reporter.incrCounter(COUNTER_GROUP, MAPPER_INTERRUPTED, 1);
		LOG.error(StringUtils.stringifyException(ie));
	} catch(IOException io) {
		reporter.incrCounter(COUNTER_GROUP, RUN_IO_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(io));
	} catch(Exception e) {
		reporter.incrCounter(COUNTER_GROUP, RUN_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(e));
	} finally {
		try {
			input.close();
		} finally {
			if (interrupted) {
				// Do not swallow the interrupt: re-assert it for code further up the stack.
				Thread.currentThread().interrupt();
			}
		}
	}
}
 
开发者ID:rossf7,项目名称:wikireverse,代码行数:28,代码来源:WikiReverseMapper.java

示例7: getWritableWarcRecord

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Builds a mocked {@code WritableWarcRecord} whose wrapped record reports {@code url} as its
 * {@code WARC-Target-URI} header and the metadata loaded for {@code testFileName} as its content.
 *
 * @param url value returned for the {@code WARC-Target-URI} header
 * @param testFileName name passed to {@code getMetadata} to load the record content
 * @return the mocked writable wrapper
 * @throws IOException declared by the method signature
 */
public static WritableWarcRecord getWritableWarcRecord(String url, String testFileName) throws IOException {
	Text metadata = getMetadata(testFileName);

	// Text.getBytes() exposes the backing array, which is only valid up to getLength();
	// copy just the valid prefix so the mocked content carries no trailing bytes.
	byte[] content = java.util.Arrays.copyOf(metadata.getBytes(), metadata.getLength());

	WritableWarcRecord warcWritable = mock(WritableWarcRecord.class);
	WarcRecord warcRecord = mock(WarcRecord.class);
	
	Mockito.when(warcWritable.getRecord()).thenReturn(warcRecord);		
	Mockito.when(warcRecord.getHeaderMetadataItem("WARC-Target-URI")).thenReturn(url);
	Mockito.when(warcRecord.getContent()).thenReturn(content);
	
	return warcWritable;
}
 
开发者ID:rossf7,项目名称:wikireverse,代码行数:13,代码来源:TestHelper.java

示例8: map

import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Maps one WARC record to zero or more (key, link) pairs.
 *
 * <p>Records for which {@code wikiMetadata.isWikiPage} returns true, and records whose content
 * does not contain {@code WIKIPEDIA_DOMAIN}, produce no output. Otherwise the content is parsed
 * and every entry of the results table is collected as a single-element link array, after which
 * the {@code URLS_PARSED} counter is increased by the number of results.
 *
 * <p>Exceptions raised during processing are counted and logged instead of being propagated.
 *
 * @param inputKey input key (not used here)
 * @param inputValue wrapper holding the WARC record
 * @param output collector receiving the results
 * @param reporter used for counter updates
 * @param wikiMetadata helper used to filter pages and build results
 */
public void map(LongWritable inputKey, WritableWarcRecord inputValue, OutputCollector<Text, LinkArrayWritable> output,
		Reporter reporter, WikiMetadata wikiMetadata)
		throws IOException, InterruptedException {

	try {
		// Unwrap the WARC record and read its target URL.
		WarcRecord warcRecord = inputValue.getRecord();
		String targetUrl = warcRecord.getHeaderMetadataItem(WARC_TARGET_URI);

		if (wikiMetadata.isWikiPage(targetUrl, reporter)) {
			return;
		}

		Text content = new Text(warcRecord.getContent());
		if (content.find(WIKIPEDIA_DOMAIN) < 0) {
			return;
		}

		Page parsedPage = MetadataParser.parse(new Page(targetUrl), content, LINK_TYPE, WIKIPEDIA_DOMAIN);
		Hashtable<String, LinkWritable> results = wikiMetadata.createResults(parsedPage, reporter);
		if (results == null || results.isEmpty()) {
			return;
		}

		// The key, value and one-slot array are reused for every emitted pair.
		Text outputKey = new Text();
		LinkArrayWritable outputValue = new LinkArrayWritable();
		LinkWritable[] singleLink = new LinkWritable[1];

		for (java.util.Map.Entry<String, LinkWritable> result : results.entrySet()) {
			singleLink[0] = result.getValue();

			outputKey.set(result.getKey());
			outputValue.set(singleLink);

			output.collect(outputKey, outputValue);
		}

		reporter.incrCounter(COUNTER_GROUP, URLS_PARSED, results.size());

	} catch (URISyntaxException us) {
		reporter.incrCounter(COUNTER_GROUP, URI_SYNTAX_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(us));
	} catch (JsonParseException jp) {
		reporter.incrCounter(COUNTER_GROUP, JSON_PARSE_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(jp));
	} catch (IOException io) {
		reporter.incrCounter(COUNTER_GROUP, MAP_IO_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(io));
	} catch (Exception e) {
		try {
			reporter.incrCounter(COUNTER_GROUP, MAP_EXCEPTION, 1);
			LOG.error(StringUtils.stringifyException(e));
		} catch (Exception ie) {
			// A failure while reporting is itself only logged, never rethrown.
			LOG.error(ie.toString());
		}
	}
}
 
开发者ID:rossf7,项目名称:wikireverse,代码行数:56,代码来源:WikiReverseMapper.java


注:本文中的 edu.cmu.lemurproject.WritableWarcRecord 类示例由纯净天空整理自 GitHub/MSDocs 等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的 License;未经允许,请勿转载。