本文整理汇总了Java中edu.cmu.lemurproject.WritableWarcRecord类的典型用法代码示例。如果您正苦于以下问题：Java WritableWarcRecord类的具体用法？Java WritableWarcRecord怎么用？Java WritableWarcRecord使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
WritableWarcRecord类属于edu.cmu.lemurproject包，在下文中一共展示了WritableWarcRecord类的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: toDocuments
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Turns a single WARC record into a one-element array of documents.
 *
 * <p>The document id is the record's {@code WARC-Record-ID} header, the
 * document content is the record's raw content, and every header metadata
 * entry is copied onto the document as a field whose name is the header key
 * prefixed with {@code WARC_FIELD}.
 *
 * @param longWritable input key; not used by this method
 * @param value        writable wrapper holding the WARC record to convert
 * @param reporter     Hadoop reporter; not used by this method
 * @param conf         job configuration; not used by this method
 * @return an array containing exactly the one document that was built
 * @throws IOException declared by the signature
 */
@Override
public LWDocument[] toDocuments(
    LongWritable longWritable,
    WritableWarcRecord value,
    Reporter reporter,
    Configuration conf) throws IOException {
  final String recordId = value.getRecord().getHeaderMetadataItem("WARC-Record-ID");
  final LWDocument document = createDocument(recordId, null);

  final WarcRecord warcRecord = value.getRecord();
  document.setContent(warcRecord.getContent());
  // The content type is intentionally left unset so that Tika can detect it.

  for (Map.Entry<String, String> header : warcRecord.getHeaderMetadata()) {
    final String fieldName = WARC_FIELD + header.getKey();
    document.addField(fieldName, header.getValue());
  }

  final LWDocument[] documents = new LWDocument[1];
  documents[0] = document;
  return documents;
}
示例2: parseWikipediaPage
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Maps a record whose target URI is {@code TestHelper.WIKI_URL_1} and checks
 * that nothing is collected and that the {@code SKIP_WIKIPEDIA_PAGE} counter
 * is incremented exactly once by one.
 */
@Test
public void parseWikipediaPage() throws IOException, InterruptedException {
  // Inputs handed to the mapper.
  final LongWritable recordKey = new LongWritable(12345);
  final WritableWarcRecord record =
      TestHelper.getWritableWarcRecord(TestHelper.WIKI_URL_1, TestHelper.NO_WIKI_ARTICLES);

  // Hadoop collaborators are mocked so their interactions can be inspected.
  final OutputCollector<Text, LinkArrayWritable> collector = mock(OutputCollector.class);
  final Reporter counters = mock(Reporter.class);

  // Exercise the mapper.
  new WikiReverseMapper().map(recordKey, record, collector, counters, new WikiMetadata());

  // The collector must be untouched and the skip counter bumped.
  verifyZeroInteractions(collector);
  verify(counters).incrCounter(COUNTER_GROUP, SKIP_WIKIPEDIA_PAGE, 1);
}
示例3: parse
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Runs {@code WikiReverseMapper.map} once against a mocked WARC record and
 * returns the mocked collector so the caller can verify what was emitted.
 *
 * @param testUrl      URL passed to {@code TestHelper.getWritableWarcRecord}
 * @param testFilename file name passed to {@code TestHelper.getWritableWarcRecord}
 * @return the mocked output collector that was given to the mapper
 */
private OutputCollector<Text, LinkArrayWritable> parse(String testUrl, String testFilename) throws IOException, InterruptedException {
  // Inputs handed to the mapper.
  final LongWritable recordKey = new LongWritable(12345);
  final WritableWarcRecord record = TestHelper.getWritableWarcRecord(testUrl, testFilename);

  // Hadoop collaborators are mocked.
  final OutputCollector<Text, LinkArrayWritable> collector = mock(OutputCollector.class);
  final Reporter counters = mock(Reporter.class);

  // Exercise the mapper with a fresh metadata instance.
  new WikiReverseMapper().map(recordKey, record, collector, counters, new WikiMetadata());

  return collector;
}
示例4: setUp
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Builds a fresh {@code MapDriver} around a new {@code WarcIngestMapper}
 * before each test: installs the configuration from {@code createConf()},
 * applies {@code setupCommonConf} to it, and initialises the mapper's fixture
 * with a {@code JobConf} built from the driver's configuration.
 */
@Before
public void setUp() throws IOException {
  final WarcIngestMapper ingestMapper = new WarcIngestMapper();

  // Driver wiring: configuration first, then the mapper under test.
  mapDriver = new MapDriver<LongWritable, WritableWarcRecord, Text, LWDocumentWritable>();
  mapDriver.setConfiguration(createConf());
  mapDriver.setMapper(ingestMapper);
  setupCommonConf(mapDriver.getConfiguration());

  // The fixture is initialised from a JobConf built after the common settings were applied.
  final JobConf jobConf = new JobConf(mapDriver.getConfiguration());
  ingestMapper.getFixture().init(jobConf);
}
示例5: testWarc
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Reads every record of the {@code warc/at.warc} test resource through a
 * {@code WarcFileRecordReader}, feeds them to the map driver, and checks that
 * the driver produces one output per input and that the first produced
 * document is non-null and has one of the expected ids.
 *
 * @throws Exception if copying the resource, reading the records, or running
 *                   the driver fails
 */
@Test
public void testWarc() throws Exception {
  String[] the_ids = new String[] { "<urn:uuid:00fee1bb-1abc-45a6-af31-a164c2fdad88>",
      "<urn:uuid:6ee9accb-a284-47ef-8785-ed28aee2f79e>",
      // NOTE(review): this entry lacks the closing '>' and otherwise duplicates
      // the first id; it looks like a typo, confirm against the fixture before removing.
      "<urn:uuid:00fee1bb-1abc-45a6-af31-a164c2fdad88",
      "<urn:uuid:00fee1bb-1abc-45a6-af31-a164c2fdad88>",
      "<urn:uuid:b328f1fe-b2ee-45c0-9139-908850810b52>",
      "<urn:uuid:ccea02fa-a954-4c19-ace2-72a73b04d95d>" };
  Set<String> ids = new HashSet<String>(Arrays.asList(the_ids));

  // Copy the classpath resource into a scratch directory on the driver's file system.
  Path path = new Path(
      WarcIngestMapperTest.class.getClassLoader().getResource("warc/at.warc").toURI());
  FileSystem fs = FileSystem.get(mapDriver.getConfiguration());
  Path dir = new Path(fs.getWorkingDirectory(), "build");
  Path sub = new Path(dir, "WIMT");
  Path tempDir = new Path(sub, "tmp-dir");
  fs.mkdirs(tempDir);
  Path dst = new Path(tempDir, "warc/at.warc");
  fs.copyFromLocalFile(path, dst);

  // One split covering the whole copied file.
  InputSplit split = new FileSplit(dst, 0, fs.getFileStatus(dst).getLen(), (String[]) null);
  WarcFileRecordReader wfrr = new WarcFileRecordReader(mapDriver.getConfiguration(), split);
  LongWritable key = new LongWritable(0);
  WritableWarcRecord value = new WritableWarcRecord();
  int i = 0;
  try {
    while (wfrr.next(key, value)) {
      mapDriver.withInput(key, value);
      i++;
    }
  } finally {
    // Release the underlying file even when reading fails part-way through.
    wfrr.close();
  }

  List<Pair<Text, LWDocumentWritable>> run = mapDriver.run();
  Assert.assertTrue(i > 1);
  assertEquals(i, run.size());

  // Spot-check one document only.
  LWDocument doc = run.get(0).getSecond().getLWDocument();
  Assert.assertNotNull("document is null", doc);
  Assert.assertTrue(ids.contains(doc.getId()));
}
示例6: run
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Drives the mapper over every record supplied by {@code input}.
 *
 * <p>Each record is passed to {@code map} together with one shared
 * {@code WikiMetadata} instance, and the {@code RECORDS_FETCHED} counter is
 * incremented after each call. Exceptions raised inside the loop are counted
 * and logged rather than rethrown. The reader is always closed on exit.
 *
 * <p>If the loop is ended by an {@code InterruptedException}, the thread's
 * interrupt status is restored after the reader has been closed so that code
 * further up the stack can still observe the interruption.
 *
 * @param input    reader supplying the key/value pairs to map
 * @param output   collector handed through to {@code map}
 * @param reporter reporter used for counters
 * @throws IOException if closing {@code input} fails
 */
public void run(RecordReader<LongWritable, WritableWarcRecord> input,
    OutputCollector<Text, LinkArrayWritable> output, Reporter reporter)
    throws IOException {
  boolean interrupted = false;
  try {
    WikiMetadata wikiMetadata = new WikiMetadata();
    LongWritable key = input.createKey();
    WritableWarcRecord value = input.createValue();
    while (input.next(key, value)) {
      map(key, value, output, reporter, wikiMetadata);
      reporter.incrCounter(COUNTER_GROUP, RECORDS_FETCHED, 1);
    }
  } catch (InterruptedException ie) {
    // Remember the interruption; the flag is restored once the reader is closed.
    interrupted = true;
    reporter.incrCounter(COUNTER_GROUP, MAPPER_INTERRUPTED, 1);
    LOG.error(StringUtils.stringifyException(ie));
  } catch (IOException io) {
    reporter.incrCounter(COUNTER_GROUP, RUN_IO_EXCEPTION, 1);
    LOG.error(StringUtils.stringifyException(io));
  } catch (Exception e) {
    reporter.incrCounter(COUNTER_GROUP, RUN_EXCEPTION, 1);
    LOG.error(StringUtils.stringifyException(e));
  } finally {
    try {
      input.close();
    } finally {
      // Restore the flag only after close() so the close itself is not
      // disturbed by a pending interrupt.
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
  }
}
示例7: getWritableWarcRecord
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Builds a mocked {@code WritableWarcRecord} whose wrapped record reports
 * {@code url} as its {@code WARC-Target-URI} header and returns the bytes of
 * the metadata loaded by {@code getMetadata(testFileName)} as its content.
 *
 * <p>The content is trimmed to {@code Text.getLength()} bytes because
 * {@code Text.getBytes()} exposes the backing array, which may be longer than
 * the valid data.
 *
 * @param url          value the mock returns for the {@code WARC-Target-URI} header
 * @param testFileName name passed to {@code getMetadata} to load the content
 * @return the mocked writable wrapper
 * @throws IOException declared by the signature
 */
public static WritableWarcRecord getWritableWarcRecord(String url, String testFileName) throws IOException {
  Text metadata = getMetadata(testFileName);
  // Copy only the valid prefix of the backing array.
  byte[] content = java.util.Arrays.copyOf(metadata.getBytes(), metadata.getLength());

  WritableWarcRecord warcWritable = mock(WritableWarcRecord.class);
  WarcRecord warcRecord = mock(WarcRecord.class);
  Mockito.when(warcWritable.getRecord()).thenReturn(warcRecord);
  Mockito.when(warcRecord.getHeaderMetadataItem("WARC-Target-URI")).thenReturn(url);
  Mockito.when(warcRecord.getContent()).thenReturn(content);
  return warcWritable;
}
示例8: map
import edu.cmu.lemurproject.WritableWarcRecord; //导入依赖的package包/类
/**
 * Maps one WARC record to zero or more (key, single-link array) pairs.
 *
 * <p>The record is skipped when {@code wikiMetadata.isWikiPage} returns true
 * for its target URI, when its content does not contain
 * {@code WIKIPEDIA_DOMAIN}, or when {@code wikiMetadata.createResults} yields
 * nothing. Otherwise each result entry is emitted through {@code output} and
 * the {@code URLS_PARSED} counter is increased by the number of entries.
 *
 * <p>Every exception raised while processing is caught here, counted under a
 * dedicated counter, and logged; nothing is rethrown from the processing body.
 *
 * @param inputKey     input key; not used by this method
 * @param inputValue   writable wrapper holding the WARC record
 * @param output       collector receiving the emitted pairs
 * @param reporter     reporter used for counters
 * @param wikiMetadata helper used to classify the URL and build the results
 */
public void map(LongWritable inputKey, WritableWarcRecord inputValue, OutputCollector<Text, LinkArrayWritable> output,
    Reporter reporter, WikiMetadata wikiMetadata)
    throws IOException, InterruptedException {
  try {
    // Unwrap the WARC record from its writable wrapper.
    final WarcRecord warcRecord = inputValue.getRecord();
    final String targetUri = warcRecord.getHeaderMetadataItem(WARC_TARGET_URI);
    if (wikiMetadata.isWikiPage(targetUri, reporter)) {
      return;
    }

    final Text content = new Text(warcRecord.getContent());
    if (content.find(WIKIPEDIA_DOMAIN) < 0) {
      return;
    }

    Page parsedPage = new Page(targetUri);
    parsedPage = MetadataParser.parse(parsedPage, content, LINK_TYPE, WIKIPEDIA_DOMAIN);
    final Hashtable<String, LinkWritable> links = wikiMetadata.createResults(parsedPage, reporter);
    if (links == null || links.isEmpty()) {
      return;
    }

    // The key, value and one-slot array are reused for every emitted pair.
    final Text emittedKey = new Text();
    final LinkArrayWritable emittedValue = new LinkArrayWritable();
    final LinkWritable[] singleLink = new LinkWritable[1];
    for (String linkKey : links.keySet()) {
      singleLink[0] = links.get(linkKey);
      emittedKey.set(linkKey);
      emittedValue.set(singleLink);
      output.collect(emittedKey, emittedValue);
    }
    reporter.incrCounter(COUNTER_GROUP, URLS_PARSED, links.size());
  } catch (URISyntaxException us) {
    reporter.incrCounter(COUNTER_GROUP, URI_SYNTAX_EXCEPTION, 1);
    LOG.error(StringUtils.stringifyException(us));
  } catch (JsonParseException jp) {
    reporter.incrCounter(COUNTER_GROUP, JSON_PARSE_EXCEPTION, 1);
    LOG.error(StringUtils.stringifyException(jp));
  } catch (IOException io) {
    reporter.incrCounter(COUNTER_GROUP, MAP_IO_EXCEPTION, 1);
    LOG.error(StringUtils.stringifyException(io));
  } catch (Exception e) {
    try {
      reporter.incrCounter(COUNTER_GROUP, MAP_EXCEPTION, 1);
      LOG.error(StringUtils.stringifyException(e));
    } catch (Exception ie) {
      // A failure while counting or logging is itself logged and then consumed.
      LOG.error(ie.toString());
    }
  }
}