本文整理汇总了Java中org.apache.nutch.metadata.Metadata.add方法的典型用法代码示例。如果您正苦于以下问题:Java Metadata.add方法的具体用法?Java Metadata.add怎么用?Java Metadata.add使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.metadata.Metadata的用法示例。
在下文中一共展示了Metadata.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Looks for rel-tags in the HTML document and records every one found in the
 * parse metadata under {@code REL_TAG}.
 *
 * @param content     fetched content; its URL selects which parse to annotate
 * @param parseResult parse results the parse is looked up in
 * @param metaTags    HTML meta tags of the page (not read by this method)
 * @param doc         DOM fragment handed to the rel-tag parser
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
  // Parse entry that belongs to this document.
  Parse documentParse = parseResult.get(content.getUrl());

  // Let the rel-tag parser walk the DOM and collect the tags.
  Set<?> relTags = new Parser(doc).getRelTags();

  Metadata parseMeta = documentParse.getData().getParseMeta();
  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }
  return parseResult;
}
示例2: testParseData
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Builds a {@code ParseData} with two outlinks and two metadata entries and
 * hands it to {@code WritableTestUtils.testWritable}.
 */
@Test
public void testParseData() throws Exception {
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  parseMeta.add("Charset", "UTF-8");

  Outlink fooLink = new Outlink("http://foo.com/", "Foo");
  Outlink barLink = new Outlink("http://bar.com/", "Bar");
  Outlink[] links = { fooLink, barLink };

  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
      "The Foo Page", links, parseMeta);
  WritableTestUtils.testWritable(parseData, null);
}
示例3: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Scans the HTML document for rel-tags and adds each one to the parse
 * metadata under {@code REL_TAG}.
 *
 * @param content     fetched content; its URL selects which parse to annotate
 * @param parseResult parse results the parse is looked up in
 * @param metaTags    HTML meta tags of the page (not read by this method)
 * @param doc         DOM fragment handed to the rel-tag parser
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
  // get parse obj
  Parse parse = parseResult.get(content.getUrl());

  // Trying to find the document's rel-tags
  Parser parser = new Parser(doc);
  // Wildcard instead of a raw Set: keeps the compiler's generic type checks
  // without assuming an element type the parser does not declare here.
  Set<?> tags = parser.getRelTags();

  Metadata metadata = parse.getData().getParseMeta();
  for (Object tag : tags) {
    metadata.add(REL_TAG, (String) tag);
  }
  return parseResult;
}
示例4: testParseData
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Creates a {@code ParseData} carrying a title, two outlinks and two metadata
 * entries, then passes it to {@code WritableTestUtils.testWritable}.
 */
public void testParseData() throws Exception {
  final String pageTitle = "The Foo Page";

  Outlink[] links = new Outlink[2];
  links[0] = new Outlink("http://foo.com/", "Foo");
  links[1] = new Outlink("http://bar.com/", "Bar");

  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  parseMeta.add("Charset", "UTF-8");

  ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, parseMeta);
  WritableTestUtils.testWritable(parseData, null);
}
示例5: assertContentType
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Runs {@code MoreIndexingFilter} over a stub parse whose metadata carries
 * the given Content-Type and asserts the value of the resulting
 * {@code "type"} field.
 *
 * @param conf     configuration applied to the filter
 * @param source   value stored under {@code Response.CONTENT_TYPE}
 * @param expected value the {@code "type"} field must have afterwards
 * @throws IndexingException if the filter fails
 */
private void assertContentType(Configuration conf, String source,
    String expected) throws IndexingException {
  Metadata parseMeta = new Metadata();
  parseMeta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text url = new Text("http://www.example.com/");

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse,
      url, new CrawlDatum(), new Inlinks());
  Object actualType = indexed.getFieldValue("type");
  Assert.assertEquals("mime type not detected", expected, actualType);
}
示例6: setUp
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Fills {@code parses} with one stub parse per entry of {@code MIME_TYPES};
 * each parse's metadata carries that entry as its Content-Type.
 */
@Before
public void setUp() throws Exception {
  for (int idx = 0; idx < MIME_TYPES.length; idx++) {
    Metadata typedMeta = new Metadata();
    typedMeta.add(Response.CONTENT_TYPE, MIME_TYPES[idx]);

    ParseData parseData =
        new ParseData(new ParseStatus(), "title", new Outlink[0], typedMeta);
    parses[idx] = new ParseImpl("text", parseData);
  }
}
示例7: mergeMetadata
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Appends every name/value pair of {@code second} to {@code first}.
 * Values are added, not replaced, so entries already in {@code first} stay.
 *
 * @param first  metadata that receives the entries
 * @param second metadata whose entries are copied
 */
private void mergeMetadata(Metadata first, Metadata second) {
  for (String key : second.names()) {
    for (String item : second.getValues(key)) {
      first.add(key, item);
    }
  }
}
示例8: addIndexedMetatags
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Adds a metatag to the parse metadata when it is selected for indexing,
 * i.e. when {@code metatagset} contains either {@code "*"} or the
 * lower-cased tag name. The entry is stored under
 * {@code "metatag." + <lower-cased name>}.
 *
 * @param metadata metadata that receives the entry
 * @param metatag  name of the metatag as found in the document
 * @param value    content of the metatag
 */
private void addIndexedMetatags(Metadata metadata, String metatag,
    String value) {
  final String lcMetatag = metatag.toLowerCase(Locale.ROOT);

  boolean selected = metatagset.contains("*") || metatagset.contains(lcMetatag);
  if (!selected) {
    return;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
  }
  metadata.add("metatag." + lcMetatag, value);
}
示例9: detectOGC
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Inspects the elements of the document and records OGC version/service
 * information in the metadata.
 * <ol>
 * <li>If the first selected element has a {@code version} attribute, it is
 * stored under {@code OGC_VERSION}.</li>
 * <li>The first element name that {@code containsOGC} matches against a key
 * of {@code CONTAINS_MAP} sets {@code OGC_SERVICE} to the mapped value and
 * ends the detection.</li>
 * <li>Otherwise {@code checkAtom} and then {@code checkWMTS} are consulted;
 * if neither matches, an info message is logged.</li>
 * </ol>
 *
 * @param dom      document to inspect
 * @param metadata metadata that receives the detected values
 * @throws JaxenException if the XPath cannot be built or evaluated
 */
private void detectOGC(Document dom, Metadata metadata) throws JaxenException {
  JDOMXPath allElements = new JDOMXPath("//*");
  allElements.addNamespace(nsc.getPrefix(), nsc.getURI());
  List<?> nodes = allElements.selectNodes(dom);

  // The first selected node is treated as the document root.
  Element rootElement = (Element) nodes.get(0);
  String version = rootElement.getAttributeValue("version");
  if (version != null) {
    metadata.add(OGC_VERSION, version);
  }

  for (Object node : nodes) {
    String elementName = ((Element) node).getName();
    for (Map.Entry<String, String> candidate : CONTAINS_MAP.entrySet()) {
      if (!containsOGC(elementName, candidate.getKey())) {
        continue;
      }
      // First match wins; no further elements are examined.
      metadata.add(OGC_SERVICE, candidate.getValue());
      return;
    }
  }

  // No element name matched: fall back to the structural checks.
  if (checkAtom(nodes)) {
    metadata.add(OGC_VERSION, "1.0");
    metadata.add(OGC_SERVICE, "atom");
    return;
  }
  if (checkWMTS(nodes)) {
    metadata.add(OGC_SERVICE, "wmts");
    return;
  }
  LOG.info("OGC service not detected");
}
示例10: testFilterCacheIndexingFilter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks that re-setting the indexing filter order on the same configuration
 * does not take effect: a document filtered after adding
 * {@code MetadataIndexer} to the order must have the same number of fields
 * as one filtered with {@code BasicIndexingFilter} alone.
 *
 * @throws IndexingException if one of the filters fails
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  final String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";

  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  // First pass: only the basic filter is configured.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);
  IndexingFilters basicOnly = new IndexingFilters(conf);
  ParseData plainData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  NutchDocument firstDoc = basicOnly.filter(new NutchDocument(),
      new ParseImpl("text", plainData), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  // Metadata entry that MetadataIndexer is configured to pick up.
  Metadata exampleMeta = new Metadata();
  exampleMeta.add("example", "data");
  conf.set("index.content.md", "example");

  // Second pass: same configuration object, order now names both filters.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);
  IndexingFilters reconfigured = new IndexingFilters(conf);
  ParseData dataWithMeta = new ParseData(new ParseStatus(), "title",
      new Outlink[0], exampleMeta);
  NutchDocument secondDoc = reconfigured.filter(new NutchDocument(),
      new ParseImpl("text", dataWithMeta), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  int fieldsBefore = firstDoc.getFieldNames().size();
  int fieldsAfter = secondDoc.getFieldNames().size();
  Assert.assertEquals(fieldsBefore, fieldsAfter);
}
示例11: testContentDispositionTitle
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Verifies that {@code MoreIndexingFilter} sets the {@code "title"} field to
 * the file name given in the Content-Disposition metadata entry.
 *
 * @throws IndexingException if the filter fails
 */
public void testContentDispositionTitle() throws IndexingException {
  Metadata parseMeta = new Metadata();
  parseMeta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(NutchConfiguration.create());

  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
  assertEquals("content-disposition not detected", "filename.ext",
      indexed.getFieldValue("title"));
}
示例12: assertContentType
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Filters a stub parse whose metadata carries the given Content-Type through
 * {@code MoreIndexingFilter} and asserts the resulting {@code "type"} field.
 *
 * @param conf     configuration applied to the filter
 * @param source   value stored under {@code Response.CONTENT_TYPE}
 * @param expected value the {@code "type"} field must have afterwards
 * @throws IndexingException if the filter fails
 */
private void assertContentType(Configuration conf, String source, String expected)
    throws IndexingException {
  Metadata parseMeta = new Metadata();
  parseMeta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}
示例13: checkMetatag
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Stores the value of the keywords metatag in the metadata. Any metatag
 * whose name differs from {@code HTML_METATAG_KEYWORDS} is ignored.
 *
 * @param metadata metadata that receives the value under
 *                 {@code STORE_METADATA_KEYWORDS}
 * @param name     name of the metatag being examined
 * @param value    content of the metatag
 */
protected void checkMetatag(Metadata metadata, String name, String value) {
  boolean isKeywordsTag = name.equals(HTML_METATAG_KEYWORDS);
  if (!isKeywordsTag) {
    return;
  }
  metadata.add(STORE_METADATA_KEYWORDS, value);
}
示例14: reduce
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Builds one {@code NutchParse} record for {@code key} from the link
 * references grouped under it and emits it to {@code output}.
 *
 * <p>Values with {@code len > 0} are treated as in-link reference lists and
 * are concatenated; any other value is treated as the out-link list, whose
 * entry count is {@code -len}. Page text and title come from
 * {@code generator}; the content metadata carries the segment name and an
 * MD5 signature of the page text.
 *
 * <p>The elapsed milliseconds of each phase are accumulated into
 * {@code cost}.
 *
 * @param key      key of the page being generated
 * @param values   link reference lists grouped under {@code key}
 * @param output   collector that receives the generated parse
 * @param reporter progress reporter (not used here)
 * @throws IOException if emitting the record fails
 */
@Override
public void reduce(Text key, Iterator<References> values,
    OutputCollector<Text, NutchParse> output, Reporter reporter)
    throws IOException {
  // Phase timestamps; System.currentTimeMillis() avoids allocating a Date
  // per measurement and yields the same value as new Date().getTime().
  long[] start = new long[7];
  start[0] = System.currentTimeMillis();

  // NOTE(review): if the framework reuses the value instance between next()
  // calls, the references retained below would alias one object - confirm
  // that References instances are safe to keep across iterations.
  References olinks = null;
  References ilinks = null;
  while (values.hasNext()) {
    References links = values.next();
    if (links.len > 0) {
      if (null == ilinks) {
        ilinks = links;
      } else {
        // Concatenate the in-links seen so far with the new ones.
        long[] ids = new long[links.len + ilinks.len];
        int j = 0;
        for (int i = 0; i < ilinks.len; i++) {
          ids[j++] = ilinks.refs[i];
        }
        for (int i = 0; i < links.len; i++) {
          ids[j++] = links.refs[i];
        }
        ilinks = new References(ids.length, ids);
      }
    } else {
      olinks = links;
    }
  }
  start[1] = System.currentTimeMillis();

  String[] txtContent = generator.genPageWordsAndTitls();
  ParseText text = new ParseText(txtContent[0]);
  start[2] = System.currentTimeMillis();

  // Out-link lists carry a non-positive len; its negation is the count.
  // A key without any out-link record yields an empty array instead of a
  // NullPointerException.
  int outlinkCount = (olinks == null) ? 0 : -olinks.len;
  Outlink[] outlinks = new Outlink[outlinkCount];
  for (int i = 0; i < outlinkCount; i++) {
    outlinks[i] = new Outlink(indexedUrls.get(olinks.refs[i]).toString());
  }
  start[3] = System.currentTimeMillis();

  Metadata contentMeta = new Metadata();
  contentMeta.add(Nutch.SEGMENT_NAME_KEY, segName);
  // Explicit UTF-8 so the signature does not depend on the JVM's default
  // charset (fully qualified to avoid touching the file's imports).
  byte[] textBytes =
      txtContent[0].getBytes(java.nio.charset.StandardCharsets.UTF_8);
  contentMeta.add(Nutch.SIGNATURE_KEY,
      StringUtil.toHexString(MD5Hash.digest(textBytes).getDigest()));
  ParseData data = new ParseData(new ParseStatus(ParseStatus.SUCCESS),
      txtContent[1], outlinks, contentMeta, new Metadata());
  start[4] = System.currentTimeMillis();

  Inlinks inlinks = new Inlinks();
  if (null != ilinks) {
    for (int i = 0; i < ilinks.len; i++) {
      inlinks.add(new Inlink(indexedUrls.get(ilinks.refs[i]).toString()));
    }
  }
  start[5] = System.currentTimeMillis();

  NutchParse parse = new NutchParse(inlinks, text, data);
  output.collect(key, parse);
  start[6] = System.currentTimeMillis();

  // Accumulate the duration of each phase.
  for (int i = 0; i < cost.length; i++) {
    cost[i] = cost[i] + start[i + 1] - start[i];
  }
}
示例15: testBasicIndexingFilter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Runs {@code BasicIndexingFilter} with a title limit of 10, a content limit
 * of 20 and domain indexing enabled, then checks the {@code title},
 * {@code domain}, {@code host}, {@code url}, {@code content} and
 * {@code tstamp} fields of the resulting document.
 */
@Test
public void testBasicIndexingFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);

  NutchDocument doc = new NutchDocument();

  // Parse input: title, one outlink, one metadata entry and the page text.
  Outlink[] links = { new Outlink("http://foo.com/", "Foo") };
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
      "The Foo Page", links, parseMeta);
  ParseImpl parse = new ParseImpl(
      "this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum datum = new CrawlDatum();
  datum.setFetchTime(100L);
  Inlinks noInlinks = new Inlinks();

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    filter.filter(doc, parse, pageUrl, datum, noInlinks);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.getMessage());
  }

  Assert.assertNotNull(doc);

  Object indexedTitle = doc.getField("title").getValues().get(0);
  Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
      indexedTitle);

  Object indexedDomain = doc.getField("domain").getValues().get(0);
  Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org",
      indexedDomain);

  Object indexedHost = doc.getField("host").getValues().get(0);
  Assert.assertEquals("test host, expect \"nutch.apache.org\"",
      "nutch.apache.org", indexedHost);

  Object indexedUrl = doc.getField("url").getValues().get(0);
  Assert.assertEquals(
      "test url, expect \"http://nutch.apache.org/index.html\"",
      "http://nutch.apache.org/index.html", indexedUrl);

  Object indexedContent = doc.getField("content").getValues().get(0);
  Assert.assertEquals("test content", "this is a sample foo", indexedContent);

  Date indexedTimestamp = (Date) doc.getField("tstamp").getValues().get(0);
  Assert.assertEquals("test fetch time", new Date(100L), indexedTimestamp);
}