当前位置: 首页>>代码示例>>Java>>正文


Java Metadata.add方法代码示例

本文整理汇总了Java中org.apache.nutch.metadata.Metadata.add方法的典型用法代码示例。如果您正苦于以下问题:Java Metadata.add方法的具体用法?Java Metadata.add怎么用?Java Metadata.add使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.metadata.Metadata的用法示例。


在下文中一共展示了Metadata.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Looks for rel-tags in the HTML document and records every tag found in the
 * parse metadata of the page under {@code REL_TAG}.
 *
 * @param content     fetched content; its URL selects the parse to update
 * @param parseResult parse results in which the page's parse is looked up
 * @param metaTags    HTML meta tags of the page (not used by this method)
 * @param doc         DOM fragment that is scanned for rel-tags
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {

  // Parse entry registered for the URL of this content.
  Parse pageParse = parseResult.get(content.getUrl());

  // Collect the rel-tags of the document, then copy them one by one into
  // the parse metadata.
  Iterator<?> relTags = new Parser(doc).getRelTags().iterator();
  Metadata parseMeta = pageParse.getData().getParseMeta();
  while (relTags.hasNext()) {
    parseMeta.add(REL_TAG, (String) relTags.next());
  }

  return parseResult;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:RelTagParser.java

示例2: testParseData

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
@Test
public void testParseData() throws Exception {
  // Parse metadata carrying two entries.
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  parseMeta.add("Charset", "UTF-8");

  // Two outgoing links with anchor texts.
  Outlink fooLink = new Outlink("http://foo.com/", "Foo");
  Outlink barLink = new Outlink("http://bar.com/", "Bar");

  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
      "The Foo Page", new Outlink[] { fooLink, barLink }, parseMeta);

  // Hand the populated ParseData to the shared Writable test helper.
  WritableTestUtils.testWritable(parseData, null);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:TestParseData.java

示例3: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Scans the HTML document for rel-tags and adds each tag found to the parse
 * metadata of the page under {@code REL_TAG}.
 *
 * @param content     fetched content; its URL selects the parse to update
 * @param parseResult parse results in which the page's parse is looked up
 * @param metaTags    HTML meta tags of the page (not used by this method)
 * @param doc         DOM fragment that is scanned for rel-tags
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  // get parse obj
  Parse parse = parseResult.get(content.getUrl());
  // Trying to find the document's rel-tags
  Parser parser = new Parser(doc);
  // Wildcard-typed rather than raw: no rawtypes/unchecked warnings, and it
  // accepts whatever element type getRelTags() declares.
  Set<?> tags = parser.getRelTags();
  Metadata metadata = parse.getData().getParseMeta();
  for (Object tag : tags) {
    metadata.add(REL_TAG, (String) tag);
  }
  return parseResult;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:19,代码来源:RelTagParser.java

示例4: testParseData

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
public void testParseData() throws Exception {
  // Outgoing links of the sample page.
  Outlink[] links = {
    new Outlink("http://foo.com/", "Foo"),
    new Outlink("http://bar.com/", "Bar")
  };

  // Parse metadata of the sample page.
  Metadata pageMeta = new Metadata();
  pageMeta.add("Language", "en/us");
  pageMeta.add("Charset", "UTF-8");

  ParseData sample =
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, pageMeta);

  // Pass the sample to the shared Writable test helper.
  WritableTestUtils.testWritable(sample, null);
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:18,代码来源:TestParseData.java

示例5: assertContentType

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Runs {@code MoreIndexingFilter} on a parse whose metadata carries the given
 * content type and asserts the value of the resulting "type" field.
 *
 * @param conf     configuration handed to the filter
 * @param source   value stored under {@code Response.CONTENT_TYPE}
 * @param expected value the "type" field of the document must have
 * @throws IndexingException if the filter fails
 */
private void assertContentType(Configuration conf, String source,
    String expected) throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  // Minimal parse that carries nothing but the content-type metadata.
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Assert.assertEquals("mime type not detected", expected,
      indexed.getFieldValue("type"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:14,代码来源:TestMoreIndexingFilter.java

示例6: setUp

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
@Before
public void setUp() throws Exception {
  // Build one parse per entry of MIME_TYPES, each carrying that MIME type as
  // its content type, and store it at the same index in parses.
  for (int idx = 0; idx < MIME_TYPES.length; idx++) {
    Metadata contentTypeMeta = new Metadata();
    contentTypeMeta.add(Response.CONTENT_TYPE, MIME_TYPES[idx]);

    ParseData data = new ParseData(new ParseStatus(), "title",
        new Outlink[0], contentTypeMeta);

    parses[idx] = new ParseImpl("text", data);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:13,代码来源:MimeTypeIndexingFilterTest.java

示例7: mergeMetadata

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Adds every value of every name found in {@code second} to {@code first}.
 *
 * @param first  metadata that receives the values
 * @param second metadata whose names and values are read
 */
private void mergeMetadata(Metadata first, Metadata second) {
  for (String key : second.names()) {
    for (String entry : second.getValues(key)) {
      first.add(key, entry);
    }
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:9,代码来源:FeedParser.java

示例8: addIndexedMetatags

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Adds a meta tag to the parse metadata when it is selected for indexing,
 * i.e. when the set of indexed meta tags contains either '*' or the
 * lower-cased tag name. The entry is stored under the key
 * {@code "metatag." + <lower-cased name>}.
 *
 * @param metadata parse metadata receiving the entry
 * @param metatag  name of the meta tag; lower-cased before it is looked up
 * @param value    content of the meta tag
 */
private void addIndexedMetatags(Metadata metadata, String metatag,
    String value) {
  String normalizedTag = metatag.toLowerCase(Locale.ROOT);

  boolean indexed = metatagset.contains("*")
      || metatagset.contains(normalizedTag);
  if (!indexed) {
    return;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Found meta tag: " + normalizedTag + "\t" + value);
  }
  metadata.add("metatag." + normalizedTag, value);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:15,代码来源:MetaTagsParser.java

示例9: detectOGC

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Inspects the elements of an XML document and records OGC version and
 * service information in the given metadata.
 * <p>
 * The {@code version} attribute of the first selected element, when present,
 * is stored under {@code OGC_VERSION}. Each element name is then checked
 * against the keys of {@code CONTAINS_MAP}; on the first match the mapped
 * value is stored under {@code OGC_SERVICE} and the method returns. Without
 * a match, the Atom check and then the WMTS check are tried, and a log
 * message is written when neither succeeds.
 *
 * @param dom      document whose elements are examined
 * @param metadata metadata that receives the detected version and service
 * @throws JaxenException declared for the XPath construction and evaluation
 */
private void detectOGC(Document dom, Metadata metadata) throws JaxenException {
	// Select every element of the document.
	JDOMXPath allElements = new JDOMXPath("//*");
	allElements.addNamespace(nsc.getPrefix(), nsc.getURI());
	List<?> nodes = allElements.selectNodes(dom);

	// The first selected element supplies the version attribute.
	String version = ((Element) nodes.get(0)).getAttributeValue("version");
	if (version != null) {
		metadata.add(OGC_VERSION, version);
	}

	// The first element name matching a known key decides the service.
	for (Object node : nodes) {
		String elementName = ((Element) node).getName();
		for (Map.Entry<String, String> candidate : CONTAINS_MAP.entrySet()) {
			if (!containsOGC(elementName, candidate.getKey())) {
				continue;
			}
			metadata.add(OGC_SERVICE, candidate.getValue());
			return;
		}
	}

	// No name matched: fall back to the structural checks.
	if (checkAtom(nodes)) {
		metadata.add(OGC_VERSION, "1.0");
		metadata.add(OGC_SERVICE, "atom");
		return;
	}
	if (checkWMTS(nodes)) {
		metadata.add(OGC_SERVICE, "wmts");
		return;
	}
	LOG.info("OGC service not detected");
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:36,代码来源:OgcParseFilter.java

示例10: testFilterCacheIndexingFilter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks that changing the configured indexing filter order after an
 * {@code IndexingFilters} instance has already been created does not take
 * effect: the documents produced before and after the change must have the
 * same number of fields.
 *
 * @throws IndexingException if one of the filters fails
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  // First pass: only the basic indexing filter is configured.
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);

  IndexingFilters firstFilters = new IndexingFilters(conf);
  ParseData emptyData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  NutchDocument firstDoc = firstFilters.filter(new NutchDocument(),
      new ParseImpl("text", emptyData), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  // Second pass: additionally configure the metadata indexer.
  String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";
  // Metadata entry that the metadata indexer is configured to pick up.
  Metadata exampleMeta = new Metadata();
  exampleMeta.add("example", "data");
  conf.set("index.content.md", "example");
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
      basicFilter + " " + metadataFilter);

  IndexingFilters secondFilters = new IndexingFilters(conf);
  ParseData exampleData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], exampleMeta);
  NutchDocument secondDoc = secondFilters.filter(new NutchDocument(),
      new ParseImpl("text", exampleData), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  Assert.assertEquals(firstDoc.getFieldNames().size(),
      secondDoc.getFieldNames().size());
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:37,代码来源:TestIndexingFilters.java

示例11: testContentDispositionTitle

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks that the file name given in a Content-Disposition metadata entry
 * ends up as the "title" field of the indexed document.
 *
 * @throws IndexingException if the filter fails
 */
public void testContentDispositionTitle() throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  Configuration conf = NutchConfiguration.create();
  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  // Minimal parse that carries nothing but the content-disposition metadata.
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext",
      indexed.getFieldValue("title"));
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:15,代码来源:TestMoreIndexingFilter.java

示例12: assertContentType

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Indexes a parse whose metadata carries the given content type with
 * {@code MoreIndexingFilter} and asserts the resulting "type" field.
 *
 * @param conf     configuration handed to the filter
 * @param source   value stored under {@code Response.CONTENT_TYPE}
 * @param expected value the "type" field of the document must have
 * @throws IndexingException if the filter fails
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata contentTypeMeta = new Metadata();
  contentTypeMeta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  // Build the filter input step by step instead of in one nested expression.
  ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], contentTypeMeta);
  ParseImpl parse = new ParseImpl("text", data);
  Text url = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(new NutchDocument(), parse, url, new CrawlDatum(), new Inlinks());
  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:11,代码来源:TestMoreIndexingFilter.java

示例13: checkMetatag

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks whether this is the keywords meta tag and, if so, stores its value
 * in the metadata under {@code STORE_METADATA_KEYWORDS}.
 * <p>
 * Meta tags with any other name, or with a {@code null} name, are ignored.
 *
 * @param metadata metadata that receives the keywords value
 * @param name     name of the meta tag being inspected; may be {@code null}
 * @param value    content of the meta tag
 */
protected void checkMetatag(Metadata metadata, String name, String value) {
  // Guard against a missing name so such a tag is skipped instead of raising
  // a NullPointerException.
  if (name != null && name.equals(HTML_METATAG_KEYWORDS)) {
    metadata.add(STORE_METADATA_KEYWORDS, value);
  }
}
 
开发者ID:dkd,项目名称:nutch-typo3-cms,代码行数:13,代码来源:KeywordsParser.java

示例14: reduce

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
@Override
		public void reduce(Text key, Iterator<References> values,
				OutputCollector<Text, NutchParse> output, Reporter reporter)
				throws IOException {

			// Timestamps taken between the phases below; the differences of
			// consecutive stamps are added to the cost counters at the end.
			long[] stamps = new long[7];
			stamps[0] = System.currentTimeMillis();

			// Phase 1: split the incoming references. A record with a
			// positive length contributes to the inlinks (several such
			// records are concatenated); any other record is kept as the
			// outlink record.
			// NOTE(review): Hadoop's old-API reducers may reuse the value
			// instance between next() calls; confirm that References values
			// can safely be retained across iterations here.
			References outRefs = null;
			References inRefs = null;
			while (values.hasNext()) {
				References current = values.next();
				if (current.len <= 0) {
					outRefs = current;
					continue;
				}
				if (inRefs == null) {
					inRefs = current;
					continue;
				}
				// Append the new ids behind the ones gathered so far.
				long[] merged = new long[current.len + inRefs.len];
				int pos = 0;
				for (int i = 0; i < inRefs.len; i++) {
					merged[pos++] = inRefs.refs[i];
				}
				for (int i = 0; i < current.len; i++) {
					merged[pos++] = current.refs[i];
				}
				inRefs = new References(merged.length, merged);
			}
			stamps[1] = System.currentTimeMillis();

			// Phase 2: generated page content; index 0 is used as the text,
			// index 1 as the title further below.
			String[] pageContent = generator.genPageWordsAndTitls();
			ParseText text = new ParseText(pageContent[0]);
			stamps[2] = System.currentTimeMillis();

			// Phase 3: the outlink record stores its length negated.
			// NOTE(review): outRefs is still null when no record with a
			// non-positive length was seen; this line then throws a
			// NullPointerException - confirm the input always contains one.
			int outCount = -outRefs.len;
			Outlink[] outlinks = new Outlink[outCount];
			for (int i = 0; i < outCount; i++) {
				outlinks[i] = new Outlink(indexedUrls.get(outRefs.refs[i]).toString());
			}
			stamps[3] = System.currentTimeMillis();

			// Phase 4: content metadata with the segment name and an MD5
			// signature of the page text.
			// NOTE(review): getBytes() uses the platform default charset, so
			// the signature may differ between platforms - confirm intended.
			byte[] textBytes = pageContent[0].getBytes();
			String signature = StringUtil.toHexString(MD5Hash.digest(textBytes).getDigest());
			Metadata contentMeta = new Metadata();
			contentMeta.add(Nutch.SEGMENT_NAME_KEY, segName);
			contentMeta.add(Nutch.SIGNATURE_KEY, signature);

			ParseData data = new ParseData(new ParseStatus(ParseStatus.SUCCESS),
					pageContent[1], outlinks, contentMeta, new Metadata());
			stamps[4] = System.currentTimeMillis();

			// Phase 5: inlinks, when any were collected.
			Inlinks inlinks = new Inlinks();
			if (inRefs != null) {
				for (int i = 0; i < inRefs.len; i++) {
					inlinks.add(new Inlink(indexedUrls.get(inRefs.refs[i]).toString()));
				}
			}
			stamps[5] = System.currentTimeMillis();

			// Phase 6: emit the assembled parse.
			NutchParse parse = new NutchParse(inlinks, text, data);
			output.collect(key, parse);
			stamps[6] = System.currentTimeMillis();

			// Accumulate the elapsed time of each phase.
			for (int i = 0; i < cost.length; i++) {
				cost[i] = cost[i] + stamps[i + 1] - stamps[i];
			}
		}
 
开发者ID:thrill,项目名称:fst-bench,代码行数:76,代码来源:NutchData.java

示例15: testBasicIndexingFilter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Runs {@code BasicIndexingFilter} on a small sample page and checks the
 * title, domain, host, url, content and tstamp fields of the resulting
 * document. Title and content length limits of 10 and 20 are configured,
 * and the expected title and content are cut to those lengths.
 *
 * @throws Exception if the filter fails; the exception is left to propagate
 *                   so the test report shows its full stack trace
 */
@Test
public void testBasicIndexingFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);

  NutchDocument doc = new NutchDocument();

  String title = "The Foo Page";
  Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata metaData = new Metadata();
  metaData.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
      outlinks, metaData);
  ParseImpl parse = new ParseImpl(
      "this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum crawlDatum = new CrawlDatum();
  crawlDatum.setFetchTime(100L);

  Inlinks inlinks = new Inlinks();

  // No try/catch here: catching the exception only to print it and fail with
  // its message loses the cause. The method already declares throws
  // Exception, so a failure reaches the test runner with its stack trace.
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
      crawlDatum, inlinks);

  Assert.assertNotNull(doc);
  Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
      .getField("title").getValues().get(0));
  Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
      .getField("domain").getValues().get(0));
  Assert.assertEquals("test host, expect \"nutch.apache.org\"",
      "nutch.apache.org", doc.getField("host").getValues().get(0));
  Assert.assertEquals(
      "test url, expect \"http://nutch.apache.org/index.html\"",
      "http://nutch.apache.org/index.html", doc.getField("url").getValues()
          .get(0));
  Assert.assertEquals("test content", "this is a sample foo",
      doc.getField("content").getValues().get(0));
  Assert.assertEquals("test fetch time", new Date(100L),
      (Date) doc.getField("tstamp").getValues().get(0));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:51,代码来源:TestBasicIndexingFilter.java


注:本文中的org.apache.nutch.metadata.Metadata.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。