当前位置: 首页>>代码示例>>Java>>正文


Java NutchDocument类代码示例

本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument的典型用法代码示例。如果您正苦于以下问题：Java NutchDocument类的具体用法？Java NutchDocument怎么用？Java NutchDocument使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


NutchDocument类属于org.apache.nutch.indexer包,在下文中一共展示了NutchDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: indexerScore

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Scales the initial indexing score by the boost of every top-level domain
 * found in the document's "tld" field.
 *
 * <p>Each value of the "tld" field is looked up in {@code tldEntries}; values
 * without an entry are skipped. When the document has no "tld" field the
 * multiplier stays at 1.0.
 *
 * @param doc
 *          document whose "tld" field is inspected
 * @param initScore
 *          score to be scaled
 * @return {@code initScore} multiplied by the product of all matching boosts
 * @throws ScoringFilterException
 *           declared by the signature; not thrown directly by this body
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");

  // nothing to look up: the score is returned with the neutral multiplier
  if (tldField == null) {
    return initScore * multiplier;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue; // unknown TLD contributes no boost
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TLDScoringFilter.java

示例2: testEmptyIndexStatic

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that running the filter on a fresh document leaves it without any
 * field. The {@code conf} fixture is set up outside this method; presumably it
 * carries an empty {@code index.static} value (confirm against the test
 * set-up).
 * 
 * @throws Exception
 *           declared by the signature; failures inside the filter call are
 *           reported through {@code Assert.fail}
 */
@Test
public void testEmptyIndexStatic() throws Exception {

  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument emptyDoc = new NutchDocument();

  try {
    filter.filter(emptyDoc, parse, url, crawlDatum, inlinks);
  } catch (Exception ex) {
    ex.printStackTrace();
    Assert.fail(ex.getMessage());
  }

  Assert.assertNotNull(emptyDoc);
  boolean noFieldsAdded = emptyDoc.getFieldNames().isEmpty();
  Assert.assertTrue("tests if no field is set for empty index.static",
      noFieldsAdded);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:TestStaticFieldIndexerTest.java

示例3: filter

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Copies selected CrawlDatum metadata entries into the document.
 *
 * <p>For every tag name in {@code urlMetaTags}, the CrawlDatum metadata is
 * queried with that name as key; when a value is present it is added to the
 * document as a field named after the tag. The document is returned untouched
 * when either {@code urlMetaTags} or {@code doc} is {@code null}.
 * 
 * @param doc
 *          document to enrich; may be {@code null}
 * @param datum
 *          source of the metadata values
 * @return the same {@code doc} instance that was passed in
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // re-apply the configuration whenever one is available
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tagName : urlMetaTags) {
      Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
      if (tagValue == null) {
        continue; // tag not present on this datum
      }
      doc.add(tagName, tagValue.toString());
    }
  }

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:URLMetaIndexingFilter.java

示例4: addTime

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds the "lastModified" (when a Last-Modified header exists) and "date"
 * fields to the document.
 *
 * <p>The timestamp is taken from the Last-Modified metadata via
 * {@code getTime}. If that leaves the timestamp at -1 (header absent, or
 * presumably an unparsable header; confirm against {@code getTime}), the
 * CrawlDatum's modified time is used, falling back to its fetch time when the
 * modified time is not positive.
 *
 * @param doc
 *          document receiving the fields
 * @param data
 *          parse data holding the Last-Modified metadata
 * @param url
 *          passed through to {@code getTime}
 * @param datum
 *          source of the fallback timestamps
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long timestamp = -1;

  final String headerValue = data.getMeta(Metadata.LAST_MODIFIED);
  if (headerValue != null) {
    // the header wins: whatever getTime yields is indexed as lastModified
    timestamp = getTime(headerValue, url);
    doc.add("lastModified", new Date(timestamp));
  }

  if (timestamp == -1) {
    // no usable header value: prefer the datum's modified time, else the
    // time the fetch took place
    long modified = datum.getModifiedTime();
    timestamp = modified > 0 ? modified : datum.getFetchTime();
  }

  doc.add("date", new Date(timestamp));
  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:MoreIndexingFilter.java

示例5: testNoParts

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code moreIndexingFilter.indexMimeTypeParts} switched off, the "type"
 * field must hold exactly one value, {@code "text/html"}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration config = NutchConfiguration.create();
  config.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(config);
  Assert.assertNotNull(indexingFilter);

  NutchDocument indexed = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    indexingFilter.filter(indexed, parsed, pageUrl, new CrawlDatum(),
        new Inlinks());
  } catch (Exception ex) {
    ex.printStackTrace();
    Assert.fail(ex.getMessage());
  }

  Assert.assertNotNull(indexed);
  boolean hasTypeField = indexed.getFieldNames().contains("type");
  Assert.assertTrue(hasTypeField);
  int typeValueCount = indexed.getField("type").getValues().size();
  Assert.assertEquals(1, typeValueCount);
  Assert.assertEquals("text/html", indexed.getFieldValue("type"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:26,代码来源:TestMoreIndexingFilter.java

示例6: testMissingConfigFile

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * When the regex-file property is absent from the configuration, every parse
 * in the {@code parses} fixture must yield a non-null document.
 */
@Test
public void testMissingConfigFile() throws Exception {
  String configuredFile = conf.get(
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
  String message = String.format(
      "Property %s must not be present in the the configuration file",
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE);
  Assert.assertEquals(message, "", configuredFile);

  filter.setConf(conf);

  // without the property no rule applies, so nothing may be filtered out
  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());

    Assert.assertNotNull("All documents must be allowed by default", result);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:MimeTypeIndexingFilterTest.java

示例7: testAllowOnlyImages

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With the "allow-images.txt" rule file configured, only parses whose entry
 * in {@code MIME_TYPES} contains "image" may pass the filter; all others must
 * come back as {@code null}.
 */
@Test
public void testAllowOnlyImages() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());

    boolean isImage = MIME_TYPES[idx].contains("image");
    if (!isImage) {
      Assert.assertNull("Block everything else", result);
    } else {
      Assert.assertNotNull("Allow only images", result);
    }
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:MimeTypeIndexingFilterTest.java

示例8: testBlockHTML

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With the "block-html.txt" rule file configured, parses whose entry in
 * {@code MIME_TYPES} contains "html" must be rejected ({@code null}); every
 * other parse must pass.
 */
@Test
public void testBlockHTML() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());

    boolean isHtml = MIME_TYPES[idx].contains("html");
    if (!isHtml) {
      Assert.assertNotNull("Allow everything else", result);
    } else {
      Assert.assertNull("Block only HTML documents", result);
    }
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:MimeTypeIndexingFilterTest.java

示例9: filter

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds a "lang" field to the document.
 *
 * <p>The value is read from the parse metadata under
 * {@code Metadata.LANGUAGE}; when that is {@code null}, the content metadata
 * is consulted under {@code Response.CONTENT_LANGUAGE}. A missing or empty
 * result is indexed as {@code "unknown"}.
 *
 * @param doc
 *          document receiving the "lang" field
 * @param parse
 *          source of both metadata lookups
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  // first choice: a language already recorded in the parse metadata
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // second choice: what the HTTP header tells us about the language
  if (detected == null) {
    detected = parse.getData().getContentMeta()
        .get(Response.CONTENT_LANGUAGE);
  }

  final String language = (detected == null || detected.isEmpty())
      ? "unknown"
      : detected;

  doc.add("lang", language);

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:LanguageIndexingFilter.java

示例10: testDeduplicateAnchor

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code anchorIndexingFilter.deduplicate} enabled, three inlinks that
 * carry only two distinct anchor texts must produce an "anchor" field with
 * two values.
 */
@Test
public void testDeduplicateAnchor() throws Exception {
  Configuration config = NutchConfiguration.create();
  config.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(config);
  Assert.assertNotNull(anchorFilter);

  NutchDocument indexed = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // "text2" appears twice on purpose
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    anchorFilter.filter(indexed, parsed, pageUrl, new CrawlDatum(), incoming);
  } catch (Exception ex) {
    ex.printStackTrace();
    Assert.fail(ex.getMessage());
  }

  Assert.assertNotNull(indexed);
  boolean hasAnchor = indexed.getFieldNames().contains("anchor");
  Assert.assertTrue("test if there is an anchor at all", hasAnchor);
  int anchorCount = indexed.getField("anchor").getValues().size();
  Assert.assertEquals("test dedup, we expect 2", 2, anchorCount);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:27,代码来源:TestAnchorIndexingFilter.java

示例11: addUrlFeatures

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds one feature to the document for each component of a license URL's
 * path. The path is split at slashes and dashes, the first token is dropped
 * and every remaining token is handed to {@code addFeature}. Urls are of the
 * form "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
 * license feature.
 *
 * <p>A malformed URL is logged at warn level and otherwise ignored.
 *
 * @param doc
 *          document receiving the features
 * @param urlString
 *          license URL to take apart
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    URL licenseUrl = new URL(urlString);

    // split the path wherever a slash or a dash occurs
    StringTokenizer parts = new StringTokenizer(licenseUrl.getPath(), "/-");

    // the leading token ("licenses") is not a feature
    if (parts.hasMoreTokens()) {
      parts.nextToken();
    }

    // everything after it is
    for (; parts.hasMoreTokens();) {
      addFeature(doc, parts.nextToken());
    }
  } catch (MalformedURLException ex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + ex);
    }
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:27,代码来源:CCIndexingFilter.java

示例12: testFilterOutlinks

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_OUTLINKS_HOST} set to "true", exactly one outlink must
 * remain in the "outlinks" field, and it must be the first generated one.
 */
@Test
public void testFilterOutlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  filter.setConf(conf);

  Outlink[] generated = generateOutlinks();

  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), new Inlinks());

  int outlinkCount = result.getField("outlinks").getValues().size();
  Assert.assertEquals(1, outlinkCount);

  Assert.assertEquals("Filter outlinks, allow only those from a different host",
      generated[0].getToUrl(), result.getFieldValue("outlinks"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TestLinksIndexingFilter.java

示例13: testFilterInlinks

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_INLINKS_HOST} set to "true", only the inlink from
 * "http://www.test.com" must remain in the "inlinks" field of a page served
 * from www.example.com.
 */
@Test
public void testFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), incoming);

  int inlinkCount = result.getField("inlinks").getValues().size();
  Assert.assertEquals(1, inlinkCount);

  Assert.assertEquals("Filter inlinks, allow only those from a different host",
      "http://www.test.com", result.getFieldValue("inlinks"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:TestLinksIndexingFilter.java

示例14: testNoFilterInlinks

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With {@code LINKS_INLINKS_HOST} set to "false", the "inlinks" field must
 * hold as many values as inlinks were supplied, including the one that shares
 * the page's host.
 */
@Test
public void testNoFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), incoming);

  Assert.assertEquals("All inlinks must be indexed even those from the same host",
      incoming.size(), result.getField("inlinks").getValues().size());
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TestLinksIndexingFilter.java

示例15: testIndexHostsOnlyAndFilterOutlinks

import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to
 * "true", a single outlink must remain and its indexed value must equal the
 * host of "http://www.test.com".
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  // start from a fresh configuration so only the two flags below are set here
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] generated = generateOutlinks(true);

  filter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), new Inlinks());

  int outlinkCount = result.getField("outlinks").getValues().size();
  Assert.assertEquals(1, outlinkCount);

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the outlinks after filtering",
      expectedHost, result.getFieldValue("outlinks"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestLinksIndexingFilter.java


注:本文中的org.apache.nutch.indexer.NutchDocument类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。