當前位置: 首頁>>代碼示例>>Java>>正文


Java ParseImpl類代碼示例

本文整理匯總了Java中org.apache.nutch.parse.ParseImpl的典型用法代碼示例。如果您正苦於以下問題:Java ParseImpl類的具體用法?Java ParseImpl怎麼用?Java ParseImpl使用的例子?那麼,這裏精選的類代碼示例或許可以為您提供幫助。


ParseImpl類屬於org.apache.nutch.parse包,在下文中一共展示了ParseImpl類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。

示例1: testNoParts

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Asserts that, with {@code moreIndexingFilter.indexMimeTypeParts} disabled,
 * filtering an empty parse for an {@code index.html} URL leaves the document
 * with exactly one {@code type} value, {@code text/html}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());

  try {
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Attach the original exception as the cause so the test report carries
    // the real stack trace, even when the exception has no message.
    throw new AssertionError("filter threw " + e, e);
  }
  Assert.assertNotNull(doc);
  Assert.assertTrue(doc.getFieldNames().contains("type"));
  Assert.assertEquals(1, doc.getField("type").getValues().size());
  Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:26,代碼來源:TestMoreIndexingFilter.java

示例2: getParse

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Parses raw content as JavaScript: collects outlinks from the script via
 * {@code getJSLinks} and uses a prefix of its first line as the title.
 *
 * <p>If the declared content type is non-blank and does not start with
 * {@code application/x-javascript} (case-insensitively), the content is
 * rejected with a {@code FAILED_INVALID_FORMAT} status.
 *
 * @param c the content to parse
 * @return a parse result keyed by {@code c.getUrl()}; for non-JavaScript
 *         content, the empty parse result produced by the failure status
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the case folding independent of the default locale;
  // under e.g. a Turkish locale "APPLICATION/..." would lower-case to a
  // dotless i and wrongly fail the prefix check.
  if (type != null && !type.trim().equals("")
      && !type.toLowerCase(java.util.Locale.ROOT).startsWith(
          "application/x-javascript")) {
    return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
        "Content not JavaScript: '" + type + "'").getEmptyParseResult(
        c.getUrl(), getConf());
  }
  // NOTE(review): bytes are decoded with the platform default charset, as
  // before; confirm whether the content's declared encoding should be used.
  String script = new String(c.getContent());
  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }
  // Title: the first line of the script (or the whole script when it has no
  // newline), capped at MAX_TITLE_LEN characters.
  int lineEnd = script.indexOf('\n');
  if (lineEnd == -1) {
    lineEnd = script.length();
  }
  String title = script.substring(0, Math.min(lineEnd, MAX_TITLE_LEN));
  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
      c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:27,代碼來源:JSParseFilter.java

示例3: testDeduplicateAnchor

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks anchor de-duplication: with {@code anchorIndexingFilter.deduplicate}
 * enabled, three inlinks whose anchor texts are "text1", "text2" and "text2"
 * must yield exactly two {@code anchor} values.
 *
 * @throws Exception if the filter fails; propagated unchanged so the test
 *         report shows the original exception and stack trace
 */
@Test
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);
  AnchorIndexingFilter filter = new AnchorIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://test1.com/", "text1"));
  inlinks.add(new Inlink("http://test2.com/", "text2"));
  inlinks.add(new Inlink("http://test3.com/", "text2"));
  // No try/catch: the method already declares Exception, and letting it
  // propagate keeps the cause instead of reducing it to getMessage().
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
      new CrawlDatum(), inlinks);
  Assert.assertNotNull(doc);
  Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
      .contains("anchor"));
  Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
      .getValues().size());
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:27,代碼來源:TestAnchorIndexingFilter.java

示例4: testFilterOutlinks

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_OUTLINKS_HOST} set to "true", expects exactly one indexed
 * {@code outlinks} value, equal to the target URL of the first generated
 * outlink.
 */
@Test
public void testFilterOutlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  filter.setConf(conf);

  Outlink[] generated = generateOutlinks();

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(emptyDoc, parse, pageUrl,
      new CrawlDatum(), new Inlinks());

  int outlinkCount = indexed.getField("outlinks").getValues().size();
  Assert.assertEquals(1, outlinkCount);

  Assert.assertEquals("Filter outlinks, allow only those from a different host",
      generated[0].getToUrl(), indexed.getFieldValue("outlinks"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:17,代碼來源:TestLinksIndexingFilter.java

示例5: testFilterInlinks

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_INLINKS_HOST} set to "true", passes two inlinks (from
 * www.test.com and www.example.com) for a www.example.com page and expects
 * the single indexed {@code inlinks} value to be the www.test.com URL.
 */
@Test
public void testFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(emptyDoc, parse, pageUrl,
      new CrawlDatum(), incoming);

  int inlinkCount = indexed.getField("inlinks").getValues().size();
  Assert.assertEquals(1, inlinkCount);

  Assert.assertEquals("Filter inlinks, allow only those from a different host",
      "http://www.test.com", indexed.getFieldValue("inlinks"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:19,代碼來源:TestLinksIndexingFilter.java

示例6: testNoFilterInlinks

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_INLINKS_HOST} set to "false", expects the number of
 * indexed {@code inlinks} values to equal the number of inlinks supplied,
 * including the one from the page's own host.
 */
@Test
public void testNoFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(emptyDoc, parse, pageUrl,
      new CrawlDatum(), incoming);

  int expectedCount = incoming.size();
  int indexedCount = indexed.getField("inlinks").getValues().size();
  Assert.assertEquals("All inlinks must be indexed even those from the same host",
      expectedCount, indexedCount);
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:17,代碼來源:TestLinksIndexingFilter.java

示例7: testIndexHostsOnlyAndFilterOutlinks

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to
 * "true" on a fresh configuration, expects a single indexed {@code outlinks}
 * value equal to the host part of {@code http://www.test.com}.
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] generated = generateOutlinks(true);

  filter.setConf(conf);

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title", generated,
      metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(emptyDoc, parse, pageUrl,
      new CrawlDatum(), new Inlinks());

  int outlinkCount = indexed.getField("outlinks").getValues().size();
  Assert.assertEquals(1, outlinkCount);

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the outlinks after filtering",
      expectedHost,
      indexed.getFieldValue("outlinks"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:22,代碼來源:TestLinksIndexingFilter.java

示例8: testIndexHostsOnlyAndFilterInlinks

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_INLINKS_HOST} set to
 * "true" on a fresh configuration, passes inlinks from www.test.com and
 * www.example.com for a www.example.com page and expects a single indexed
 * {@code inlinks} value equal to the host part of {@code http://www.test.com}.
 */
@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");

  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(emptyDoc, parse, pageUrl,
      new CrawlDatum(), incoming);

  int inlinkCount = indexed.getField("inlinks").getValues().size();
  Assert.assertEquals(1, inlinkCount);

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the inlinks after filtering",
      expectedHost,
      indexed.getFieldValue("inlinks"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:25,代碼來源:TestLinksIndexingFilter.java

示例9: testPositiveFilter

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Runs a {@code RegexParseFilter} loaded from {@code regex-parsefilter.txt}
 * over a small HTML sample and expects the parse metadata keys
 * {@code first} and {@code second} to both be "true".
 */
public void testPositiveFilter() throws Exception {
  Configuration config = NutchConfiguration.create();

  String rulesFile = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
  RegexParseFilter regexFilter = new RegexParseFilter(rulesFile);
  regexFilter.setConf(config);

  String pageUrl = "http://nutch.apache.org/";
  String markup = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
  byte[] rawBytes = markup.getBytes("UTF-8");
  Content content = new Content(pageUrl, pageUrl, rawBytes, "text/html",
      new Metadata(), config);
  Parse parse = new ParseImpl("nutch this is the extracted text blablabla",
      new ParseData());

  // The filter's return value is not inspected; the assertions below read
  // the metadata from the parse object that was handed in.
  ParseResult wrapped = ParseResult.createParseResult(pageUrl, parse);
  regexFilter.filter(content, wrapped, null, null);

  Metadata parseMeta = parse.getData().getParseMeta();

  assertEquals("true", parseMeta.get("first"));
  assertEquals("true", parseMeta.get("second"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:21,代碼來源:TestRegexParseFilter.java

示例10: testNegativeFilter

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Runs a {@code RegexParseFilter} loaded from {@code regex-parsefilter.txt}
 * over a small HTML sample and expects the parse metadata keys
 * {@code first} and {@code second} to both be "false".
 */
public void testNegativeFilter() throws Exception {
  Configuration config = NutchConfiguration.create();

  String rulesFile = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
  RegexParseFilter regexFilter = new RegexParseFilter(rulesFile);
  regexFilter.setConf(config);

  String pageUrl = "http://nutch.apache.org/";
  String markup = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
  byte[] rawBytes = markup.getBytes("UTF-8");
  Content content = new Content(pageUrl, pageUrl, rawBytes, "text/html",
      new Metadata(), config);
  Parse parse = new ParseImpl("nutch this is the extracted text bla",
      new ParseData());

  // The filter's return value is not inspected; the assertions below read
  // the metadata from the parse object that was handed in.
  ParseResult wrapped = ParseResult.createParseResult(pageUrl, parse);
  regexFilter.filter(content, wrapped, null, null);

  Metadata parseMeta = parse.getData().getParseMeta();

  assertEquals("false", parseMeta.get("first"));
  assertEquals("false", parseMeta.get("second"));
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:21,代碼來源:TestRegexParseFilter.java

示例11: testNonExistingIndexingFilter

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Exercises {@code IndexingFilters} when the configured filter order names a
 * class that does not exist ("NonExistingFilter") ahead of
 * {@code BasicIndexingFilter}.
 *
 * <p>There are no assertions: the test passes as long as constructing the
 * filters and running them completes without throwing.
 *
 * @throws IndexingException if running the filters fails
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration config = NutchConfiguration.create();
  config.addResource("nutch-default.xml");
  config.addResource("crawl-tests.xml");

  String missingFilter = "NonExistingFilter";
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  String filterOrder = missingFilter + " " + basicFilter;
  config.set(IndexingFilters.INDEXINGFILTER_ORDER, filterOrder);

  IndexingFilters indexingFilters = new IndexingFilters(config);

  // Build the filter inputs step by step, in the order they are passed.
  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  indexingFilters.filter(emptyDoc, parse, pageUrl, new CrawlDatum(),
      new Inlinks());
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:21,代碼來源:TestIndexingFilters.java

示例12: testNoParts

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Asserts that, with {@code moreIndexingFilter.indexMimeTypeParts} disabled,
 * filtering an empty parse for an {@code index.html} URL leaves the document
 * with exactly one {@code type} value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration config = NutchConfiguration.create();
  config.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(config);
  assertNotNull(moreFilter);
  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(),
        new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  int typeCount = document.getField("type").getValues().size();
  assertEquals(1, typeCount);
  assertEquals("text/html", document.getFieldValue("type"));
}
 
開發者ID:yahoo,項目名稱:anthelion,代碼行數:25,代碼來源:TestMoreIndexingFilter.java

示例13: getParse

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Parses raw content as JavaScript: collects outlinks from the script via
 * {@code getJSLinks} and uses a prefix of its first line as the title.
 *
 * <p>If the declared content type is non-blank and does not start with
 * {@code application/x-javascript} (case-insensitively), the content is
 * rejected with a {@code FAILED_INVALID_FORMAT} status.
 *
 * @param c the content to parse
 * @return a parse result keyed by {@code c.getUrl()}; for non-JavaScript
 *         content, the empty parse result produced by the failure status
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the case folding independent of the default locale;
  // under e.g. a Turkish locale "APPLICATION/..." would lower-case to a
  // dotless i and wrongly fail the prefix check.
  if (type != null && !type.trim().equals("")
      && !type.toLowerCase(java.util.Locale.ROOT).startsWith("application/x-javascript")) {
    return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
            "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
  }
  // NOTE(review): bytes are decoded with the platform default charset, as
  // before; confirm whether the content's declared encoding should be used.
  String script = new String(c.getContent());
  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }
  // Title: the first line of the script (or the whole script when it has no
  // newline), capped at MAX_TITLE_LEN characters.
  int lineEnd = script.indexOf('\n');
  if (lineEnd == -1) {
    lineEnd = script.length();
  }
  String title = script.substring(0, Math.min(lineEnd, MAX_TITLE_LEN));
  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
                               c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
 
開發者ID:yahoo,項目名稱:anthelion,代碼行數:23,代碼來源:JSParseFilter.java

示例14: testDeduplicateAnchor

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks anchor de-duplication: with {@code anchorIndexingFilter.deduplicate}
 * enabled, three inlinks whose anchor texts are "text1", "text2" and "text2"
 * must yield exactly two {@code anchor} values.
 *
 * @throws Exception if the filter fails; propagated unchanged so the test
 *         report shows the original exception and stack trace
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);
  AnchorIndexingFilter filter = new AnchorIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://test1.com/", "text1"));
  inlinks.add(new Inlink("http://test2.com/", "text2"));
  inlinks.add(new Inlink("http://test3.com/", "text2"));
  // No try/catch: the method already declares Exception, and letting it
  // propagate keeps the cause instead of reducing it to getMessage().
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
  assertNotNull(doc);
  assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
}
 
開發者ID:yahoo,項目名稱:anthelion,代碼行數:23,代碼來源:TestAnchorIndexingFilter.java

示例15: setup

import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Re-creates the shared fixture before each test: a fresh scoring filter, a
 * parse over a short text, the target URL (as a string and as {@code Text}),
 * and empty datum, inlinks and content objects.
 */
@Before
public void setup() {
	scoringFilter = new SharkScoringFilter();
	parse = new ParseImpl("Hola que tal", new ParseData());

	// The URL is kept both as a plain string and wrapped in a Text.
	url = "http://wms.magrama.es/sig/Agricultura/TurcSecano/wms.aspx?request=GetCapabilities&service=WMS";
	urlText = new Text(url);

	datum = new CrawlDatum();
	inlinks = new Inlinks();
	content = new Content();
}
 
開發者ID:jorcox,項目名稱:GeoCrawler,代碼行數:12,代碼來源:SharkScoringTest.java


注:本文中的org.apache.nutch.parse.ParseImpl類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。