本文整理匯總了 Java 中 org.apache.nutch.parse.ParseImpl 類的典型用法代碼示例。如果您正苦於以下問題:Java ParseImpl 類的具體用法?Java ParseImpl 怎麼用?Java ParseImpl 使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
ParseImpl 類屬於 org.apache.nutch.parse 包,在下文中一共展示了 ParseImpl 類的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Java 代碼示例。
示例1: testNoParts
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks that, with {@code moreIndexingFilter.indexMimeTypeParts} set to
 * {@code false}, {@link MoreIndexingFilter} adds a {@code type} field holding
 * exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);

  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  try {
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Fail the test but keep the original exception as the cause, so the
    // report shows where the filter broke instead of only its message
    // (which may be null).
    throw new AssertionError(e.getMessage(), e);
  }

  Assert.assertNotNull(doc);
  Assert.assertTrue(doc.getFieldNames().contains("type"));
  Assert.assertEquals(1, doc.getField("type").getValues().size());
  Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
示例2: getParse
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Parses JavaScript content: collects outlinks via {@code getJSLinks} and uses
 * the first line of the script, capped at {@code MAX_TITLE_LEN} characters, as
 * the title.
 *
 * @param c the content to parse; its content type must be {@code null}, blank,
 *          or start with {@code application/x-javascript} (case-insensitive)
 * @return a parse result whose text is the whole script, or the empty parse
 *         result of a {@code FAILED_INVALID_FORMAT} status for any other
 *         content type
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the case-insensitive match independent of the JVM's
  // default locale (e.g. Turkish maps 'I' to a dotless 'ı').
  if (type != null && !type.trim().equals("")
      && !type.toLowerCase(java.util.Locale.ROOT)
          .startsWith("application/x-javascript")) {
    return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
        "Content not JavaScript: '" + type + "'").getEmptyParseResult(
        c.getUrl(), getConf());
  }

  // Decode with an explicit charset so the result does not vary with the
  // platform default.
  // NOTE(review): UTF-8 is assumed here; if the content's declared charset is
  // available it should be preferred - confirm against the fetcher.
  String script = new String(c.getContent(),
      java.nio.charset.StandardCharsets.UTF_8);

  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }

  // Title? use the first line of the script (or the whole script when it has
  // no line break), truncated to MAX_TITLE_LEN characters.
  int lineEnd = script.indexOf('\n');
  if (lineEnd == -1) {
    lineEnd = script.length();
  }
  String title = script.substring(0, Math.min(lineEnd, MAX_TITLE_LEN));

  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
      c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
示例3: testDeduplicateAnchor
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks that, with {@code anchorIndexingFilter.deduplicate} enabled,
 * {@link AnchorIndexingFilter} indexes each distinct anchor text once: three
 * inlinks carrying the texts "text1", "text2", "text2" must yield two
 * {@code anchor} values.
 *
 * @throws Exception if the filter fails; the exception propagates so that the
 *         test runner reports it with its full stack trace
 */
@Test
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);
  AnchorIndexingFilter filter = new AnchorIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);

  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://test1.com/", "text1"));
  inlinks.add(new Inlink("http://test2.com/", "text2"));
  inlinks.add(new Inlink("http://test3.com/", "text2"));

  // No try/catch: the method declares "throws Exception", so an unexpected
  // failure surfaces with its cause instead of a bare message.
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
      new CrawlDatum(), inlinks);

  Assert.assertNotNull(doc);
  Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
      .contains("anchor"));
  Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
      .getValues().size());
}
示例4: testFilterOutlinks
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_OUTLINKS_HOST} set to "true", expects the filter to index
 * exactly one outlink whose value equals the target URL of the first
 * generated outlink.
 *
 * @throws Exception if the filter or a fixture helper fails
 */
@Test
public void testFilterOutlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  filter.setConf(conf);

  Outlink[] outlinks = generateOutlinks();
  ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks,
      metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl,
      new CrawlDatum(), new Inlinks());

  Assert.assertEquals(1, indexed.getField("outlinks").getValues().size());
  Assert.assertEquals("Filter outlinks, allow only those from a different host",
      outlinks[0].getToUrl(), indexed.getFieldValue("outlinks"));
}
示例5: testFilterInlinks
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_INLINKS_HOST} set to "true", expects only one of the two
 * inlinks to be indexed for a page on www.example.com: the one coming from
 * http://www.test.com.
 *
 * @throws Exception if the filter fails
 */
@Test
public void testFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData), pageUrl, new CrawlDatum(), incoming);

  Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());
  Assert.assertEquals("Filter inlinks, allow only those from a different host",
      "http://www.test.com", indexed.getFieldValue("inlinks"));
}
示例6: testNoFilterInlinks
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With {@code LINKS_INLINKS_HOST} set to "false", expects every inlink to be
 * indexed, including the one from the page's own host.
 *
 * @throws Exception if the filter fails
 */
@Test
public void testNoFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData), pageUrl, new CrawlDatum(), incoming);

  Assert.assertEquals("All inlinks must be indexed even those from the same host",
      incoming.size(), indexed.getField("inlinks").getValues().size());
}
示例7: testIndexHostsOnlyAndFilterOutlinks
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to
 * "true" on a fresh configuration, expects a single indexed outlink whose
 * value is the host part of http://www.test.com.
 *
 * @throws Exception if the filter or a fixture helper fails
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] links = generateOutlinks(true);
  filter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", links,
      metadata);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData), pageUrl, new CrawlDatum(),
      new Inlinks());

  Assert.assertEquals(1, indexed.getField("outlinks").getValues().size());

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the outlinks after filtering",
      expectedHost, indexed.getFieldValue("outlinks"));
}
示例8: testIndexHostsOnlyAndFilterInlinks
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_INLINKS_HOST} set to
 * "true" on a fresh configuration, expects a single indexed inlink whose
 * value is the host part of http://www.test.com.
 *
 * @throws Exception if the filter fails
 */
@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com", "test"));
  incoming.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument indexed = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData), pageUrl, new CrawlDatum(), incoming);

  Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the inlinks after filtering",
      expectedHost, indexed.getFieldValue("inlinks"));
}
示例9: testPositiveFilter
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Runs a {@link RegexParseFilter} built from the sample
 * {@code regex-parsefilter.txt} file over a small HTML page and its extracted
 * text, then expects the parse metadata keys {@code first} and {@code second}
 * to both be "true".
 *
 * @throws Exception if the content cannot be built or the filter fails
 */
public void testPositiveFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();

  RegexParseFilter filter = new RegexParseFilter(
      SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  filter.setConf(conf);

  String url = "http://nutch.apache.org/";
  String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
  byte[] rawHtml = html.getBytes("UTF-8");
  Content content = new Content(url, url, rawHtml, "text/html",
      new Metadata(), conf);

  Parse parse = new ParseImpl("nutch this is the extracted text blablabla",
      new ParseData());
  ParseResult wrapped = ParseResult.createParseResult(url, parse);
  filter.filter(content, wrapped, null, null);

  // The assertions read the metadata from the parse object that was handed to
  // the filter, not from the returned result.
  Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("true", parseMeta.get("first"));
  assertEquals("true", parseMeta.get("second"));
}
示例10: testNegativeFilter
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Runs a {@link RegexParseFilter} built from the sample
 * {@code regex-parsefilter.txt} file over a small HTML page and its extracted
 * text, then expects the parse metadata keys {@code first} and {@code second}
 * to both be "false".
 *
 * @throws Exception if the content cannot be built or the filter fails
 */
public void testNegativeFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();

  RegexParseFilter filter = new RegexParseFilter(
      SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  filter.setConf(conf);

  String url = "http://nutch.apache.org/";
  String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
  byte[] rawHtml = html.getBytes("UTF-8");
  Content content = new Content(url, url, rawHtml, "text/html",
      new Metadata(), conf);

  Parse parse = new ParseImpl("nutch this is the extracted text bla",
      new ParseData());
  ParseResult wrapped = ParseResult.createParseResult(url, parse);
  filter.filter(content, wrapped, null, null);

  // The assertions read the metadata from the parse object that was handed to
  // the filter, not from the returned result.
  Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("false", parseMeta.get("first"));
  assertEquals("false", parseMeta.get("second"));
}
示例11: testNonExistingIndexingFilter
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Tests the behaviour when a filter named in the configured filter order does
 * not exist. The order lists "NonExistingFilter" ahead of
 * {@code BasicIndexingFilter}; the test makes no assertions and only requires
 * that building {@link IndexingFilters} and running them completes.
 *
 * @throws IndexingException if running the filters fails
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  String missingFilter = "NonExistingFilter";
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
      missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  Text pageUrl = new Text("http://www.example.com/");
  filters.filter(new NutchDocument(), new ParseImpl("text", parseData),
      pageUrl, new CrawlDatum(), new Inlinks());
}
示例12: testNoParts
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks that, with {@code moreIndexingFilter.indexMimeTypeParts} set to
 * {@code false}, {@link MoreIndexingFilter} adds a {@code type} field holding
 * exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);

  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    filter.filter(doc, parse, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception ex) {
    // Any exception from the filter is turned into a test failure.
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(doc);
  assertTrue(doc.getFieldNames().contains("type"));
  assertEquals(1, doc.getField("type").getValues().size());
  assertEquals("text/html", doc.getFieldValue("type"));
}
示例13: getParse
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Parses JavaScript content: collects outlinks via {@code getJSLinks} and uses
 * the first line of the script, capped at {@code MAX_TITLE_LEN} characters, as
 * the title.
 *
 * @param c the content to parse; its content type must be {@code null}, blank,
 *          or start with {@code application/x-javascript} (case-insensitive)
 * @return a parse result whose text is the whole script, or the empty parse
 *         result of a {@code FAILED_INVALID_FORMAT} status for any other
 *         content type
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the case-insensitive match independent of the JVM's
  // default locale (e.g. Turkish maps 'I' to a dotless 'ı').
  if (type != null && !type.trim().equals("")
      && !type.toLowerCase(java.util.Locale.ROOT)
          .startsWith("application/x-javascript")) {
    return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
        "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
  }

  // Decode with an explicit charset so the result does not vary with the
  // platform default.
  // NOTE(review): UTF-8 is assumed here; if the content's declared charset is
  // available it should be preferred - confirm against the fetcher.
  String script = new String(c.getContent(),
      java.nio.charset.StandardCharsets.UTF_8);

  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }

  // Title? use the first line of the script (or the whole script when it has
  // no line break), truncated to MAX_TITLE_LEN characters.
  int lineEnd = script.indexOf('\n');
  if (lineEnd == -1) {
    lineEnd = script.length();
  }
  String title = script.substring(0, Math.min(lineEnd, MAX_TITLE_LEN));

  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
      c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
示例14: testDeduplicateAnchor
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Checks that, with {@code anchorIndexingFilter.deduplicate} enabled,
 * {@link AnchorIndexingFilter} indexes each distinct anchor text once: three
 * inlinks carrying the texts "text1", "text2", "text2" must yield two
 * {@code anchor} values.
 *
 * @throws Exception if the filter fails; the exception propagates so that the
 *         test runner reports it with its full stack trace
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);
  AnchorIndexingFilter filter = new AnchorIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);

  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://test1.com/", "text1"));
  inlinks.add(new Inlink("http://test2.com/", "text2"));
  inlinks.add(new Inlink("http://test3.com/", "text2"));

  // No try/catch: the method declares "throws Exception", so an unexpected
  // failure surfaces with its cause instead of a bare message.
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
      new CrawlDatum(), inlinks);

  assertNotNull(doc);
  assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
}
示例15: setup
import org.apache.nutch.parse.ParseImpl; //導入依賴的package包/類
/**
 * Prepares the fixtures before each test: a new {@code SharkScoringFilter}, a
 * parse holding a short text with empty parse data, the URL under test (as a
 * string and as a {@code Text}), and empty crawl datum, inlinks and content
 * objects.
 */
@Before
public void setup() {
  scoringFilter = new SharkScoringFilter();

  // Parse fixture: plain text plus default (empty) parse data.
  parse = new ParseImpl("Hola que tal", new ParseData());

  // The same URL is kept both as a String and wrapped in a Text.
  url = "http://wms.magrama.es/sig/Agricultura/TurcSecano/wms.aspx?request=GetCapabilities&service=WMS";
  urlText = new Text(url);

  datum = new CrawlDatum();
  inlinks = new Inlinks();
  content = new Content();
}