本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument类的典型用法代码示例。如果您正苦于以下问题:Java NutchDocument类的具体用法?Java NutchDocument怎么用?Java NutchDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
NutchDocument类属于org.apache.nutch.indexer包,在下文中一共展示了NutchDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: indexerScore
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Computes the indexing score by multiplying the initial score with the boost
 * of every value in the document's {@code tld} field that has an entry in
 * {@code tldEntries}.
 *
 * <p>Values without a matching entry contribute nothing, and a document
 * without a {@code tld} field keeps a neutral factor of {@code 1.0}. Only
 * {@code doc} and {@code initScore} are read by this body.
 *
 * @param doc document whose {@code tld} field is inspected
 * @param initScore score to be scaled
 * @return {@code initScore} multiplied by the boosts of all matching entries
 * @throws ScoringFilterException declared by the signature
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {
  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    // No "tld" field on the document: apply the neutral factor.
    return initScore * multiplier;
  }
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue; // no entry for this value, so it does not affect the score
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
示例2: testEmptyIndexStatic
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Test that empty {@code index.static} does not add anything to the document.
 *
 * <p>An exception raised by the filter is left to propagate, so the test
 * runner reports it with its full stack trace instead of a bare message.
 *
 * @throws Exception if the filter fails while processing the document
 */
@Test
public void testEmptyIndexStatic() throws Exception {
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  // The assertions below inspect the document that was passed in, not the
  // value returned by the filter.
  filter.filter(doc, parse, url, crawlDatum, inlinks);

  Assert.assertNotNull(doc);
  Assert.assertTrue("tests if no field is set for empty index.static", doc
      .getFieldNames().isEmpty());
}
示例3: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Copies selected URL metadata from the crawl datum onto the document.
 *
 * <p>For every tag name in {@code urlMetaTags} (the tags listed in the
 * "urlmeta.tags" property) the matching entry is looked up in the datum's
 * metadata and, when present, added to the document as a field named after
 * the tag. The document is returned untouched when it or the tag list is
 * {@code null}.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Re-apply the configuration, when one is set, before reading the tag list.
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tagName : urlMetaTags) {
      Text key = new Text(tagName);
      Text value = (Text) datum.getMetaData().get(key);
      if (value == null) {
        continue; // the datum carries nothing for this tag
      }
      doc.add(tagName, value.toString());
    }
  }
  return doc;
}
示例4: addTime
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds the {@code lastModified} and {@code date} fields to the document.
 *
 * <p>If the parse metadata has a {@code Metadata.LAST_MODIFIED} entry, it is
 * converted with {@code getTime}. The {@code lastModified} field is written
 * only when that conversion yields something other than {@code -1}, the same
 * "unset" sentinel the fallback below checks for. This avoids indexing a
 * bogus {@code new Date(-1)} for a value that could not be converted.
 * NOTE(review): assumes {@code getTime} reports failure as {@code -1};
 * confirm against its implementation.
 *
 * <p>The {@code date} field is always added. It uses, in order of
 * preference, the converted last-modified time, the datum's modified time,
 * and the datum's fetch time.
 *
 * @param doc document receiving the fields
 * @param data parse data whose metadata is consulted for the last-modified value
 * @param url passed through to {@code getTime}
 * @param datum source of the fallback modified and fetch times
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) { // try to convert the last-modified value
    time = getTime(lastModified, url);
    if (time != -1) {
      // Only index lastModified when a usable time was obtained.
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) { // no usable last-modified value
    time = datum.getModifiedTime(); // use value in CrawlDatum
    if (time <= 0) { // if also unset
      time = datum.getFetchTime(); // use the time the fetch took place
    }
  }

  doc.add("date", new Date(time));
  return doc;
}
示例5: testNoParts
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with {@code moreIndexingFilter.indexMimeTypeParts} disabled the
 * filtered document carries exactly one {@code type} value,
 * {@code "text/html"}.
 *
 * <p>An exception thrown by the filter fails the test through an
 * {@link AssertionError} that keeps the original exception as its cause, so
 * the stack trace shows up in the test report.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());

  try {
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Preserve the cause instead of printing it and failing with only a message.
    throw new AssertionError("filter threw unexpectedly: " + e, e);
  }

  Assert.assertNotNull(doc);
  Assert.assertTrue(doc.getFieldNames().contains("type"));
  Assert.assertEquals(1, doc.getField("type").getValues().size());
  Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
示例6: testMissingConfigFile
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that when the MIME filter regex file property is not configured,
 * the filter returns a non-null document for every entry in {@code parses}.
 */
@Test
public void testMissingConfigFile() throws Exception {
  String configuredFile = conf.get(
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
  String message = String.format(
      "Property %s must not be present in the the configuration file",
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE);
  Assert.assertEquals(message, "", configuredFile);

  filter.setConf(conf);

  // The property is unset, so no parse may be rejected.
  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());
    Assert.assertNotNull("All documents must be allowed by default", result);
  }
}
示例7: testAllowOnlyImages
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with the {@code allow-images.txt} rules configured, the filter
 * keeps a document only when the corresponding {@code MIME_TYPES} entry
 * contains {@code "image"} and returns {@code null} for everything else.
 */
@Test
public void testAllowOnlyImages() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());

    boolean isImage = MIME_TYPES[idx].contains("image");
    if (!isImage) {
      Assert.assertNull("Block everything else", result);
      continue;
    }
    Assert.assertNotNull("Allow only images", result);
  }
}
示例8: testBlockHTML
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with the {@code block-html.txt} rules configured, the filter
 * returns {@code null} when the corresponding {@code MIME_TYPES} entry
 * contains {@code "html"} and a non-null document for everything else.
 */
@Test
public void testBlockHTML() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
  filter.setConf(conf);

  for (int idx = 0; idx < parses.length; idx++) {
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx],
        pageUrl, new CrawlDatum(), new Inlinks());

    boolean isHtml = MIME_TYPES[idx].contains("html");
    if (!isHtml) {
      Assert.assertNotNull("Allow everything else", result);
      continue;
    }
    Assert.assertNull("Block only HTML documents", result);
  }
}
示例9: filter
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds a {@code lang} field to the document.
 *
 * <p>The language is read from the parse metadata under
 * {@code Metadata.LANGUAGE}. When that entry is {@code null}, the content
 * metadata under {@code Response.CONTENT_LANGUAGE} is used instead. A
 * {@code null} or empty result is indexed as {@code "unknown"}.
 *
 * @param doc document receiving the {@code lang} field
 * @param parse parse whose data supplies both metadata sources
 * @return the same {@code doc} instance
 * @throws IndexingException declared by the signature
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // First choice: a language already recorded in the parse metadata.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
  if (language == null) {
    // Second choice: the content-language entry of the content metadata.
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = language == null || language.length() == 0;
  doc.add("lang", undetermined ? "unknown" : language);
  return doc;
}
示例10: testDeduplicateAnchor
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with {@code anchorIndexingFilter.deduplicate} enabled, three
 * inlinks carrying two distinct anchor texts yield exactly two
 * {@code anchor} values on the document.
 *
 * <p>An exception raised by the filter is left to propagate, so the test
 * runner reports it with its full stack trace instead of a bare message.
 *
 * @throws Exception if the filter fails while processing the document
 */
@Test
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);
  AnchorIndexingFilter filter = new AnchorIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());

  // "text2" appears twice on purpose: it is the duplicate to be collapsed.
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://test1.com/", "text1"));
  inlinks.add(new Inlink("http://test2.com/", "text2"));
  inlinks.add(new Inlink("http://test3.com/", "text2"));

  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
      new CrawlDatum(), inlinks);

  Assert.assertNotNull(doc);
  Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
      .contains("anchor"));
  Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
      .getValues().size());
}
示例11: addUrlFeatures
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Adds one feature to the document for each component of a license URL.
 *
 * <p>URLs have the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where each "xx" names a license feature. The URL path is split at slashes
 * and dashes, the first token ("licenses") is discarded, and every remaining
 * token is passed to {@code addFeature}. A malformed URL is logged as a
 * warning and otherwise ignored.
 *
 * @param doc document receiving the features
 * @param urlString license URL to decompose
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    String path = new URL(urlString).getPath();
    StringTokenizer parts = new StringTokenizer(path, "/-");

    // The leading path component is "licenses", not a feature: skip it.
    if (parts.hasMoreTokens()) {
      parts.nextToken();
    }

    // Everything after it names a feature.
    while (parts.hasMoreTokens()) {
      addFeature(doc, parts.nextToken());
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}
示例12: testFilterOutlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with {@code LinksIndexingFilter.LINKS_OUTLINKS_HOST} set to
 * {@code "true"} exactly one outlink is indexed, and that it equals the
 * target URL of the first generated outlink.
 */
@Test
public void testFilterOutlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  filter.setConf(conf);

  Outlink[] outlinks = generateOutlinks();
  ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks,
      metadata);
  ParseImpl parseImpl = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument doc = filter.filter(new NutchDocument(), parseImpl, pageUrl,
      new CrawlDatum(), new Inlinks());

  int indexedCount = doc.getField("outlinks").getValues().size();
  Assert.assertEquals(1, indexedCount);
  Assert.assertEquals("Filter outlinks, allow only those from a different host",
      outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
}
示例13: testFilterInlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with {@code LinksIndexingFilter.LINKS_INLINKS_HOST} set to
 * {@code "true"} only the inlink from {@code http://www.test.com} is indexed
 * for a page at {@code http://www.example.com/}.
 */
@Test
public void testFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  filter.setConf(conf);

  // One inlink from another host and one from the page's own host.
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://www.test.com", "test"));
  inlinks.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parseImpl = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument doc = filter.filter(new NutchDocument(), parseImpl, pageUrl,
      new CrawlDatum(), inlinks);

  int indexedCount = doc.getField("inlinks").getValues().size();
  Assert.assertEquals(1, indexedCount);
  Assert.assertEquals("Filter inlinks, allow only those from a different host",
      "http://www.test.com", doc.getFieldValue("inlinks"));
}
示例14: testNoFilterInlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with {@code LinksIndexingFilter.LINKS_INLINKS_HOST} set to
 * {@code "false"} the number of indexed inlinks equals the number of inlinks
 * supplied, including the one from the page's own host.
 */
@Test
public void testNoFilterInlinks() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
  filter.setConf(conf);

  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://www.test.com", "test"));
  inlinks.add(new Inlink("http://www.example.com", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], metadata);
  ParseImpl parseImpl = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument doc = filter.filter(new NutchDocument(), parseImpl, pageUrl,
      new CrawlDatum(), inlinks);

  int suppliedCount = inlinks.size();
  int indexedCount = doc.getField("inlinks").getValues().size();
  Assert.assertEquals("All inlinks must be indexed even those from the same host",
      suppliedCount, indexedCount);
}
示例15: testIndexHostsOnlyAndFilterOutlinks
import org.apache.nutch.indexer.NutchDocument; //导入依赖的package包/类
/**
 * Checks that with both {@code LinksIndexingFilter.LINKS_ONLY_HOSTS} and
 * {@code LinksIndexingFilter.LINKS_OUTLINKS_HOST} set to {@code "true"}
 * exactly one outlink is indexed, and that its value is the host of
 * {@code http://www.test.com}.
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] outlinks = generateOutlinks(true);
  filter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks,
      metadata);
  ParseImpl parseImpl = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument doc = filter.filter(new NutchDocument(), parseImpl, pageUrl,
      new CrawlDatum(), new Inlinks());

  int indexedCount = doc.getField("outlinks").getValues().size();
  Assert.assertEquals(1, indexedCount);

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals(
      "Index only the host portion of the outlinks after filtering",
      expectedHost,
      doc.getFieldValue("outlinks"));
}