本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument.getField方法的典型用法代码示例。如果您正苦于以下问题：Java NutchDocument.getField方法的具体用法？Java NutchDocument.getField怎么用？Java NutchDocument.getField使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.indexer.NutchDocument的用法示例。
在下文中一共展示了NutchDocument.getField方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: indexerScore
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Scales the initial score by the boost of every top-level domain listed in
 * the document's "tld" field.
 *
 * @param doc
 *          the document whose "tld" field is inspected
 * @param initScore
 *          the score to be scaled
 * @return {@code initScore} multiplied by the product of the boosts of all
 *         "tld" values that have an entry in {@code tldEntries}; values
 *         without an entry leave the score untouched
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {
  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    // No "tld" field on this document: the multiplier stays neutral.
    return initScore * multiplier;
  }
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      // Unknown suffix contributes nothing.
      continue;
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
示例2: indexerScore
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Computes the indexing score of a document from its top-level domains.
 *
 * <p>Each value of the document's "tld" field is looked up in
 * {@code tldEntries}; the boosts of all values that are found are multiplied
 * together and applied to {@code initScore}.
 *
 * @param doc
 *          the document providing the "tld" field
 * @param initScore
 *          the score before boosting
 * @return the boosted score, or {@code initScore * 1.0f} when the document
 *         has no "tld" field or none of its values is known
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {
  final NutchField tldField = doc.getField("tld");
  float accumulatedBoost = 1.0f;
  if (tldField != null) {
    for (Object tldValue : tldField.getValues()) {
      final String tldName = tldValue.toString();
      final DomainSuffix knownSuffix = tldEntries.get(tldName);
      if (knownSuffix != null) {
        accumulatedBoost *= knownSuffix.getBoost();
      }
    }
  }
  return initScore * accumulatedBoost;
}
示例3: write
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Writes the raw content of a document to a file whose name is derived from
 * the document's URL.
 *
 * <p>The target file is {@code path + formatUrl(url)}. Documents that lack
 * the "url" field or the {@code RAW_CONTENT} field are skipped with a message
 * on standard error instead of failing with a {@code NullPointerException}.
 * I/O failures while writing are reported via {@code printStackTrace()} and
 * not rethrown, as before.
 *
 * @param doc
 *          the document to write
 * @throws IOException
 *           declared by the overridden method; write failures are caught here
 */
@Override
public void write(NutchDocument doc) throws IOException {
  NutchField urlField = doc.getField("url");
  if (urlField == null) {
    // Without a URL there is no way to derive the output file name.
    System.err.println("Skipping document without a url field");
    return;
  }
  System.out.println("Writing " + urlField.toString());

  NutchField raw = doc.getField(RAW_CONTENT);
  if (raw == null) {
    System.err.println("Skipping " + urlField.toString()
        + ": no " + RAW_CONTENT + " field");
    return;
  }

  // Drop the first and last character of the field's string form.
  // NOTE(review): presumably these are the brackets added by a list-style
  // toString() -- confirm against NutchField.
  String content = raw.toString();
  if (content.length() >= 2) {
    content = content.substring(1, content.length() - 1);
  }

  try {
    String url = formatUrl(urlField.toString());
    File file = new File(path + url);
    // FileWriter creates the file if it is missing, so no explicit
    // createNewFile() is needed. try-with-resources closes the writer even
    // when write() fails.
    try (BufferedWriter bw = new BufferedWriter(
        new FileWriter(file.getAbsoluteFile()))) {
      bw.write(content);
    }
    System.out.println("Done");
  } catch (IOException e) {
    e.printStackTrace();
  }
}
示例4: testIndexOnlyHostPart
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Verifies that, with the inlinks-host, outlinks-host and only-hosts options
 * all set to "true", the filter indexes just the host part of inlink and
 * outlink URLs and counts inlinks from the same host only once.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  filter.setConf(conf);

  Outlink[] outgoing = generateOutlinks(true);

  // Two inlinks from www.test.com plus one from the page's own host.
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
  incoming.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
  incoming.add(new Inlink("http://www.example.com/my-first-awesome-example",
      "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title", outgoing,
      metadata);
  NutchDocument indexed = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData), new Text("http://www.example.com/"),
      new CrawlDatum(), incoming);

  String expectedHost = new URL("http://www.test.com").getHost();

  NutchField outlinkField = indexed.getField("outlinks");
  Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
      expectedHost, outlinkField.getValues().get(0));

  NutchField inlinkField = indexed.getField("inlinks");
  Assert.assertEquals(
      "The inlinks coming from the same host must count only once", 1,
      inlinkField.getValues().size());

  Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
      expectedHost, indexed.getFieldValue("inlinks"));
}
示例5: doReplace
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Applies the field replacers in {@code replaceMap} to a Nutch document.
 *
 * <p>For every String value of the key field, each pattern in the map is
 * tried against that value. When a pattern is found in it, each of the
 * pattern's replacers whose source field exists in the document is run over
 * the String values of that source field; the replacer's target field is then
 * removed and re-added with the replaced values.
 *
 * @param doc
 *          the document we are modifying; nothing happens when it is null
 * @param keyName
 *          either "host" or "url" -- the field that determines the
 *          replacement set used
 * @param replaceMap
 *          the FieldReplacers that apply to this keyName, grouped by the
 *          pattern that selects them; nothing happens when it is empty
 */
private void doReplace(NutchDocument doc, String keyName,
    Map<Pattern, List<FieldReplacer>> replaceMap) {
  if (doc == null || replaceMap.isEmpty()) {
    return;
  }

  Collection<String> presentFields = doc.getFieldNames();
  NutchField selector = doc.getField(keyName);
  if (selector == null) {
    // The key field is absent, so no replacement set can be selected.
    return;
  }

  List<Object> selectorValues = selector.getValues();
  if (selectorValues.isEmpty()) {
    // The key field carries no values; nothing to match against.
    return;
  }

  // Usually a single value is expected here.
  for (Object rawKey : selectorValues) {
    // instanceof is false for null, so this also skips null values.
    if (!(rawKey instanceof java.lang.String)) {
      continue;
    }
    String key = (String) rawKey;

    for (Map.Entry<Pattern, List<FieldReplacer>> rule : replaceMap
        .entrySet()) {
      if (!rule.getKey().matcher(key).find()) {
        // This replacement set does not apply to the key value.
        continue;
      }

      for (FieldReplacer replacer : rule.getValue()) {
        String sourceName = replacer.getFieldName();
        if (!presentFields.contains(sourceName)) {
          // The document lacks the field this replacer reads from.
          continue;
        }

        // Collect the replaced form of every String value of the source.
        List<Object> sourceValues = doc.getField(sourceName).getValues();
        List<String> replaced = new ArrayList<String>();
        for (Object rawValue : sourceValues) {
          if (rawValue instanceof java.lang.String) {
            replaced.add(replacer.replace((String) rawValue));
          }
        }

        // Swap the target field's content for the replaced values.
        String targetName = replacer.getToFieldName();
        doc.removeField(targetName);
        for (String value : replaced) {
          doc.add(targetName, value);
        }
      }
    }
  }
}