当前位置: 首页>>代码示例>>Java>>正文


Java NutchDocument.getField方法代码示例

本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument.getField方法的典型用法代码示例。如果您正苦于以下问题:Java NutchDocument.getField方法的具体用法?Java NutchDocument.getField怎么用?Java NutchDocument.getField使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.indexer.NutchDocument的用法示例。


在下文中一共展示了NutchDocument.getField方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: indexerScore

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Multiplies the initial score by the boost of every value of the document's
 * "tld" field that has an entry in {@code tldEntries}.
 *
 * @param doc
 *          the document whose "tld" field is inspected
 * @param initScore
 *          the score to be scaled
 * @return {@code initScore} multiplied by the product of all matching boosts;
 *         {@code initScore} unchanged when the document has no "tld" field
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    // No TLD information: the neutral multiplier 1.0f leaves the score as is.
    return initScore * 1.0f;
  }

  float multiplier = 1.0f;
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      // Unknown suffix: contributes nothing to the boost.
      continue;
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:TLDScoringFilter.java

示例2: indexerScore

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Computes the indexing score by scaling {@code initScore} with the boosts
 * found in {@code tldEntries} for each value of the document's "tld" field.
 *
 * @param doc
 *          the document providing the "tld" field
 * @param initScore
 *          the starting score
 * @return the starting score times the accumulated boost (1.0 when the field
 *         is absent or none of its values has an entry)
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  final NutchField tldField = doc.getField("tld");
  float accumulated = 1.0f;

  if (tldField != null) {
    for (Object rawTld : tldField.getValues()) {
      // Entries are looked up by the string form of the field value.
      final String key = rawTld.toString();
      final DomainSuffix match = tldEntries.get(key);
      if (match != null) {
        accumulated *= match.getBoost();
      }
    }
  }

  return initScore * accumulated;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:17,代码来源:TLDScoringFilter.java

示例3: write

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Writes the document's raw content to a file whose name is derived from the
 * document's "url" field.
 *
 * <p>The target file is {@code path + formatUrl(url)}. The string form of the
 * {@code RAW_CONTENT} field has its first and last characters stripped before
 * being written (presumably the delimiters added by the field's
 * {@code toString()} -- confirm against NutchField).
 *
 * <p>An {@link IOException} raised while creating or writing the file is
 * caught and its stack trace printed; it is not propagated to the caller.
 *
 * @param doc
 *          the document to persist; must contain the "url" and
 *          {@code RAW_CONTENT} fields
 * @throws IOException
 *           declared by the signature; file errors are handled inside the
 *           method body
 */
@Override
public void write(NutchDocument doc) throws IOException {
	System.out.println("Writing " + doc.getField("url").toString());

	try {

		String content = doc.getField(RAW_CONTENT).toString();
		// Drop the first and last character of the field's string form.
		content = content.substring(1, content.length()-1);

		String url = doc.getField("url").toString();
		url = formatUrl(url);

		File file = new File(path + url);

		/* If the file doesn't exist yet, create it */
		if (!file.exists()) {
			file.createNewFile();
		}

		// try-with-resources guarantees the writer (and the underlying
		// FileWriter) is closed even when write() fails.
		try (BufferedWriter bw = new BufferedWriter(
				new FileWriter(file.getAbsoluteFile()))) {
			bw.write(content);
		}

		System.out.println("Done");

	} catch (IOException e) {
		// Best effort: report the failure and carry on.
		e.printStackTrace();
	}
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:32,代码来源:OgcIndexWriter.java

示例4: testIndexOnlyHostPart

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Verifies that, with the host-only options enabled, the filter indexes just
 * the host part of inlink and outlink URLs and counts inlinks from the same
 * host only once.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
  // Turn on host-only handling for both link directions.
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  filter.setConf(conf);

  Outlink[] generatedOutlinks = generateOutlinks(true);

  // Two inlinks share the www.test.com host; the third comes from another host.
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
  incoming.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
  incoming.add(new Inlink("http://www.example.com/my-first-awesome-example",
      "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title",
      generatedOutlinks, metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl,
      new CrawlDatum(), incoming);

  String expectedHost = new URL("http://www.test.com").getHost();

  NutchField outlinkField = indexed.getField("outlinks");
  Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
      expectedHost, outlinkField.getValues().get(0));

  NutchField inlinkField = indexed.getField("inlinks");
  Assert.assertEquals(
      "The inlinks coming from the same host must count only once", 1,
      inlinkField.getValues().size());

  Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
      expectedHost, indexed.getFieldValue("inlinks"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:33,代码来源:TestLinksIndexingFilter.java

示例5: doReplace

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Applies the configured field replacements to a Nutch document.
 *
 * <p>Every String value of the key field is tested against each pattern in
 * the replacement map. For each pattern that is found in the value, the
 * associated replacers are run: the String values of the replacer's source
 * field are transformed, the replacer's target field is removed from the
 * document, and the transformed values are added under the target field.
 *
 * @param doc
 *          the document being modified; nothing happens when it is null
 * @param keyName
 *          either "host" or "url" -- the field whose values select the
 *          replacement set
 * @param replaceMap
 *          the FieldReplacers applicable to this keyName, grouped by the
 *          pattern that triggers them
 */
private void doReplace(NutchDocument doc, String keyName,
    Map<Pattern, List<FieldReplacer>> replaceMap) {

  if (doc == null || replaceMap.isEmpty()) {
    return;
  }

  Collection<String> presentFields = doc.getFieldNames();
  NutchField keyField = doc.getField(keyName);
  if (keyField == null) {
    // The key field is absent from this document: nothing to do.
    return;
  }

  List<Object> keyValues = keyField.getValues();
  if (keyValues.isEmpty()) {
    // The key field carries no values: nothing to do.
    return;
  }

  // Normally a single key value is expected.
  for (Object rawKey : keyValues) {
    // instanceof is false for null, so this also skips null values.
    if (!(rawKey instanceof java.lang.String)) {
      continue;
    }
    String key = (String) rawKey;

    for (Map.Entry<Pattern, List<FieldReplacer>> rule : replaceMap.entrySet()) {
      // Only replacement sets whose pattern occurs in the key apply.
      if (!rule.getKey().matcher(key).find()) {
        continue;
      }

      for (FieldReplacer replacer : rule.getValue()) {
        String sourceName = replacer.getFieldName();

        // Skip replacers whose source field the document does not have.
        if (!presentFields.contains(sourceName)) {
          continue;
        }

        // Collect the replaced values before touching the document, since
        // source and target field may be the same.
        NutchField sourceField = doc.getField(sourceName);
        ArrayList<String> replaced = new ArrayList<String>();
        for (Object rawValue : sourceField.getValues()) {
          if (rawValue instanceof java.lang.String) {
            replaced.add(replacer.replace((String) rawValue));
          }
        }

        // Swap the target field's content for the replaced values.
        String targetName = replacer.getToFieldName();
        doc.removeField(targetName);
        for (String value : replaced) {
          doc.add(targetName, value);
        }
      }
    }
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:78,代码来源:ReplaceIndexer.java


注:本文中的org.apache.nutch.indexer.NutchDocument.getField方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。