当前位置: 首页>>代码示例>>Java>>正文


Java NutchDocument.add方法代码示例

本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument.add方法的典型用法代码示例。如果您正苦于以下问题:Java NutchDocument.add方法的具体用法?Java NutchDocument.add怎么用?Java NutchDocument.add使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.nutch.indexer.NutchDocument的用法示例。


在下文中一共展示了NutchDocument.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Copies URL meta tags from the crawl datum onto the document.
 *
 * <p>For every tag name in {@code urlMetaTags} (the tags listed in the
 * "urlmeta.tags" property), the value stored under that name in the
 * CrawlDatum metadata is looked up and, when present, added to the
 * NutchDocument as a field with the same name.
 *
 * @return the same document instance that was passed in (possibly null)
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  // Nothing to copy, or nothing to copy into: hand the input back untouched.
  if (doc == null || urlMetaTags == null) {
    return doc;
  }

  for (String tagName : urlMetaTags) {
    Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
    if (tagValue == null) {
      continue; // this tag was not recorded for the URL
    }
    doc.add(tagName, tagValue.toString());
  }

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:URLMetaIndexingFilter.java

示例2: addTime

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>"lastModified" is added only when the Last-Modified metadata value is
 * present and {@code getTime} produced a usable time. "date" is always added,
 * using the first available of: the parsed Last-Modified time, the
 * CrawlDatum's modified time, and finally the CrawlDatum's fetch time.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata holds the Last-Modified value
 * @param url   URL of the page, passed through to {@code getTime}
 * @param datum crawl datum supplying the fallback times
 * @return the same document instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) { // try parse last-modified
    time = getTime(lastModified, url);
    // -1 is the "unset" sentinel used below (presumably what getTime returns
    // for an unparseable value -- confirm). Indexing new Date(-1) would store
    // a bogus 1969 timestamp, so only add the field for a real time.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) { // no usable last-modified from the HTTP header
    time = datum.getModifiedTime(); // use value in CrawlDatum
    if (time <= 0) { // if also unset
      time = datum.getFetchTime(); // use time the fetch took place
    }
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));
  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:MoreIndexingFilter.java

示例3: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds a "lang" field to the document.
 *
 * <p>The value comes from the parse metadata (LANGUAGE key) when present,
 * otherwise from the Content-Language content metadata; if neither yields a
 * non-empty value, the literal "unknown" is used.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  // language recorded during parsing, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise check whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = language == null || language.isEmpty();
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:LanguageIndexingFilter.java

示例4: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Copies every parse-metadata entry onto the document and keeps the document
 * only when one of the metadata names is "ogc_service".
 *
 * @return the document if an "ogc_service" metadata name was seen, otherwise
 *         null
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
		throws IndexingException {
	Metadata parseMeta = parse.getData().getParseMeta();
	boolean isOgcService = false;

	for (String name : parseMeta.names()) {
		if (name.equals("ogc_service")) {
			isOgcService = true;
		}
		// Logged once per metadata entry, before the entry is added.
		LOG.info("Adding " + url + " to NutchDocument");
		doc.add(name, parseMeta.get(name));
	}

	/* Only OGC services are kept; anything else is dropped by returning null */
	if (isOgcService) {
		return doc;
	}
	return null;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:OgcIndexingFilter.java

示例5: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Looks up each tag named in {@code urlMetaTags} (the tags listed in the
 * "urlmeta.tags" property) inside the CrawlDatum metadata and, for every tag
 * that has a value, adds that value to the NutchDocument under the tag's
 * name.
 *
 * @return the document that was passed in, which may be null
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	boolean nothingToDo = urlMetaTags == null || doc == null;
	if (!nothingToDo) {
		for (String tag : urlMetaTags) {
			Text value = (Text) datum.getMetaData().get(new Text(tag));
			if (value != null) {
				doc.add(tag, value.toString());
			}
		}
	}

	return doc;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:25,代码来源:URLMetaIndexingFilter.java

示例6: addTime

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>"lastModified" is added only when the Last-Modified metadata value is
 * present and {@code getTime} produced a usable time. "date" is always
 * added: the parsed Last-Modified time when available, otherwise the
 * CrawlDatum's fetch time.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata holds the Last-Modified value
 * @param url   URL of the page, passed through to {@code getTime}
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same document instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try parse last-modified
    time = getTime(lastModified,url);
    // -1 is the "unset" sentinel checked below (presumably what getTime
    // returns for an unparseable value -- confirm). Indexing new Date(-1)
    // would store a bogus 1969 timestamp, so skip the field in that case.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // no usable last-modified
    time = datum.getFetchTime();                // use fetch time
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));

  return doc;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:21,代码来源:MoreIndexingFilter.java

示例7: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Indexes the page language as the "lang" field.
 *
 * <p>Lookup order: the LANGUAGE entry of the parse metadata, then the
 * Content-Language entry of the content metadata. A null or empty result is
 * replaced by "unknown".
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // first choice: LANGUAGE, possibly put there by HTMLLanguageParser
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // second choice: whatever the HTTP header tells us
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  if (detected != null && detected.length() > 0) {
    doc.add("lang", detected);
  } else {
    doc.add("lang", "unknown");
  }

  return doc;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:20,代码来源:LanguageIndexingFilter.java

示例8: addRawContent

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the page's raw content to the document as the "rawcontent" field.
 *
 * <p>Does nothing when the page has no content. Otherwise the remaining bytes
 * of the content buffer are read as one string, passed through
 * {@code StringUtil.cleanField} and added to the document.
 *
 * @param doc  document receiving the field
 * @param page page whose content buffer is read
 * @param url  URL of the page, used only for logging
 * @return the same document instance
 */
private NutchDocument addRawContent(NutchDocument doc, WebPage page, String url) {
    ByteBuffer raw = page.getContent();
    if (raw == null) {
        return doc;
    }

    if (LOG.isInfoEnabled()) {
        // Plain concatenation: url is already a String and this also tolerates null.
        LOG.info("Html indexing for: " + url);
    }

    // Wrap only the readable window of the buffer; the buffer's own position is not moved.
    ByteArrayInputStream arrayInputStream = new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining());

    // NOTE(review): no charset is passed, so the bytes are decoded with the
    // platform default charset -- confirm this matches the page encoding.
    Scanner scanner = new Scanner(arrayInputStream);
    String data = "";
    try {
        scanner.useDelimiter("\\Z");//To read all scanner content in one String
        if (scanner.hasNext()) {
            data = scanner.next();
        }
    } finally {
        // The original never closed the scanner.
        scanner.close();
    }

    doc.add("rawcontent", StringUtil.cleanField(data));
    return doc;
}
 
开发者ID:Meabed,项目名称:nutch2-index-html,代码行数:18,代码来源:HtmlIndexingFilter.java

示例9: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Indexes the configured end time as a date field.
 *
 * <p>Reads the ISO date from the {@code CONF_ENDTIME_PROPERTY} configuration
 * value (default "1970-01-01T00:00:00Z"), converts it to a timestamp and adds
 * it to the document under {@code INDEXING_FIELD}. If the value cannot be
 * parsed, an error is logged and the epoch (time 0) is indexed instead.
 *
 * @return the same document instance
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {

	// convert ISO date to time stamp
	String isoDate = conf.get(CONF_ENDTIME_PROPERTY, "1970-01-01T00:00:00Z");

	// SimpleDateFormat's 'Z' pattern letter expects a numeric RFC 822 offset
	// such as "+0000" and rejects the ISO 8601 "Z" (UTC) designator, so the
	// default value itself used to fail. Rewrite a "Z" that directly follows
	// the seconds digits; every other input is passed through unchanged.
	String parseable = isoDate.replaceFirst("(?<=\\d)Z$", "+0000");

	long epoch = 0;
	try {
		epoch = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(parseable).getTime();
	} catch (ParseException e) {
		// Best effort: keep epoch at 0, but log the cause with the original value.
		LOG.error("ERROR! Cannot parse date, must fit pattern yyyy-MM-dd'T'HH:mm:ssZ : " + isoDate, e);
	}

	// Index the endtime
	doc.add(INDEXING_FIELD, new Date(epoch));

	return doc;
}
 
开发者ID:dkd,项目名称:nutch-typo3-cms,代码行数:19,代码来源:EndtimeIndexingFilter.java

示例10: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the content stored in the page metadata under {@code storageField} to
 * the document, as a field with that same name.
 *
 * <p>The stored bytes are decoded as UTF-8. Nothing is added when
 * {@code storageField} is null, when the page has no metadata entry for it,
 * or when the bytes are not valid UTF-8.
 *
 * @param document document receiving the field
 * @param s        not used by this method
 * @param webPage  page whose metadata is read
 * @return the same document instance
 */
@Override
public NutchDocument filter(NutchDocument document, String s, WebPage webPage) throws IndexingException {
    if (storageField == null) {
        return document;
    }

    // A page without this metadata entry used to pass null into decode(), which throws.
    java.nio.ByteBuffer stored = webPage.getMetadata().get(new Utf8(storageField));
    if (stored == null) {
        return document;
    }

    CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
    try {
        // Decode a duplicate: decode() consumes the buffer it is given, and the
        // buffer held in the page metadata should keep its position.
        String strippedContent = decoder.decode(stored.duplicate()).toString();
        document.add(storageField, strippedContent);
    } catch (CharacterCodingException e) {
        // Best effort: the field is skipped for undecodable content. No logger
        // is visible from this method, so the original reporting is kept.
        e.printStackTrace();
    }

    return document;
}
 
开发者ID:kaqqao,项目名称:nutch-element-selector,代码行数:17,代码来源:HtmlElementSelectorIndexer.java

示例11: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds one "tag" field to the document for every Rel-Tag value found in the
 * parse metadata under {@code RelTagParser.REL_TAG}.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  // Rel-Tags, if any, were possibly put there by RelTagParser
  String[] relTags = parse.getData().getParseMeta()
      .getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:15,代码来源:RelTagIndexingFilter.java

示例12: filter

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds a "tld" field holding the domain of the URL's domain suffix, as
 * reported by {@code URLUtil.getDomainSuffix}.
 *
 * <p>Any exception raised while building the URL or resolving the suffix is
 * logged at warn level and the document is returned without the field.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception failure) {
    // best effort: skip the field, keep the document
    LOG.warn(failure.toString());
  }

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:16,代码来源:TLDIndexingFilter.java

示例13: addLength

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the trimmed Content-Length metadata value to the document as the
 * "contentLength" field, unless the value is missing or blank after
 * trimming.
 *
 * @param doc  document receiving the field
 * @param data parse data whose metadata is read
 * @param url  not used by this method
 * @return the same document instance
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010: the value is not trimmed when it reaches this point
  String length = rawLength.trim();
  if (length.length() > 0) {
    doc.add("contentLength", length);
  }
  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:12,代码来源:MoreIndexingFilter.java

示例14: resetTitle

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Sets the document's "title" field from the Content-Disposition metadata
 * when the document has no title yet.
 *
 * <p>The entries of {@code patterns} are tried in order; the first one that
 * finds a match supplies the title through its capture group 1.
 *
 * @param doc  document receiving the field
 * @param data parse data whose metadata is read
 * @param url  not used by this method
 * @return the same document instance
 */
private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
  String disposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
  if (disposition == null || doc.getFieldValue("title") != null) {
    return doc;
  }

  for (Pattern candidate : patterns) {
    Matcher match = candidate.matcher(disposition);
    if (match.find()) {
      doc.add("title", match.group(1));
      return doc; // first matching pattern wins
    }
  }

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:16,代码来源:MoreIndexingFilter.java

示例15: addSubCollectionField

import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * "Mark" document to be a part of subcollection.
 *
 * <p>For every subcollection the collection manager reports for the URL, the
 * subcollection's name is added to the document. The field used is the
 * subcollection's own key, or {@code fieldName} when it has no key.
 *
 * @param doc
 *          document that receives the subcollection fields
 * @param url
 *          URL used to look up the matching subcollections
 */
private void addSubCollectionField(NutchDocument doc, String url) {
  CollectionManager manager = CollectionManager.getCollectionManager(getConf());
  for (Subcollection subcollection : manager.getSubCollections(url)) {
    String key = subcollection.getKey();
    String targetField = (key == null) ? fieldName : key;
    doc.add(targetField, subcollection.getName());
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:17,代码来源:SubcollectionIndexingFilter.java


注:本文中的org.apache.nutch.indexer.NutchDocument.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。