当前位置: 首页>>代码示例>>Java>>正文


Java Metadata.get方法代码示例

本文整理汇总了Java中org.apache.nutch.metadata.Metadata.get方法的典型用法代码示例。如果您正苦于以下问题：Java Metadata.get方法的具体用法？Java Metadata.get怎么用？Java Metadata.get使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.metadata.Metadata的用法示例。


在下文中一共展示了Metadata.get方法的8个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Copies every parse-metadata entry into the index document and keeps the
 * document only when it is flagged as an OGC service.
 *
 * <p>Each metadata name is added to {@code doc} as a field whose value is the
 * one returned by {@code Metadata.get(name)}. The document is kept when one of
 * the metadata names is exactly {@code "ogc_service"}. Note that the fields
 * are added to {@code doc} even when the method ends up returning
 * {@code null}.
 *
 * @param doc     document being built; metadata fields are added to it
 * @param parse   parse result whose parse metadata is copied
 * @param url     URL of the page, used only in the log message
 * @param datum   not used by this filter
 * @param inlinks not used by this filter
 * @return {@code doc} when the parse metadata contains the
 *         {@code ogc_service} key, otherwise {@code null}
 * @throws IndexingException declared by the overridden method; never thrown
 *         directly by this method
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
		throws IndexingException {
	Metadata meta = parse.getData().getParseMeta();
	boolean isOgcService = false;

	for (String key : meta.names()) {
		if ("ogc_service".equals(key)) {
			isOgcService = true;
		}
		doc.add(key, meta.get(key));
	}

	/* Return the document if it is an ogc service, otherwise return null */
	if (!isOgcService) {
		return null;
	}
	// Log outside the loop so the message is emitted once per kept document
	// and its count does not scale with the number of metadata keys.
	LOG.info("Adding " + url + " to NutchDocument");
	return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:OgcIndexingFilter.java

示例2: isTruncated

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks if the page's content is truncated.
 *
 * <p>The check compares the size announced in the
 * {@code Response.CONTENT_LENGTH} metadata value with the number of bytes
 * actually held by {@code content}. The page is considered truncated when the
 * announced size is strictly larger than the stored size.
 *
 * @param content
 *          the content to inspect; its bytes and metadata may be
 *          <code>null</code>
 * @return If the page is truncated <code>true</code>. When it is not, or when
 *         it could not be determined (no bytes, no metadata, missing or
 *         non-numeric content length), <code>false</code>.
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) {
    return false;
  }
  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) {
    lengthStr = lengthStr.trim();
  }
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }

  String url = content.getUrl();
  // Parse as long: a declared length above Integer.MAX_VALUE is a valid
  // header value, and since a byte[] can never be that large such content is
  // necessarily truncated rather than malformed.
  long inHeaderSize;
  try {
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
        + inHeaderSize);
  }
  return false;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:42,代码来源:ParseSegment.java

示例3: getLanguageFromMetadata

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Looks up the language recorded in the given metadata.
 *
 * <p>The keys {@code "dc.language"}, {@code "content-language"} and
 * {@code "lang"} are queried in that order and the first non-null value is
 * returned.
 *
 * @param meta metadata to query; may be {@code null}
 * @return the first non-null value found, or {@code null} when {@code meta}
 *         is {@code null} or none of the keys has a value
 */
private static String getLanguageFromMetadata(Metadata meta) {
  if (meta == null) {
    return null;
  }
  // Precedence order: dublin core, meta content-language, lang attribute.
  String[] candidateKeys = { "dc.language", "content-language", "lang" };
  for (String candidateKey : candidateKeys) {
    String found = meta.get(candidateKey);
    if (found != null) {
      return found;
    }
  }
  return null;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:15,代码来源:HTMLLanguageParser.java

示例4: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Adds Creative Commons features found in the parse metadata to the document.
 *
 * <p>Three metadata values are read, each one optional:
 * {@code CreativeCommons.LICENSE_URL}, {@code CreativeCommons.LICENSE_LOCATION}
 * and {@code CreativeCommons.WORK_TYPE}. Every value that is present is handed
 * to {@code addFeature}; the license URL is additionally passed to
 * {@code addUrlFeatures}.
 *
 * @param doc     document that receives the features
 * @param parse   parse result whose parse metadata is read
 * @param url     URL of the page, used only in the log message
 * @param datum   not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared in the signature; never thrown directly
 *         by this method
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final Metadata parseMeta = parse.getData().getParseMeta();

  // License: the whole URL goes in as "license=<url>", then the URL itself
  // is handed to addUrlFeatures for attribute extraction.
  final String ccLicense = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (null != ccLicense) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + ccLicense + " for: " + url.toString());
    }
    addFeature(doc, "license=" + ccLicense);
    addUrlFeatures(doc, ccLicense);
  }

  // License location goes in as "meta=<location>".
  final String ccLocation = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (null != ccLocation) {
    addFeature(doc, "meta=" + ccLocation);
  }

  // Work type is passed through unchanged.
  final String ccWorkType = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (null != ccWorkType) {
    addFeature(doc, ccWorkType);
  }

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:33,代码来源:CCIndexingFilter.java

示例5: isTruncated

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Checks if the page's content is truncated.
 *
 * <p>The size announced in the {@code Response.CONTENT_LENGTH} metadata value
 * is compared with the number of bytes stored in {@code content}; a larger
 * announced size means the page was truncated.
 *
 * @param content the content to inspect; its bytes and metadata may be
 * <code>null</code>
 * @return If the page is truncated <code>true</code>. When it is not,
 * or when it could not be determined (no bytes, no metadata, missing or
 * non-numeric content length), <code>false</code>.
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) {
    return false;
  }
  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) {
    lengthStr = lengthStr.trim();
  }
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }

  String url = content.getUrl();
  long inHeaderSize;
  try {
    // long, not int: declared lengths beyond Integer.MAX_VALUE are legal and
    // must count as truncation, since a byte[] cannot hold that many bytes.
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return false;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:37,代码来源:ParseSegment.java

示例6: getLanguageFromMetadata

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Returns the language stored in the given metadata, if any.
 *
 * <p>Lookup order: {@code "dc.language"}, then {@code "content-language"},
 * then {@code "lang"}. A later key is only queried when every earlier one
 * returned {@code null}.
 *
 * @param meta metadata to query; may be {@code null}
 * @return the first non-null value in lookup order, or {@code null} when
 *         {@code meta} is {@code null} or no key has a value
 */
private static String getLanguageFromMetadata(Metadata meta) {
    if (meta == null) {
        return null;
    }

    // dublin core takes precedence
    final String dublinCore = meta.get("dc.language");
    if (dublinCore != null) {
        return dublinCore;
    }

    // then meta content-language, falling back to the lang attribute
    final String contentLanguage = meta.get("content-language");
    return contentLanguage != null ? contentLanguage : meta.get("lang");
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:15,代码来源:HTMLLanguageParser.java

示例7: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Indexes the Creative Commons information carried by the parse metadata.
 *
 * <p>Reads {@code CreativeCommons.LICENSE_URL},
 * {@code CreativeCommons.LICENSE_LOCATION} and
 * {@code CreativeCommons.WORK_TYPE} from the parse metadata. Each value that
 * is not {@code null} is passed to {@code addFeature}; the license URL is
 * also passed to {@code addUrlFeatures}.
 *
 * @param doc     document that receives the features
 * @param parse   parse result whose parse metadata is read
 * @param url     URL of the page, used only in the log message
 * @param datum   not used by this filter
 * @param inlinks not used by this filter
 * @return the {@code doc} instance that was passed in
 * @throws IndexingException declared in the signature; never thrown directly
 *         by this method
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  final Metadata ccMeta = parse.getData().getParseMeta();

  // 1) license URL: added whole as "license=<url>", then decomposed by
  //    addUrlFeatures
  final String license = ccMeta.get(CreativeCommons.LICENSE_URL);
  final boolean hasLicense = license != null;
  if (hasLicense) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // 2) license location: added as "meta=<location>"
  final String location = ccMeta.get(CreativeCommons.LICENSE_LOCATION);
  final boolean hasLocation = location != null;
  if (hasLocation) {
    addFeature(doc, "meta=" + location);
  }

  // 3) work type: added as-is
  final String type = ccMeta.get(CreativeCommons.WORK_TYPE);
  final boolean hasType = type != null;
  if (hasType) {
    addFeature(doc, type);
  }

  return doc;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:33,代码来源:CCIndexingFilter.java

示例8: filter

import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
 * Adds the keywords metatag information to the document.
 *
 * <p>The value stored under {@code STORE_METADATA_KEYWORDS} in the parse
 * metadata is split on commas. Every resulting entry is trimmed of
 * surrounding whitespace and, when not empty, added to the document under
 * {@code HTML_METATAG_KEYWORDS}. When no keywords value is stored, the
 * document is returned untouched.
 *
 * @param doc     document that receives the keyword fields
 * @param parse   parse result whose parse metadata is read
 * @param url     not used by this filter
 * @param datum   not used by this filter
 * @param inlinks not used by this filter
 * @return the {@code doc} instance that was passed in
 * @throws IndexingException declared by the overridden method; never thrown
 *         directly by this method
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {

	Metadata metadata = parse.getData().getParseMeta();

	String keywordsString = metadata.get(STORE_METADATA_KEYWORDS);
	if (keywordsString == null) {
		return doc;
	}

	for (String rawKeyword : keywordsString.split(",")) {
		// Trim both ends so that "a , b" yields "a" and "b"; leaving the
		// trailing space would index "a " as a different term than "a".
		String keyword = rawKeyword.trim();
		if (!keyword.isEmpty()) {
			doc.add(HTML_METATAG_KEYWORDS, keyword);
		}
	}

	return doc;
}
 
开发者ID:dkd,项目名称:nutch-typo3-cms,代码行数:31,代码来源:KeywordsIndexingFilter.java


注:本文中的org.apache.nutch.metadata.Metadata.get方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。