本文整理汇总了Java中org.apache.nutch.metadata.Metadata.get方法的典型用法代码示例。如果您正苦于以下问题:Java Metadata.get方法的具体用法?Java Metadata.get怎么用?Java Metadata.get使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.metadata.Metadata的用法示例。
在下文中一共展示了Metadata.get方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
ParseData dataP = parse.getData();
Metadata meta = dataP.getParseMeta();
boolean index = false;
for (String key : meta.names()) {
if(key.equals("ogc_service"))
index = true;
String value = meta.get(key);
LOG.info("Adding " + url + " to NutchDocument");
doc.add(key, value);
}
/* Return the document if it is an ogc service, otherwise return null */
return index ? doc : null;
}
示例2: isTruncated
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
* Checks if the page's content is truncated.
*
* @param content
* @return If the page is truncated <code>true</code>. When it is not, or when
* it could be determined, <code>false</code>.
*/
public static boolean isTruncated(Content content) {
byte[] contentBytes = content.getContent();
if (contentBytes == null)
return false;
Metadata metadata = content.getMetadata();
if (metadata == null)
return false;
String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null)
lengthStr = lengthStr.trim();
if (StringUtil.isEmpty(lengthStr)) {
return false;
}
int inHeaderSize;
String url = content.getUrl();
try {
inHeaderSize = Integer.parseInt(lengthStr);
} catch (NumberFormatException e) {
LOG.warn("Wrong contentlength format for " + url, e);
return false;
}
int actualSize = contentBytes.length;
if (inHeaderSize > actualSize) {
LOG.info(url + " skipped. Content of size " + inHeaderSize
+ " was truncated to " + actualSize);
return true;
}
if (LOG.isDebugEnabled()) {
LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+ inHeaderSize);
}
return false;
}
示例3: getLanguageFromMetadata
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
private static String getLanguageFromMetadata(Metadata meta) {
if (meta == null)
return null;
// dublin core
String lang = meta.get("dc.language");
if (lang != null)
return lang;
// meta content-language
lang = meta.get("content-language");
if (lang != null)
return lang;
// lang attribute
return meta.get("lang");
}
示例4: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
Metadata metadata = parse.getData().getParseMeta();
// index the license
String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
if (licenseUrl != null) {
if (LOG.isInfoEnabled()) {
LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
}
// add the entire license as cc:license=xxx
addFeature(doc, "license=" + licenseUrl);
// index license attributes extracted of the license url
addUrlFeatures(doc, licenseUrl);
}
// index the license location as cc:meta=xxx
String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
if (licenseLocation != null) {
addFeature(doc, "meta=" + licenseLocation);
}
// index the work type cc:type=xxx
String workType = metadata.get(CreativeCommons.WORK_TYPE);
if (workType != null) {
addFeature(doc, workType);
}
return doc;
}
示例5: isTruncated
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
* Checks if the page's content is truncated.
* @param content
* @return If the page is truncated <code>true</code>. When it is not,
* or when it could be determined, <code>false</code>.
*/
public static boolean isTruncated(Content content) {
byte[] contentBytes = content.getContent();
if (contentBytes == null) return false;
Metadata metadata = content.getMetadata();
if (metadata == null) return false;
String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null) lengthStr=lengthStr.trim();
if (StringUtil.isEmpty(lengthStr)) {
return false;
}
int inHeaderSize;
String url = content.getUrl();
try {
inHeaderSize = Integer.parseInt(lengthStr);
} catch (NumberFormatException e) {
LOG.warn("Wrong contentlength format for " + url, e);
return false;
}
int actualSize = contentBytes.length;
if (inHeaderSize > actualSize) {
LOG.info(url + " skipped. Content of size " + inHeaderSize
+ " was truncated to " + actualSize);
return true;
}
if (LOG.isDebugEnabled()) {
LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
}
return false;
}
示例6: getLanguageFromMetadata
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
private static String getLanguageFromMetadata(Metadata meta) {
if (meta == null)
return null;
// dublin core
String lang = meta.get("dc.language");
if (lang != null)
return lang;
// meta content-language
lang = meta.get("content-language");
if (lang != null)
return lang;
// lang attribute
return meta.get("lang");
}
示例7: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
Metadata metadata = parse.getData().getParseMeta();
// index the license
String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
if (licenseUrl != null) {
if (LOG.isInfoEnabled()) {
LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
}
// add the entire license as cc:license=xxx
addFeature(doc, "license=" + licenseUrl);
// index license attributes extracted of the license url
addUrlFeatures(doc, licenseUrl);
}
// index the license location as cc:meta=xxx
String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
if (licenseLocation != null) {
addFeature(doc, "meta=" + licenseLocation);
}
// index the work type cc:type=xxx
String workType = metadata.get(CreativeCommons.WORK_TYPE);
if (workType != null) {
addFeature(doc, workType);
}
return doc;
}
示例8: filter
import org.apache.nutch.metadata.Metadata; //导入方法依赖的package包/类
/**
* Adds the keywords metatag information to the document
*/
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
Metadata metadata = parse.getData().getParseMeta();
String keywordsString = metadata.get(STORE_METADATA_KEYWORDS);
if (keywordsString != null) {
String[] keywords = keywordsString.split(",");
for (String keyword : keywords) {
if (keyword.length() > 0 && !keyword.equals(" ")) {
// remove preceding spaces
while (keyword.length() > 0 && keyword.charAt(0) == ' ') {
keyword = keyword.substring(1);
}
if(keyword.length() > 0) {
doc.add(HTML_METATAG_KEYWORDS, keyword);
}
}
}
}
return doc;
}