本文整理汇总了Java中org.apache.nutch.indexer.NutchDocument.add方法的典型用法代码示例。如果您正苦于以下问题：Java NutchDocument.add方法的具体用法？Java NutchDocument.add怎么用？Java NutchDocument.add使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.nutch.indexer.NutchDocument的用法示例。
在下文中一共展示了NutchDocument.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Copies the metatags named in the "urlmeta.tags" property from the
 * CrawlDatum into the NutchDocument.
 *
 * <p>Each name in {@code urlMetaTags} is looked up in the metadata of
 * {@code datum}; every value that is present is added to {@code doc} under
 * that same name. If {@code urlMetaTags} or {@code doc} is {@code null}, the
 * document is returned as it was passed in.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tagName : urlMetaTags) {
      Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
      if (tagValue == null) {
        // tag not recorded for this URL
        continue;
      }
      doc.add(tagName, tagValue.toString());
    }
  }
  return doc;
}
示例2: addTime
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the {@code lastModified} and {@code date} fields to the document.
 *
 * <p>{@code lastModified} is added only when the Last-Modified metadata is
 * present and {@code getTime} produced a value other than {@code -1}. The
 * {@code date} field is always added, using the first available of: the
 * Last-Modified time, the CrawlDatum's modified time (when positive), or the
 * CrawlDatum's fetch time.
 *
 * @param doc   document being built; modified in place
 * @param data  parse data holding the Last-Modified metadata
 * @param url   URL of the page, forwarded to {@code getTime}
 * @param datum crawl datum supplying the fallback times
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) { // try to parse Last-Modified
    time = getTime(lastModified, url);
    // -1 is this method's "no usable time" marker (see the fallback below).
    // NOTE(review): presumably getTime returns -1 for an unparseable header;
    // confirm. Indexing new Date(-1) as lastModified would be a bogus value,
    // so the field is only added for a real time.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) { // no usable Last-Modified value
    time = datum.getModifiedTime(); // use value in CrawlDatum
    if (time <= 0) { // also unset
      time = datum.getFetchTime(); // use the time the fetch took place
    }
  }

  doc.add("date", new Date(time));
  return doc;
}
示例3: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds a {@code lang} field to the document.
 *
 * <p>The language is read from the parse metadata first; only when that
 * entry is absent is the Content-Language content metadata consulted. A
 * missing or empty result is indexed as {@code "unknown"}.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // language possibly detected at parse time (e.g. by HTMLLanguageParser)
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  if (language == null) {
    // fall back to what the HTTP header declared
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = (language == null) || (language.length() == 0);
  doc.add("lang", undetermined ? "unknown" : language);
  return doc;
}
示例4: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Copies every parse-metadata entry into the document and keeps the document
 * only if one of the entries is named {@code ogc_service}.
 *
 * <p>An info line is logged once per metadata entry.
 *
 * @return {@code doc} when an {@code ogc_service} entry was seen, otherwise
 *         {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  Metadata parseMeta = parse.getData().getParseMeta();
  boolean isOgcService = false;

  for (String name : parseMeta.names()) {
    isOgcService |= name.equals("ogc_service");
    String entry = parseMeta.get(name);
    LOG.info("Adding " + url + " to NutchDocument");
    doc.add(name, entry);
  }

  if (isOgcService) {
    return doc;
  }
  // not an OGC service: signal that the document should be dropped
  return null;
}
示例5: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Looks up each metatag listed in the "urlmeta.tags" property inside the
 * CrawlDatum and, when found, adds it as a field of the NutchDocument.
 *
 * <p>Nothing is added when {@code urlMetaTags} or {@code doc} is
 * {@code null}; the document is then returned unchanged.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  boolean nothingToDo = (urlMetaTags == null) || (doc == null);
  if (nothingToDo) {
    return doc;
  }

  for (String name : urlMetaTags) {
    Text found = (Text) datum.getMetaData().get(new Text(name));
    if (found != null) {
      String asString = found.toString();
      doc.add(name, asString);
    }
  }
  return doc;
}
示例6: addTime
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the {@code lastModified} and {@code date} fields to the document.
 *
 * <p>{@code lastModified} is added only when the Last-Modified metadata is
 * present and {@code getTime} produced a value other than {@code -1}. The
 * {@code date} field is always added, using the Last-Modified time when
 * available and the CrawlDatum's fetch time otherwise.
 *
 * @param doc   document being built; modified in place
 * @param data  parse data holding the Last-Modified metadata
 * @param url   URL of the page, forwarded to {@code getTime}
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
    String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) { // try to parse Last-Modified
    time = getTime(lastModified, url);
    // -1 is this method's "no usable time" marker (see the fallback below).
    // NOTE(review): presumably getTime returns -1 for an unparseable header;
    // confirm. A lastModified of new Date(-1) would be a bogus value, so the
    // field is only added for a real time.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) { // no usable Last-Modified value
    time = datum.getFetchTime(); // use fetch time
  }

  doc.add("date", new Date(time));
  return doc;
}
示例7: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Indexes the page language in the {@code lang} field.
 *
 * <p>Lookup order: the LANGUAGE entry of the parse metadata, then (only if
 * that entry is {@code null}) the Content-Language entry of the content
 * metadata. If the value is still {@code null} or is empty, the literal
 * {@code "unknown"} is indexed.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  // value possibly stored by HTMLLanguageParser
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise see whether the HTTP header tells us the language
  String lang = (detected != null)
      ? detected
      : parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);

  if (lang != null && lang.length() != 0) {
    doc.add("lang", lang);
  } else {
    doc.add("lang", "unknown");
  }
  return doc;
}
示例8: addRawContent
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the page's raw content to the document as the {@code rawcontent}
 * field.
 *
 * <p>The remaining bytes of the page's content buffer are read into a single
 * string, passed through {@code StringUtil.cleanField} and indexed. When the
 * page has no content the document is returned unchanged.
 *
 * @param doc  document being built; modified in place
 * @param page page whose content buffer is read (its position is not moved)
 * @param url  URL of the page, used only for logging
 * @return the same {@code doc} instance
 */
private NutchDocument addRawContent(NutchDocument doc, WebPage page, String url) {
  ByteBuffer raw = page.getContent();
  if (raw == null) {
    return doc;
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("Html indexing for: " + url);
  }

  // View over the buffer's remaining bytes; requires an array-backed buffer.
  ByteArrayInputStream arrayInputStream = new ByteArrayInputStream(
      raw.array(), raw.arrayOffset() + raw.position(), raw.remaining());

  String data = "";
  // NOTE(review): Scanner decodes with the platform default charset here;
  // confirm whether the page's declared encoding should be used instead.
  Scanner scanner = new Scanner(arrayInputStream);
  try {
    scanner.useDelimiter("\\Z"); // read the whole content as one token
    if (scanner.hasNext()) {
      data = scanner.next();
    }
  } finally {
    // release the scanner (and the stream it wraps) on every path
    scanner.close();
  }

  doc.add("rawcontent", StringUtil.cleanField(data));
  return doc;
}
示例9: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Indexes the configured end time as a date field.
 *
 * <p>The value of {@code CONF_ENDTIME_PROPERTY} (default
 * {@code "1970-01-01T00:00:00Z"}) is parsed with the pattern
 * {@code yyyy-MM-dd'T'HH:mm:ssZ}. Because the {@code Z} pattern letter of
 * {@link SimpleDateFormat} denotes an RFC 822 offset such as {@code +0000}
 * and does not accept the literal UTC designator {@code "Z"}, a value that
 * fails to parse and ends in {@code "Z"} is retried with that suffix
 * rewritten to {@code "+0000"}. If the value still cannot be parsed, an error
 * is logged and the epoch (time 0) is indexed.
 *
 * @return the same {@code doc} instance, with {@code INDEXING_FIELD} added
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // convert ISO date to time stamp
  String isoDate = conf.get(CONF_ENDTIME_PROPERTY, "1970-01-01T00:00:00Z");
  SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");

  Date endTime = null;
  try {
    // First attempt is the original parse, so every value that used to be
    // accepted is interpreted exactly as before.
    endTime = format.parse(isoDate);
  } catch (ParseException e) {
    if (isoDate.endsWith("Z")) {
      // Literal "Z" (UTC) is not understood by the 'Z' pattern letter;
      // express it as the equivalent RFC 822 offset and try once more.
      String withOffset = isoDate.substring(0, isoDate.length() - 1) + "+0000";
      try {
        endTime = format.parse(withOffset);
      } catch (ParseException ignored) {
        // still unparseable: reported below
      }
    }
    if (endTime == null) {
      LOG.error("ERROR! Cannot parse date, must fit pattern yyyy-MM-dd'T'HH:mm:ssZ : " + isoDate);
    }
  }

  long epoch = (endTime == null) ? 0 : endTime.getTime();

  // Index the endtime
  doc.add(INDEXING_FIELD, new Date(epoch));
  return doc;
}
示例10: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the page metadata entry named by {@code storageField} to the
 * document, decoded as UTF-8, under the same field name.
 *
 * <p>Nothing is added when {@code storageField} is {@code null}, when the
 * page has no metadata entry for it, or when the bytes are not valid UTF-8
 * (the decoding error is printed and the document is returned as is).
 *
 * @param document document being built; modified in place
 * @param s        not used by this filter
 * @param webPage  page whose metadata is read
 * @return the same {@code document} instance
 */
@Override
public NutchDocument filter(NutchDocument document, String s, WebPage webPage) throws IndexingException {
  if (storageField == null) {
    return document;
  }

  java.nio.ByteBuffer stored = webPage.getMetadata().get(new Utf8(storageField));
  if (stored == null) {
    // no such metadata on this page; decoding null would throw
    return document;
  }

  CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
  try {
    // Decode a duplicate so the position of the buffer held by the page's
    // metadata is not advanced by this read.
    String strippedContent = decoder.decode(stored.duplicate()).toString();
    document.add(storageField, strippedContent);
  } catch (CharacterCodingException e) {
    e.printStackTrace();
  }
  return document;
}
示例11: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds one {@code tag} field per Rel-Tag value found in the parse metadata.
 *
 * <p>The values are read from the {@code RelTagParser.REL_TAG} entry; when
 * that lookup yields {@code null} the document is returned unchanged.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Rel-Tags, possibly put there by RelTagParser
  String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}
示例12: filter
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds a {@code tld} field holding the domain of the URL's domain suffix.
 *
 * <p>Any exception raised while building the URL, resolving its suffix or
 * adding the field is caught and logged as a warning; the document is
 * returned either way.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    String tld = suffix.getDomain();
    doc.add("tld", tld);
  } catch (Exception ex) {
    // best effort: a failure here only costs the tld field
    LOG.warn(ex.toString());
  }
  return doc;
}
示例13: addLength
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Adds the Content-Length metadata value, trimmed, as the
 * {@code contentLength} field.
 *
 * <p>The field is skipped when the metadata is absent or blank after
 * trimming.
 *
 * @param doc  document being built; modified in place
 * @param data parse data holding the Content-Length metadata
 * @param url  not used by this method
 * @return the same {@code doc} instance
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010: the header value may carry surrounding whitespace
  String length = rawLength.trim();
  if (length.isEmpty()) {
    return doc;
  }

  doc.add("contentLength", length);
  return doc;
}
示例14: resetTitle
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * Sets the {@code title} field from the Content-Disposition metadata when
 * the document has no title yet.
 *
 * <p>The patterns in {@code patterns} are tried in order; group 1 of the
 * first one that matches becomes the title. Nothing happens when the
 * metadata is absent or the document already has a {@code title} value.
 *
 * @param doc  document being built; modified in place
 * @param data parse data holding the Content-Disposition metadata
 * @param url  not used by this method
 * @return the same {@code doc} instance
 */
private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
  String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);

  boolean titleNeeded = contentDisposition != null && doc.getFieldValue("title") == null;
  if (titleNeeded) {
    for (int idx = 0; idx < patterns.length; idx++) {
      Matcher candidate = patterns[idx].matcher(contentDisposition);
      if (!candidate.find()) {
        continue;
      }
      // first matching pattern wins
      doc.add("title", candidate.group(1));
      return doc;
    }
  }
  return doc;
}
示例15: addSubCollectionField
import org.apache.nutch.indexer.NutchDocument; //导入方法依赖的package包/类
/**
 * "Marks" the document as belonging to every subcollection that the
 * collection manager returns for the given URL.
 *
 * <p>For each such subcollection its name is added to the document, under
 * the subcollection's own key when it has one and under {@code fieldName}
 * otherwise.
 *
 * @param doc document to mark; modified in place
 * @param url URL used to look up the subcollections
 */
private void addSubCollectionField(NutchDocument doc, String url) {
  CollectionManager manager = CollectionManager.getCollectionManager(getConf());
  for (Subcollection sub : manager.getSubCollections(url)) {
    String targetField = (sub.getKey() == null) ? fieldName : sub.getKey();
    doc.add(targetField, sub.getName());
  }
}