本文整理汇总了Java中org.apache.nutch.metadata.Metadata类的典型用法代码示例。如果您正苦于以下问题：Java Metadata类的具体用法？Java Metadata怎么用？Java Metadata使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
Metadata类属于org.apache.nutch.metadata包，在下文中一共展示了Metadata类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: Content
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Builds a content record from its URL, base URL, raw bytes and metadata.
 *
 * @param url         the URL of the content; must not be {@code null}
 * @param base        the base URL; must not be {@code null}
 * @param content     the raw bytes; must not be {@code null}
 * @param contentType the content type hint, resolved through
 *                    {@code getContentType} together with the URL and bytes
 * @param metadata    the metadata to attach; must not be {@code null}
 * @param conf        the configuration used to create the {@code MimeUtil}
 * @throws IllegalArgumentException if {@code url}, {@code base},
 *         {@code content} or {@code metadata} is {@code null}
 */
public Content(String url, String base, byte[] content, String contentType,
    Metadata metadata, Configuration conf) {
  // Validate the mandatory arguments in declaration order so the first
  // missing one is the one that gets reported.
  if (url == null) {
    throw new IllegalArgumentException("null url");
  }
  if (base == null) {
    throw new IllegalArgumentException("null base");
  }
  if (content == null) {
    throw new IllegalArgumentException("null content");
  }
  if (metadata == null) {
    throw new IllegalArgumentException("null metadata");
  }

  this.url = url;
  this.base = base;
  this.content = content;
  this.metadata = metadata;

  // mimeTypes is assigned before the content type is resolved, keeping the
  // original statement order.
  this.mimeTypes = new MimeUtil(conf);
  this.contentType = getContentType(contentType, url, content);
}
示例2: getCommonCrawlFormat
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Creates the {@link CommonCrawlFormat} implementation selected by name.
 * <p>
 * The name is compared case-insensitively against {@code "jackson"},
 * {@code "jettinson"} and {@code "simple"}.
 *
 * @param formatType the type of formatter to be created; may be {@code null}.
 * @param url the url.
 * @param content the content.
 * @param metadata the metadata.
 * @param nutchConf the configuration.
 * @param config the CommonCrawl output configuration.
 * @return the new {@link CommonCrawlFormat} object, or {@code null} when
 *         {@code formatType} is {@code null} or matches none of the known names.
 * @throws IOException If any I/O error occurs.
 * @deprecated No replacement is documented alongside this method.
 */
public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
  // Literal-first comparisons are null-safe: a null formatType matches
  // nothing and falls through to the final "return null".
  if ("jackson".equalsIgnoreCase(formatType)) {
    return new CommonCrawlFormatJackson(url, content, metadata, nutchConf, config);
  }
  if ("jettinson".equalsIgnoreCase(formatType)) {
    return new CommonCrawlFormatJettinson(url, content, metadata, nutchConf, config);
  }
  if ("simple".equalsIgnoreCase(formatType)) {
    return new CommonCrawlFormatSimple(url, content, metadata, nutchConf, config);
  }
  return null;
}
示例3: filter
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Scans the HTML document for rel-tags and records each one under
 * {@code REL_TAG} in the parse metadata of the entry stored for the
 * content's URL.
 *
 * @param content     the content whose URL selects the parse entry
 * @param parseResult the parse result to enrich; returned unchanged as a reference
 * @param metaTags    not used by this method
 * @param doc         the document fragment handed to the rel-tag parser
 * @return the same {@code parseResult} instance
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
  // Parse entry registered under this content's URL.
  Parse parse = parseResult.get(content.getUrl());

  // Extract the document's rel-tags.
  Set<?> relTags = new Parser(doc).getRelTags();

  Metadata parseMeta = parse.getData().getParseMeta();
  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }
  return parseResult;
}
示例4: addTime
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Adds the "lastModified" (when a Last-Modified value is present in the parse
 * data) and "date" fields to the document.
 * <p>
 * The "date" value is chosen in this order: the time derived from the
 * Last-Modified value, the datum's modified time, then the datum's fetch time.
 *
 * @param doc   the document to add the fields to
 * @param data  the parse data queried for {@code Metadata.LAST_MODIFIED}
 * @param url   the URL passed on to {@code getTime}
 * @param datum the crawl datum supplying the fallback times
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  final String lastModified = data.getMeta(Metadata.LAST_MODIFIED);

  long time;
  if (lastModified == null) {
    // No Last-Modified value: mark the time as unset.
    time = -1;
  } else {
    time = getTime(lastModified, url);
    // NOTE(review): if getTime can itself return -1, the field below is still
    // added with that value - confirm this is intended.
    doc.add("lastModified", new Date(time));
  }

  if (time == -1) {
    // Fall back to the CrawlDatum: modified time first, fetch time if that
    // is unset (<= 0).
    long modifiedTime = datum.getModifiedTime();
    time = modifiedTime > 0 ? modifiedTime : datum.getFetchTime();
  }

  doc.add("date", new Date(time));
  return doc;
}
示例5: main
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Command-line entry point: reads a local file, parses it as
 * {@code text/html} with {@link HtmlParser} and prints the resulting parse
 * data and text to standard output.
 *
 * @param args {@code args[0]} is the path of the file to parse; there is no
 *             argument-count check, so a missing argument raises
 *             {@link ArrayIndexOutOfBoundsException}
 * @throws Exception if reading or parsing the file fails
 */
public static void main(String[] args) throws Exception {
  String name = args[0];
  String url = "file:" + name;
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  // try-with-resources guarantees the stream is closed, even when
  // readFully fails part-way through.
  try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
    in.readFully(bytes);
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
      new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
      url);
  System.out.println("data: " + parse.getData());
  System.out.println("text: " + parse.getText());
}
示例6: main
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs a command line version of this {@link Parser}.
 * <p>
 * Prints a usage message and exits with status 1 unless exactly one argument
 * is given. Otherwise the file is parsed as {@code application/rss+xml} and
 * the key, parse data and text of every entry in the result are printed.
 *
 * @param args
 *          A single argument (expected at args[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  // try-with-resources guarantees the stream is closed, even when
  // readFully fails part-way through.
  try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
    in.readFully(bytes);
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
示例7: filter
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Adds a "lang" field to the document.
 * <p>
 * The value is taken from the parse metadata ({@code Metadata.LANGUAGE});
 * when that is absent, from the content metadata
 * ({@code Response.CONTENT_LANGUAGE}); and when the result is still
 * {@code null} or empty, the literal {@code "unknown"} is used.
 *
 * @param doc     the document to add the field to
 * @param parse   the parse whose metadata is inspected
 * @param url     not used by this method
 * @param datum   not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // First choice: LANGUAGE in the parse metadata, possibly put there by
  // HTMLLanguageParser.
  String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: the language announced by the HTTP header.
  if (lang == null) {
    lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = lang == null || lang.length() == 0;
  doc.add("lang", undetermined ? "unknown" : lang);
  return doc;
}
示例8: testMetaHTMLParsing
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Parses every test document and checks that the language stored under
 * {@code Metadata.LANGUAGE} in its parse metadata equals the expected entry
 * of {@code metalanguages}. Any exception is printed to standard output and
 * turned into a test failure.
 **/
@Test
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // Documents and expected languages are parallel arrays, so an index
    // loop is required.
    for (int i = 0; i < docs.length; i++) {
      Content content = getContent(docs[i]);
      Parse parse = parseUtil.parse(content).get(content.getUrl());
      String detected = (String) parse.getData().getParseMeta()
          .get(Metadata.LANGUAGE);
      Assert.assertEquals(metalanguages[i], detected);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    Assert.fail(e.toString());
  }
}
示例9: pageTest
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Parses the given file as {@code text/html} and asserts the license
 * metadata found in the parse result.
 *
 * @param file     the file whose bytes are parsed
 * @param url      the URL used both as content URL and base URL
 * @param license  expected value of the "License-Url" metadata entry
 * @param location expected value of the "License-Location" metadata entry
 * @param type     expected value of the "Work-Type" metadata entry
 * @throws Exception if reading or parsing the file fails
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {
  String contentType = "text/html";

  byte[] bytes;
  // try-with-resources closes the input stream even when a read or write
  // fails; the original only closed it on the success path.
  try (InputStream in = new FileInputStream(file)) {
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    byte[] buffer = new byte[1024];
    int read;
    while ((read = in.read(buffer)) != -1) {
      out.write(buffer, 0, read);
    }
    bytes = out.toByteArray();
  }

  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(),
      conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
示例10: main
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Command-line entry point: parses a local SWF file with {@code SWFParser}
 * and prints the parse text and parse data to standard output.
 * <p>
 * Arguments are: 0. Name of input SWF file.
 *
 * @param args {@code args[0]} is the path of the SWF file; there is no
 *             argument-count check, so a missing argument raises
 *             {@link ArrayIndexOutOfBoundsException}
 * @throws IOException if the file cannot be read
 */
public static void main(String[] args) throws IOException {
  String url = "file:" + args[0];
  // Read the whole file. The previous code sized the buffer with
  // available() and issued a single read(), neither of which is guaranteed
  // to cover the full file, and it leaked the stream on a read failure.
  byte[] buf = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(args[0]));
  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content(url,
      url, buf, "application/x-shockwave-flash",
      new Metadata(), NutchConfiguration.create()));
  Parse p = parseResult.get(url);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
示例11: testPositiveFilter
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs the regex parse filter over a sample page and expects both the
 * "first" and "second" parse-metadata entries to be "true".
 *
 * @throws Exception if the filter or content cannot be set up
 */
public void testPositiveFilter() throws Exception {
  final String url = "http://nutch.apache.org/";
  final String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";

  Configuration conf = NutchConfiguration.create();

  // Filter configured from the sample rules file.
  RegexParseFilter filter = new RegexParseFilter(
      SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  filter.setConf(conf);

  Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
  Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());

  // The returned ParseResult is not needed: the assertions read the
  // metadata from the Parse instance that was handed in.
  filter.filter(content, ParseResult.createParseResult(url, parse), null, null);

  Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("true", parseMeta.get("first"));
  assertEquals("true", parseMeta.get("second"));
}
示例12: testNegativeFilter
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs the regex parse filter over a sample page and expects both the
 * "first" and "second" parse-metadata entries to be "false".
 *
 * @throws Exception if the filter or content cannot be set up
 */
public void testNegativeFilter() throws Exception {
  final String url = "http://nutch.apache.org/";
  final String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";

  Configuration conf = NutchConfiguration.create();

  // Filter configured from the sample rules file.
  RegexParseFilter filter = new RegexParseFilter(
      SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  filter.setConf(conf);

  Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
  Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());

  // The returned ParseResult is not needed: the assertions read the
  // metadata from the Parse instance that was handed in.
  filter.filter(content, ParseResult.createParseResult(url, parse), null, null);

  Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("false", parseMeta.get("first"));
  assertEquals("false", parseMeta.get("second"));
}
示例13: filter
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Copies every parse-metadata entry of the page into the document and keeps
 * the document only when one of the metadata names is "ogc_service".
 * <p>
 * An info message is logged once per copied entry.
 *
 * @param doc     the document receiving the metadata fields
 * @param parse   the parse whose metadata is copied
 * @param url     the URL mentioned in the log message
 * @param datum   not used by this method
 * @param inlinks not used by this method
 * @return {@code doc} when an "ogc_service" entry was seen, otherwise {@code null}
 * @throws IndexingException declared by the signature; not thrown directly here
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  Metadata parseMeta = parse.getData().getParseMeta();

  boolean isOgcService = false;
  for (String name : parseMeta.names()) {
    if (name.equals("ogc_service")) {
      isOgcService = true;
    }
    String fieldValue = parseMeta.get(name);
    LOG.info("Adding " + url + " to NutchDocument");
    doc.add(name, fieldValue);
  }

  // Only OGC services are indexed; everything else is dropped.
  if (!isOgcService) {
    return null;
  }
  return doc;
}
示例14: testWMS
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Feeds the {@code testWMS.xml} fixture through {@code OgcParseFilter} and
 * expects the parse metadata to report OGC version "1.3.0" and service "wms".
 *
 * @throws FileNotFoundException if the fixture file cannot be opened
 * @throws URISyntaxException    if the fixture resource URL is not a valid URI
 */
@Test
public void testWMS() throws FileNotFoundException, URISyntaxException {
  // Arrange
  File f = new File(getClass().getResource("testWMS.xml").toURI());
  String contentValue;
  // The "\\Z" delimiter makes next() return the file content as a single
  // token; try-with-resources closes the Scanner and releases the file.
  try (Scanner scanner = new Scanner(f)) {
    contentValue = scanner.useDelimiter("\\Z").next();
  }
  String url = "http://wms.magrama.es/sig/Agricultura/TurcSecano/wms.aspx?request=GetCapabilities&service=WMS";
  ParseResult testParseResult = createParseResultWithMetadata(new Metadata(), url);
  Content testContent = createContent(url, contentValue);
  OgcParseFilter parseFilter = new OgcParseFilter();
  // Act
  ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);
  // Assert
  Metadata metadata = res.get(url).getData().getParseMeta();
  assertEquals("1.3.0", metadata.get("ogc_version"));
  assertEquals("wms", metadata.get("ogc_service"));
}
示例15: testATOM
import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Feeds the {@code testATOM.xml} fixture through {@code OgcParseFilter} and
 * expects the parse metadata to report OGC version "1.0" and service "atom".
 *
 * @throws FileNotFoundException if the fixture file cannot be opened
 * @throws URISyntaxException    if the fixture resource URL is not a valid URI
 */
@Test
public void testATOM() throws FileNotFoundException, URISyntaxException {
  // Arrange
  File f = new File(getClass().getResource("testATOM.xml").toURI());
  String contentValue;
  // The "\\Z" delimiter makes next() return the file content as a single
  // token; try-with-resources closes the Scanner and releases the file.
  try (Scanner scanner = new Scanner(f)) {
    contentValue = scanner.useDelimiter("\\Z").next();
  }
  String url = "http://www.magrama.gob.es/ide/inspire/atom/CategCalidadEvalAmbiental/downloadservice.xml";
  ParseResult testParseResult = createParseResultWithMetadata(new Metadata(), url);
  Content testContent = createContent(url, contentValue);
  OgcParseFilter parseFilter = new OgcParseFilter();
  // Act
  ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);
  // Assert
  Metadata metadata = res.get(url).getData().getParseMeta();
  assertEquals("1.0", metadata.get("ogc_version"));
  assertEquals("atom", metadata.get("ogc_service"));
}