当前位置: 首页>>代码示例>>Java>>正文


Java Metadata类代码示例

本文整理汇总了 Java 中 org.apache.nutch.metadata.Metadata 的典型用法代码示例。如果您正苦于以下问题:Java Metadata 类的具体用法?Java Metadata 怎么用?Java Metadata 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


Metadata类属于org.apache.nutch.metadata包,在下文中一共展示了Metadata类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: Content

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Builds a content record from a URL, its base, the raw bytes and metadata.
 *
 * @param url         the content URL; must not be null
 * @param base        the base URL; must not be null
 * @param content     the raw bytes; must not be null
 * @param contentType the content type hint passed on to getContentType
 * @param metadata    the metadata to attach; must not be null
 * @param conf        configuration used to create the MimeUtil instance
 * @throws IllegalArgumentException
 *           if url, base, content or metadata is null (checked in that order)
 */
public Content(String url, String base, byte[] content, String contentType,
    Metadata metadata, Configuration conf) {

  // Reject missing mandatory arguments before touching any field.
  if (url == null) {
    throw new IllegalArgumentException("null url");
  }
  if (base == null) {
    throw new IllegalArgumentException("null base");
  }
  if (content == null) {
    throw new IllegalArgumentException("null content");
  }
  if (metadata == null) {
    throw new IllegalArgumentException("null metadata");
  }

  this.metadata = metadata;
  this.content = content;
  this.base = base;
  this.url = url;

  // mimeTypes is set up before the content type is resolved.
  this.mimeTypes = new MimeUtil(conf);
  this.contentType = getContentType(contentType, url, content);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:Content.java

示例2: getCommonCrawlFormat

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Creates the {@link CommonCrawlFormat} implementation selected by name.
 * <p>
 * The name is compared case-insensitively against {@code "jackson"},
 * {@code "jettinson"} and {@code "simple"}.
 *
 * @param formatType the type of formatter to be created.
 * @param url the url.
 * @param content the content.
 * @param metadata the metadata.
 * @param nutchConf the configuration.
 * @param config the CommonCrawl output configuration.
 * @return the new {@link CommonCrawlFormat} object, or {@code null} when
 *         {@code formatType} is {@code null} or not one of the names above.
 * @throws IOException If any I/O error occurs.
 * @deprecated marked deprecated by the original author; the replacement is not named here.
 */
public static CommonCrawlFormat getCommonCrawlFormat(
		String formatType,
		String url,
		Content content,
		Metadata metadata,
		Configuration nutchConf,
		CommonCrawlConfig config) throws IOException {
	if (formatType == null) {
		return null;
	}
	if ("jackson".equalsIgnoreCase(formatType)) {
		return new CommonCrawlFormatJackson(url, content, metadata, nutchConf, config);
	}
	if ("jettinson".equalsIgnoreCase(formatType)) {
		return new CommonCrawlFormatJettinson(url, content, metadata, nutchConf, config);
	}
	if ("simple".equalsIgnoreCase(formatType)) {
		return new CommonCrawlFormatSimple(url, content, metadata, nutchConf, config);
	}
	// Unknown formatter name.
	return null;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:30,代码来源:CommonCrawlFormatFactory.java

示例3: filter

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Collects the rel-tags found in the HTML document and adds each one to the
 * parse metadata under the REL_TAG key.
 *
 * @return the same parseResult instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {

  // Parse entry for this content's URL.
  Parse parse = parseResult.get(content.getUrl());

  // Extract the rel-tags from the document fragment.
  Set<?> relTags = new Parser(doc).getRelTags();

  Metadata parseMeta = parse.getData().getParseMeta();
  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }

  return parseResult;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:19,代码来源:RelTagParser.java

示例4: addTime

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Adds time fields to the document.
 * <p>
 * When the parse data has a Last-Modified entry, its value is converted with
 * getTime and added as "lastModified". The "date" field is always added; it
 * uses that same time unless it is -1, in which case it falls back to the
 * datum's modified time and then, if that is not positive, to its fetch time.
 *
 * @return the doc that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  long timestamp = -1;

  final String lastModifiedValue = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModifiedValue != null) {
    // NOTE(review): if getTime yields -1 the "lastModified" field is still
    // added with that value - confirm this is intended.
    timestamp = getTime(lastModifiedValue, url);
    doc.add("lastModified", new Date(timestamp));
  }

  if (timestamp == -1) {
    // No usable Last-Modified time: prefer the CrawlDatum's modified time,
    // and use the fetch time when that is unset (<= 0).
    final long modifiedTime = datum.getModifiedTime();
    timestamp = (modifiedTime <= 0) ? datum.getFetchTime() : modifiedTime;
  }

  doc.add("date", new Date(timestamp));
  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:24,代码来源:MoreIndexingFilter.java

示例5: main

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Command-line entry point: reads a local file, parses it as "text/html" with
 * HtmlParser and prints the resulting parse data and text to standard output.
 *
 * @param args
 *          args[0] is the path of the local file to parse; extra arguments
 *          are ignored
 * @throws Exception
 *           if reading the file or parsing fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 1) {
    System.err.println("Usage: HtmlParser <file>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
      new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
      url);
  System.out.println("data: " + parse.getData());

  System.out.println("text: " + parse.getText());

}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:HtmlParser.java

示例6: main

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs a command line version of this {@link Parser}.
 * <p>
 * The feed file is read fully into memory, parsed as "application/rss+xml",
 * and the key, parse data and text of every entry in the result are printed
 * to standard output.
 * 
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 * 
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:34,代码来源:FeedParser.java

示例7: filter

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Adds a "lang" field to the document.
 * <p>
 * The value comes from the parse metadata (Metadata.LANGUAGE); when that is
 * absent, from the content metadata (Response.CONTENT_LANGUAGE). A missing or
 * empty value is indexed as "unknown".
 *
 * @return the doc that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  // First choice: language recorded in the parse metadata.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: the Content-Language entry of the content metadata.
  if (language == null) {
    language = parse.getData().getContentMeta()
        .get(Response.CONTENT_LANGUAGE);
  }

  final boolean languageKnown = language != null && language.length() > 0;
  doc.add("lang", languageKnown ? language : "unknown");

  return doc;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:LanguageIndexingFilter.java

示例8: testMetaHTMLParsing

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Parses every test document and checks that the language stored in its
 * parse metadata equals the expected entry at the same index of
 * metalanguages. Any exception is printed and reported as a test failure.
 **/
@Test
public void testMetaHTMLParsing() {

  try {
    final ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    int idx = 0;
    while (idx < docs.length) {
      final Content doc = getContent(docs[idx]);
      final Parse parsed = parseUtil.parse(doc).get(doc.getUrl());
      Assert.assertEquals(metalanguages[idx],
          (String) parsed.getData().getParseMeta().get(Metadata.LANGUAGE));
      idx++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    Assert.fail(e.toString());
  }

}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestHTMLLanguageParser.java

示例9: pageTest

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Parses the given file as "text/html" and asserts the license values found
 * in the resulting parse metadata.
 *
 * @param file     the file whose bytes are parsed
 * @param url      the URL used both as content URL and base
 * @param license  expected value of the "License-Url" metadata entry
 * @param location expected value of the "License-Location" metadata entry
 * @param type     expected value of the "Work-Type" metadata entry
 * @throws Exception if reading the file or parsing fails
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {

  String contentType = "text/html";
  byte[] bytes;
  InputStream in = new FileInputStream(file);
  try {
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
    bytes = out.toByteArray();
  } finally {
    // Close the stream even when reading fails, so the file handle is not leaked.
    in.close();
  }
  Configuration conf = NutchConfiguration.create();

  Content content = new Content(url, url, bytes, contentType, new Metadata(),
      conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:TestCCParseFilter.java

示例10: main

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Command-line entry point: reads a SWF file, parses it with SWFParser and
 * prints the parse text and parse data to standard output.
 *
 * @param args
 *          args[0] is the name of the input SWF file; extra arguments are
 *          ignored
 * @throws IOException
 *           if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.println("Usage: SWFParser <swf-file>");
    System.exit(1);
  }

  byte[] buf;
  FileInputStream in = new FileInputStream(args[0]);
  try {
    // Read until EOF: available() is only an estimate and a single read()
    // call may return fewer bytes than requested.
    java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    int count;
    while ((count = in.read(chunk)) != -1) {
      collected.write(chunk, 0, count);
    }
    buf = collected.toByteArray();
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
      "file:" + args[0], buf, "application/x-shockwave-flash",
      new Metadata(), NutchConfiguration.create()));
  Parse p = parseResult.get("file:" + args[0]);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:SWFParser.java

示例11: testPositiveFilter

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs the regex parse filter over a sample page and expects both the
 * "first" and "second" parse-metadata entries to be "true".
 */
public void testPositiveFilter() throws Exception {
  final Configuration conf = NutchConfiguration.create();

  // Filter configured from the sample rules file.
  final RegexParseFilter regexFilter =
      new RegexParseFilter(SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  regexFilter.setConf(conf);

  final String url = "http://nutch.apache.org/";
  final String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
  final byte[] htmlBytes = html.getBytes("UTF-8");
  final Content content =
      new Content(url, url, htmlBytes, "text/html", new Metadata(), conf);
  final Parse parse =
      new ParseImpl("nutch this is the extracted text blablabla", new ParseData());

  regexFilter.filter(content, ParseResult.createParseResult(url, parse), null, null);

  // The assertions read the metadata of the Parse object created above.
  final Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("true", parseMeta.get("first"));
  assertEquals("true", parseMeta.get("second"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:TestRegexParseFilter.java

示例12: testNegativeFilter

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs the regex parse filter over a sample page and expects both the
 * "first" and "second" parse-metadata entries to be "false".
 */
public void testNegativeFilter() throws Exception {
  final Configuration conf = NutchConfiguration.create();

  // Filter configured from the sample rules file.
  final RegexParseFilter regexFilter =
      new RegexParseFilter(SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  regexFilter.setConf(conf);

  final String url = "http://nutch.apache.org/";
  final String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
  final byte[] htmlBytes = html.getBytes("UTF-8");
  final Content content =
      new Content(url, url, htmlBytes, "text/html", new Metadata(), conf);
  final Parse parse =
      new ParseImpl("nutch this is the extracted text bla", new ParseData());

  regexFilter.filter(content, ParseResult.createParseResult(url, parse), null, null);

  // The assertions read the metadata of the Parse object created above.
  final Metadata parseMeta = parse.getData().getParseMeta();
  assertEquals("false", parseMeta.get("first"));
  assertEquals("false", parseMeta.get("second"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:TestRegexParseFilter.java

示例13: filter

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Copies every parse-metadata name and the value returned by
 * {@code meta.get(name)} into the document, logging once per entry.
 *
 * @return the document when the parse metadata has an "ogc_service" entry,
 *         otherwise {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
		throws IndexingException {
	Metadata parseMeta = parse.getData().getParseMeta();
	boolean isOgcService = false;

	for (String name : parseMeta.names()) {
		// Remember whether any entry marks this page as an OGC service.
		isOgcService |= name.equals("ogc_service");
		String fieldValue = parseMeta.get(name);
		LOG.info("Adding " + url + " to NutchDocument");
		doc.add(name, fieldValue);
	}

	if (isOgcService) {
		return doc;
	}
	return null;
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:18,代码来源:OgcIndexingFilter.java

示例14: testWMS

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs OgcParseFilter over the testWMS.xml resource and checks the
 * "ogc_version" and "ogc_service" entries written to the parse metadata.
 */
@Test
public void testWMS() throws FileNotFoundException, URISyntaxException {
	// Setup
	File f = new File(getClass().getResource("testWMS.xml").toURI());
	String contentValue;
	Scanner scanner = new Scanner(f);
	try {
		// "\\Z" only matches at end of input, so next() returns the file as one token.
		contentValue = scanner.useDelimiter("\\Z").next();
	} finally {
		// Close the scanner so the underlying file handle is released.
		scanner.close();
	}
	String url = "http://wms.magrama.es/sig/Agricultura/TurcSecano/wms.aspx?request=GetCapabilities&service=WMS";
	ParseResult testParseResult = createParseResultWithMetadata(new Metadata(), url);
	Content testContent = createContent(url, contentValue);

	OgcParseFilter parseFilter = new OgcParseFilter();

	// Filter
	ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);

	// Checks
	Metadata metadata = res.get(url).getData().getParseMeta();
	assertEquals("1.3.0", metadata.get("ogc_version"));
	assertEquals("wms", metadata.get("ogc_service"));

}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:OgcParseFilterTest.java

示例15: testATOM

import org.apache.nutch.metadata.Metadata; //导入依赖的package包/类
/**
 * Runs OgcParseFilter over the testATOM.xml resource and checks the
 * "ogc_version" and "ogc_service" entries written to the parse metadata.
 */
@Test
public void testATOM() throws FileNotFoundException, URISyntaxException {
	// Setup
	File f = new File(getClass().getResource("testATOM.xml").toURI());
	String contentValue;
	Scanner scanner = new Scanner(f);
	try {
		// "\\Z" only matches at end of input, so next() returns the file as one token.
		contentValue = scanner.useDelimiter("\\Z").next();
	} finally {
		// Close the scanner so the underlying file handle is released.
		scanner.close();
	}
	String url = "http://www.magrama.gob.es/ide/inspire/atom/CategCalidadEvalAmbiental/downloadservice.xml";
	ParseResult testParseResult = createParseResultWithMetadata(new Metadata(), url);
	Content testContent = createContent(url, contentValue);

	OgcParseFilter parseFilter = new OgcParseFilter();

	// Filter
	ParseResult res = parseFilter.filter(testContent, testParseResult, null, null);

	// Checks
	Metadata metadata = res.get(url).getData().getParseMeta();
	assertEquals("1.0", metadata.get("ogc_version"));
	assertEquals("atom", metadata.get("ogc_service"));

}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:OgcParseFilterTest.java


注:本文中的org.apache.nutch.metadata.Metadata类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。