当前位置: 首页>>代码示例>>Java>>正文


Java ParseException类代码示例

本文整理汇总了Java中org.apache.nutch.parse.ParseException的典型用法代码示例。如果您正苦于以下问题：Java ParseException类的具体用法？Java ParseException怎么用？Java ParseException使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


ParseException类属于org.apache.nutch.parse包，在下文中一共展示了ParseException类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Fetches every entry of {@code sampleFiles} through its {@code file:} URL,
 * parses it with {@code ParseUtil.parse}, and checks that the
 * whitespace-normalised text equals the matching entry of {@code sampleTexts}.
 *
 * @throws ProtocolException propagated unchanged so the test fails
 * @throws ParseException propagated unchanged so the test fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  // Both helpers are built from conf only, so create them once rather than
  // once per sample file.
  ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = protocolFactory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();

    Parse parse = parseUtil.parse(content).get(content.getUrl());

    // Collapse whitespace runs so layout differences do not affect the
    // comparison.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // assertEquals (instead of assertTrue(a.equals(b))) reports both the
    // expected and the actual text when the comparison fails.
    Assert.assertEquals("Unexpected text for " + sampleFiles[i],
        sampleTexts[i], text);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestSWFParser.java

示例2: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with the "parse-tika" extension
 * and verifies the "width" and "height" values stored in the parse metadata.
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String url = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // A fresh configuration is created for every sample file.
    final Configuration configuration = NutchConfiguration.create();
    final Protocol fetcher = new ProtocolFactory(configuration)
        .getProtocol(url);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(url), new CrawlDatum())
        .getContent();
    final Parse parsed = new ParseUtil(configuration)
        .parseByExtensionId("parse-tika", fetched)
        .get(fetched.getUrl());

    Assert.assertEquals("121", parsed.getData().getMeta("width"));
    Assert.assertEquals("48", parsed.getData().getMeta("height"));
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestImageMetadata.java

示例3: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with the "parse-tika" extension
 * and checks that the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException propagated unchanged so the test fails
 * @throws ParseException propagated unchanged so the test fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());

    String text = parse.getText();
    // indexOf returns 0 when the text *starts with* expectedText, so "found"
    // is ">= 0"; the previous "> 0" check rejected a match at offset 0.
    Assert.assertTrue("expected text not found in " + sampleFiles[i]
        + ": '" + text + "'", text.indexOf(expectedText) >= 0);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestPdfParser.java

示例4: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with the "parse-zip" extension and
 * checks that the extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException propagated unchanged so the test fails
 * @throws ParseException propagated unchanged so the test fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  // Both helpers are built from conf only, so create them once rather than
  // once per sample file.
  ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = protocolFactory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = parseUtil.parseByExtensionId("parse-zip", content).get(
        content.getUrl());

    // assertEquals (instead of assertTrue(a.equals(b))) reports both the
    // expected and the actual text when the comparison fails.
    Assert.assertEquals("Unexpected text for " + sampleFiles[i],
        expectedText, parse.getText());
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:TestZipParser.java

示例5: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension ten times against each of two example
 * content types ('cat' and 'md5sum') and checks the text it returns.
 * Returns without asserting anything when the "os.name" system property is
 * not "linux" (case-insensitive).
 */
public void testIt() throws ParseException {
  final String osName = System.getProperty("os.name");

  // The external commands are only exercised on linux.
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + osName + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  final String catType = "application/vnd.nutch.example.cat";
  final String md5sumType = "application/vnd.nutch.example.md5sum";
  final Configuration conf = NutchConfiguration.create();

  // Alternate between the two parsers: 10 rounds, 2 external invocations each.
  for (int round = 0; round < 10; round++) {
    // external parser that does 'cat'
    content.setContentType(catType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertEquals(expectedText, parse.getText());

    // external parser that does 'md5sum'
    content.setContentType(md5sumType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:27,代码来源:TestExtParser.java

示例6: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with {@code ParseUtil.parse} and
 * compares the whitespace-normalised text with the matching entry of
 * {@code sampleTexts}.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration configuration = NutchConfiguration.create();

  for (int n = 0; n < sampleFiles.length; n++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[n];

    final Protocol handler = new ProtocolFactory(configuration).getProtocol(location);
    final Content raw = handler
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();
    final Parse result = new ParseUtil(configuration).parse(raw).get(raw.getUrl());

    // Collapse every whitespace run to a single space before comparing.
    final String normalized = result.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[n].equals(normalized));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:20,代码来源:TestSWFParser.java

示例7: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Currently a no-op: the entire body is commented out (the note inside
 * refers to Tika-748), so this test passes without running any assertion.
 * The disabled code is kept below for reference.
 */
public void testIt() throws ProtocolException, ParseException {
  /* Temporarily disabled - see Tika-748

	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
	protocol = new ProtocolFactory(conf).getProtocol(urlString);
	content = protocol.getProtocolOutput(new Text(urlString),
			new CrawlDatum()).getContent();
	parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
			.get(content.getUrl());
	String text = parse.getText();
	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

	String title = parse.getData().getTitle();
	Metadata meta = parse.getData().getParseMeta();

	// METADATA extraction is not yet supported in Tika
	// assertEquals("test rft document", title);
	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
 */
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:27,代码来源:TestRTFParser.java

示例8: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with the "parse-tika" extension
 * and checks that the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException propagated unchanged so the test fails
 * @throws ParseException propagated unchanged so the test fails
 */
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    String text = parse.getText();
    // indexOf returns 0 when the text *starts with* expectedText, so "found"
    // is ">= 0"; the previous "> 0" check rejected a match at offset 0.
    assertTrue("expected text not found in " + sampleFiles[i] + ": '" + text + "'",
        text.indexOf(expectedText) >= 0);
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:19,代码来源:TestPdfParser.java

示例9: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each entry of {@code sampleFiles} with the "parse-zip" extension and
 * checks that the extracted text equals {@code expectedText}.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration configuration = NutchConfiguration.create();

  for (int n = 0; n < sampleFiles.length; n++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[n];

    final Protocol handler = new ProtocolFactory(configuration).getProtocol(location);
    final Content archive = handler
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();
    final Parse result = new ParseUtil(configuration)
        .parseByExtensionId("parse-zip", archive)
        .get(archive.getUrl());

    assertTrue(result.getText().equals(expectedText));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:17,代码来源:TestZipParser.java

示例10: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension ten times against each of two example
 * content types ('cat' and 'md5sum') and checks the text it returns.
 * Returns without asserting anything when the "os.name" system property is
 * not "linux" (case-insensitive).
 */
@Test
public void testIt() throws ParseException {
  final String osName = System.getProperty("os.name");

  // The external commands are only exercised on linux.
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + osName + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  final String catType = "application/vnd.nutch.example.cat";
  final String md5sumType = "application/vnd.nutch.example.md5sum";
  final Configuration conf = NutchConfiguration.create();

  // Alternate between the two parsers: 10 rounds, 2 external invocations each.
  for (int round = 0; round < 10; round++) {
    // external parser that does 'cat'
    content.setContentType(catType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertEquals(expectedText, parse.getText());

    // external parser that does 'md5sum'
    content.setContentType(md5sumType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:31,代码来源:TestExtParser.java

示例11: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses {@code rtfFile} with the "parse-tika" extension and checks the
 * trimmed text, the title, and the DublinCore subject of the result.
 * Skipped at runtime because of the {@code @Ignore} annotation.
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  final Configuration configuration = NutchConfiguration.create();
  final String rtfUrl = "file:" + sampleDir + fileSeparator + rtfFile;

  final Protocol handler = new ProtocolFactory(configuration).getProtocol(rtfUrl);
  final Content document = handler
      .getProtocolOutput(new Text(rtfUrl), new CrawlDatum())
      .getContent();
  final Parse parsed = new ParseUtil(configuration)
      .parseByExtensionId("parse-tika", document)
      .get(document.getUrl());

  // Body text first ...
  final String body = parsed.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog",
      body.trim());

  // ... then title and subject metadata.
  final String docTitle = parsed.getData().getTitle();
  final Metadata parseMeta = parsed.getData().getParseMeta();
  Assert.assertEquals("test rft document", docTitle);
  Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:28,代码来源:TestRTFParser.java

示例12: getTextContent

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Fetches {@code fileName} from {@code sampleDir} through a {@code file:}
 * URL, parses it with the "parse-tika" extension, and returns the parsed text.
 *
 * @param fileName name of the file, relative to {@code sampleDir}
 * @return the value of {@code getText()} on the resulting parse
 */
public String getTextContent(String fileName) throws ProtocolException,
    ParseException {
  final String location = "file:" + sampleDir + fileSeparator + fileName;

  final ProtocolFactory protocols = new ProtocolFactory(conf);
  final Protocol handler = protocols.getProtocol(location);
  final Content document = handler
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();

  final ParseUtil parser = new ParseUtil(conf);
  return parser
      .parseByExtensionId("parse-tika", document)
      .get(document.getUrl())
      .getText();
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:11,代码来源:TestMSWordParser.java

示例13: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Checks that the text returned by {@code getTextContent} for every entry of
 * {@code sampleFiles} starts with {@code expectedText}.
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String extracted = getTextContent(sampleFile);
    final boolean hasExpectedPrefix = extracted.startsWith(expectedText);
    Assert.assertTrue("text found : '" + extracted + "'", hasExpectedPrefix);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:9,代码来源:TestMSWordParser.java

示例14: testOpeningDocs

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Checks that {@code getTextContent} returns non-empty text for every file
 * in {@code sampleDir} whose name ends with ".doc".
 *
 * @throws ProtocolException propagated unchanged from {@code getTextContent}
 * @throws ParseException propagated unchanged from {@code getTextContent}
 */
@Test
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // File.list() returns null (not an empty array) when the path is not a
  // directory or cannot be read; fail with a clear message instead of an NPE.
  Assert.assertNotNull("cannot list files in sample directory " + sampleDir,
      filenames);

  for (String filename : filenames) {
    if (!filename.endsWith(".doc")) {
      continue;
    }
    Assert.assertTrue("cann't read content of " + filename,
        getTextContent(filename).length() > 0);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:11,代码来源:TestMSWordParser.java

示例15: testIt

import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  Protocol protocol;
  ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : " + expectedText);

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    if (sampleFiles[i].startsWith("ootest") == false)
      continue;

    protocol = factory.getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());

    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // simply test for the presence of a text - the ordering of the elements
    // may differ from what was expected
    // in the previous tests
    Assert.assertTrue(text != null && text.length() > 0);

    System.out.println("Found " + sampleFiles[i] + ": " + text);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:34,代码来源:TestOOParser.java


注：本文中的org.apache.nutch.parse.ParseException类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。