本文整理汇总了Java中org.apache.nutch.parse.ParseException类的典型用法代码示例。如果您正苦于以下问题：Java ParseException类的具体用法？Java ParseException怎么用？Java ParseException使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
ParseException类属于org.apache.nutch.parse包,在下文中一共展示了ParseException类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Fetches every sample file through the protocol resolved for its
 * {@code file:} URL, parses the fetched content, and checks that the
 * whitespace-normalized text equals the entry of {@code sampleTexts} at the
 * same index.
 *
 * @throws ProtocolException declared by the signature; presumably raised while
 *         resolving or using the protocol (not visible from this method)
 * @throws ParseException declared by the signature; presumably raised by the
 *         parse call (not visible from this method)
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    // Collapse every run of blanks, tabs and line breaks into one space so
    // that layout differences do not affect the comparison.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    // assertEquals reports both the expected and the actual text on failure,
    // which assertTrue(expected.equals(actual)) cannot do.
    Assert.assertEquals("Unexpected text parsed from " + sampleFiles[i],
        sampleTexts[i], text);
  }
}
示例2: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each sample file with the {@code parse-tika} extension and checks
 * that the parse metadata reports a {@code width} of "121" and a
 * {@code height} of "48".
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFile;
    // A fresh configuration is created for every file, as in the original.
    final Configuration conf = NutchConfiguration.create();
    final Content fetched = new ProtocolFactory(conf).getProtocol(location)
        .getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    final Parse result = new ParseUtil(conf)
        .parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());
    Assert.assertEquals("121", result.getData().getMeta("width"));
    Assert.assertEquals("48", result.getData().getMeta("height"));
  }
}
示例3: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each sample file with the {@code parse-tika} extension and checks
 * that the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf yields -1 when the text is absent and 0 when the match sits at
    // the very beginning, so a presence check has to accept 0 as well.
    Assert.assertTrue("Expected text not found in " + sampleFiles[i],
        index >= 0);
  }
}
示例4: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each sample file with the {@code parse-zip} extension and checks
 * that the extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content)
        .get(content.getUrl());
    // assertEquals shows expected vs. actual text when the comparison fails,
    // unlike assertTrue(actual.equals(expected)).
    Assert.assertEquals("Unexpected text parsed from " + sampleFiles[i],
        expectedText, parse.getText());
  }
}
示例5: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Runs the {@code parse-ext} extension against the shared {@code content}
 * ten times, alternating between the 'cat' and the 'md5sum' example content
 * types, and checks the parsed text each time. Returns without testing when
 * the {@code os.name} system property is not "linux" (case-insensitive).
 *
 * @throws ParseException declared by the signature for the parse calls
 */
public void testIt() throws ParseException {
  // now test only on linux platform
  final String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + osName + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  final Configuration conf = NutchConfiguration.create();

  // loop alternately, total 10*2 times of invoking external command
  int round = 0;
  while (round < 10) {
    // check external parser that does 'cat'
    content.setContentType("application/vnd.nutch.example.cat");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertEquals(expectedText, parse.getText());

    // check external parser that does 'md5sum'
    content.setContentType("application/vnd.nutch.example.md5sum");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertTrue(parse.getText().startsWith(expectedMD5sum));

    round++;
  }
}
示例6: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Fetches every sample file through the protocol resolved for its
 * {@code file:} URL, parses the fetched content, and checks that the
 * whitespace-normalized text equals the entry of {@code sampleTexts} at the
 * same index.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    // Collapse every run of blanks, tabs and line breaks into one space so
    // that layout differences do not affect the comparison.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    // assertEquals reports both texts on failure; assertTrue(equals) only
    // reports that the condition was false.
    assertEquals("Unexpected text parsed from " + sampleFiles[i], sampleTexts[i], text);
  }
}
示例7: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Currently a no-op: the whole body is commented out (see the Tika-748
 * reference below), so this method executes no statements and performs no
 * assertions.
 *
 * The disabled code would parse {@code rtfFile} with the {@code parse-tika}
 * extension and compare the trimmed text with a fixed sentence.
 *
 * @throws ProtocolException declared by the signature; cannot be thrown while
 *         the body is disabled
 * @throws ParseException declared by the signature; cannot be thrown while
 *         the body is disabled
 */
public void testIt() throws ProtocolException, ParseException {
/* Temporarily disabled - see Tika-748
String urlString;
Protocol protocol;
Content content;
Parse parse;
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
.get(content.getUrl());
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
String title = parse.getData().getTitle();
Metadata meta = parse.getData().getParseMeta();
// METADATA extraction is not yet supported in Tika
// assertEquals("test rft document", title);
// assertEquals("tests", meta.get(DublinCore.SUBJECT));
*/
}
示例8: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each sample file with the {@code parse-tika} extension and checks
 * that the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf yields -1 when the text is absent and 0 when the match sits at
    // the very beginning, so a presence check has to accept 0 as well.
    assertTrue("Expected text not found in " + sampleFiles[i], index >= 0);
  }
}
示例9: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses each sample file with the {@code parse-zip} extension and checks
 * that the extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());
    // assertEquals shows expected vs. actual text when the comparison fails,
    // unlike assertTrue(actual.equals(expected)).
    assertEquals("Unexpected text parsed from " + sampleFiles[i], expectedText, parse.getText());
  }
}
示例10: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Runs the {@code parse-ext} extension against the shared {@code content}
 * ten times, alternating between the 'cat' and the 'md5sum' example content
 * types, and checks the parsed text each time. Returns without testing when
 * the {@code os.name} system property is not "linux" (case-insensitive).
 *
 * @throws ParseException declared by the signature for the parse calls
 */
@Test
public void testIt() throws ParseException {
  // now test only on linux platform
  final String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + osName + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  final Configuration conf = NutchConfiguration.create();

  // loop alternately, total 10*2 times of invoking external command
  for (int remaining = 10; remaining > 0; remaining--) {
    // check external parser that does 'cat'
    content.setContentType("application/vnd.nutch.example.cat");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertEquals(expectedText, parse.getText());

    // check external parser that does 'md5sum'
    content.setContentType("application/vnd.nutch.example.md5sum");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
示例11: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses {@code rtfFile} with the {@code parse-tika} extension and checks the
 * trimmed text, the title and the Dublin Core subject of the result. The test
 * is skipped through {@code @Ignore}.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();
  final String location = "file:" + sampleDir + fileSeparator + rtfFile;

  final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
  final Content fetched = fetcher
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();
  final Parse result = new ParseUtil(conf)
      .parseByExtensionId("parse-tika", fetched)
      .get(fetched.getUrl());

  final String body = result.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog",
      body.trim());

  // Both values are read before either is asserted, as in the original.
  final String docTitle = result.getData().getTitle();
  final Metadata parseMeta = result.getData().getParseMeta();
  Assert.assertEquals("test rft document", docTitle);
  Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
示例12: getTextContent
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Fetches the named file from {@code sampleDir} through a {@code file:} URL,
 * parses it with the {@code parse-tika} extension and returns the parsed
 * text.
 *
 * @param fileName name of the file, appended to {@code sampleDir}
 * @return the text of the parse result
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
public String getTextContent(String fileName) throws ProtocolException,
    ParseException {
  final String location = "file:" + sampleDir + fileSeparator + fileName;
  final Content fetched = new ProtocolFactory(conf)
      .getProtocol(location)
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();
  return new ParseUtil(conf)
      .parseByExtensionId("parse-tika", fetched)
      .get(fetched.getUrl())
      .getText();
}
示例13: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Checks that the text returned by {@code getTextContent} for every sample
 * file starts with {@code expectedText}; the failure message includes the
 * text that was found.
 *
 * @throws ProtocolException propagated from {@code getTextContent}
 * @throws ParseException propagated from {@code getTextContent}
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String found = getTextContent(sampleFile);
    final boolean hasExpectedPrefix = found.startsWith(expectedText);
    Assert.assertTrue("text found : '" + found + "'", hasExpectedPrefix);
  }
}
示例14: testOpeningDocs
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Checks that {@code getTextContent} returns non-empty text for every entry
 * of {@code sampleDir} whose name ends with {@code .doc}.
 *
 * @throws ProtocolException propagated from {@code getTextContent}
 * @throws ParseException propagated from {@code getTextContent}
 */
@Test
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // File.list() returns null when the path is not a directory or an I/O error
  // occurs; fail with a clear message instead of a NullPointerException.
  Assert.assertNotNull("cannot list sample directory " + sampleDir, filenames);
  for (String filename : filenames) {
    if (!filename.endsWith(".doc")) {
      continue;
    }
    Assert.assertTrue("cann't read content of " + filename,
        getTextContent(filename).length() > 0);
  }
}
示例15: testIt
import org.apache.nutch.parse.ParseException; //导入依赖的package包/类
/**
 * Parses every sample file whose name starts with {@code ootest} with the
 * {@code parse-tika} extension and checks that some text was extracted. The
 * expected text and each extracted text are printed to standard output.
 *
 * @throws ProtocolException declared by the signature for the protocol calls
 * @throws ParseException declared by the signature for the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);
  System.out.println("Expected : " + expectedText);
  for (int i = 0; i < sampleFiles.length; i++) {
    // Filter first so no URL is built for files that are skipped anyway.
    if (!sampleFiles[i].startsWith("ootest")) {
      continue;
    }
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    // Check for null before normalizing: the original null test ran after
    // replaceAll()/trim() had already dereferenced the value, so it could
    // never fail.
    String rawText = parse.getText();
    Assert.assertNotNull("No text extracted from " + sampleFiles[i], rawText);
    String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
    // simply test for the presence of a text - the ordering of the elements
    // may differ from what was expected
    // in the previous tests
    Assert.assertTrue("Empty text extracted from " + sampleFiles[i],
        text.length() > 0);
    System.out.println("Found " + sampleFiles[i] + ": " + text);
  }
}