本文整理汇总了Java中org.apache.nutch.parse.ParseUtil类的典型用法代码示例。如果您正苦于以下问题:Java ParseUtil类的具体用法?Java ParseUtil怎么用?Java ParseUtil使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
ParseUtil类属于org.apache.nutch.parse包,在下文中一共展示了ParseUtil类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testMetaHTMLParsing
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses every test document and checks that the language stored in its
 * parse metadata equals the corresponding entry of {@code metalanguages}.
 * Any exception is printed to standard output and turned into a test failure.
 */
@Test
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // docs[n] is expected to yield the language in metalanguages[n]
    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
      Content doc = getContent(docs[docIndex]);
      Parse parsed = parseUtil.parse(doc).get(doc.getUrl());
      Assert.assertEquals(
          metalanguages[docIndex],
          (String) parsed.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception failure) {
    failure.printStackTrace(System.out);
    Assert.fail(failure.toString());
  }
}
示例2: pageTest
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Reads {@code file} into memory, parses it as {@code text/html} content and
 * checks the license-related entries of the resulting parse metadata.
 *
 * @param file
 *          file whose bytes are parsed
 * @param url
 *          passed as both URL arguments of the {@code Content} constructor
 * @param license
 *          expected value of the "License-Url" metadata entry
 * @param location
 *          expected value of the "License-Location" metadata entry
 * @param type
 *          expected value of the "Work-Type" metadata entry
 * @throws Exception
 *           if the file cannot be read or parsing fails
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {
  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int read;
    while ((read = in.read(buffer)) != -1) {
      out.write(buffer, 0, read);
    }
  } finally {
    // close the stream even when read() throws, so the file handle is not leaked
    in.close();
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(),
      conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
示例3: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches each sample file through its protocol, parses it and compares the
 * whitespace-normalized text with the matching entry of {@code sampleTexts}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    // collapse every run of blanks, tabs and line breaks into one space
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    // assertEquals reports expected and actual text when the comparison fails
    Assert.assertEquals(sampleTexts[i], text);
  }
}
示例4: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks the
 * "width" and "height" values reported by the parse data.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int fileIndex = 0; fileIndex < sampleFiles.length; fileIndex++) {
    String sampleUrl = "file:" + sampleDir + fileSeparator
        + sampleFiles[fileIndex];

    // a fresh configuration is created for every sample file
    Configuration conf = NutchConfiguration.create();
    Protocol fetcher = new ProtocolFactory(conf).getProtocol(sampleUrl);
    Content fetched = fetcher.getProtocolOutput(new Text(sampleUrl),
        new CrawlDatum()).getContent();

    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parsed = parseUtil.parseByExtensionId("parse-tika", fetched).get(
        fetched.getUrl());

    Assert.assertEquals("121", parsed.getData().getMeta("width"));
    Assert.assertEquals("48", parsed.getData().getMeta("height"));
  }
}
示例5: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks that the
 * extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf returns -1 when absent; 0 is a valid hit at the very start of
    // the text, so it must not be treated as a failure
    Assert.assertTrue("expected text not found in " + urlString, index >= 0);
  }
}
示例6: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-zip" extension and checks that the
 * extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content)
        .get(content.getUrl());
    // assertEquals reports expected and actual text when the comparison fails
    Assert.assertEquals(expectedText, parse.getText());
  }
}
示例7: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension against the shared {@code content} ten
 * times, alternating between the 'cat' and 'md5sum' content types, and checks
 * the text each run produces. Returns without testing when the OS is not
 * linux.
 *
 * @throws ParseException
 *           if parsing fails
 */
public void testIt() throws ParseException {
  // now test only on linux platform
  String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + System.getProperty("os.name") + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  Configuration conf = NutchConfiguration.create();
  final int rounds = 10;
  // each round invokes the external command twice: once per content type
  for (int round = 0; round < rounds; round++) {
    // external parser that does 'cat'
    String catType = "application/vnd.nutch.example.cat";
    content.setContentType(catType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertEquals(expectedText, parse.getText());

    // external parser that does 'md5sum'
    String md5Type = "application/vnd.nutch.example.md5sum";
    content.setContentType(md5Type);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
示例8: testMetaHTMLParsing
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses every test document and checks that the language stored in its
 * parse metadata equals the corresponding entry of {@code metalanguages}.
 * Any exception is printed to standard output and turned into a test failure.
 */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // docs[n] is expected to yield the language in metalanguages[n]
    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
      Content doc = getContent(docs[docIndex]);
      Parse parsed = parseUtil.parse(doc).get(doc.getUrl());
      assertEquals(
          metalanguages[docIndex],
          (String) parsed.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception failure) {
    failure.printStackTrace(System.out);
    fail(failure.toString());
  }
}
示例9: pageTest
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Reads {@code file} into memory, parses it as {@code text/html} content and
 * checks the license-related entries of the resulting parse metadata.
 *
 * @param file
 *          file whose bytes are parsed
 * @param url
 *          passed as both URL arguments of the {@code Content} constructor
 * @param license
 *          expected value of the "License-Url" metadata entry
 * @param location
 *          expected value of the "License-Location" metadata entry
 * @param type
 *          expected value of the "Work-Type" metadata entry
 * @throws Exception
 *           if the file cannot be read or parsing fails
 */
public void pageTest(File file, String url,
    String license, String location, String type)
    throws Exception {
  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int read;
    while ((read = in.read(buffer)) != -1) {
      out.write(buffer, 0, read);
    }
  } finally {
    // close the stream even when read() throws, so the file handle is not leaked
    in.close();
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
示例10: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches each sample file through its protocol, parses it and compares the
 * whitespace-normalized text with the matching entry of {@code sampleTexts}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    // collapse every run of blanks, tabs and line breaks into one space
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    // assertEquals reports expected and actual text when the comparison fails
    assertEquals(sampleTexts[i], text);
  }
}
示例11: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks that the
 * extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
public void testIt() throws ProtocolException, ParseException {
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Configuration conf = NutchConfiguration.create();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf returns -1 when absent; 0 is a valid hit at the very start of
    // the text, so it must not be treated as a failure
    assertTrue("expected text not found in " + urlString, index >= 0);
  }
}
示例12: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches and parses the single sample file, then checks that the
 * "metatag.description" and "metatag.keywords" parse metadata entries hold
 * the expected values. Any exception is printed and turned into a test
 * failure.
 */
public void testIt() {
  Configuration conf = NutchConfiguration.create();
  String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    ProtocolFactory protocols = new ProtocolFactory(conf);
    Protocol fetcher = protocols.getProtocol(sampleUrl);
    Content fetched = fetcher
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();

    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parsed = parseUtil.parse(fetched).get(fetched.getUrl());

    // the metatag values must come back unchanged
    Metadata meta = parsed.getData().getParseMeta();
    assertEquals(description, meta.get("metatag.description"));
    assertEquals(keywords, meta.get("metatag.keywords"));
  } catch (Exception failure) {
    failure.printStackTrace();
    fail(failure.toString());
  }
}
示例13: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-zip" extension and checks that the
 * extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException
 *           if fetching a sample file fails
 * @throws ParseException
 *           if parsing a sample file fails
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content)
        .get(content.getUrl());
    // assertEquals reports expected and actual text when the comparison fails
    assertEquals(expectedText, parse.getText());
  }
}
示例14: configure
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * <p>
 * Stores the job configuration and builds the URL filters, scoring filters,
 * parse util and URL normalizers (fetcher scope) from it. Also reads the
 * "db.fetch.interval.default" setting into {@code interval}.
 * </p>
 *
 * @param job
 *          The job configuration.
 */
public void configure(JobConf job) {
  this.jobConf = job;

  // every helper below is built from the same job configuration
  this.urlFilters = new URLFilters(job);
  this.scfilters = new ScoringFilters(job);
  this.parseUtil = new ParseUtil(job);
  this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);

  // falls back to 2592000 when the property is not set
  interval = job.getInt("db.fetch.interval.default", 2592000);
}
示例15: testIt
import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension against the shared {@code content} ten
 * times, alternating between the 'cat' and 'md5sum' content types, and checks
 * the text each run produces. Returns without testing when the OS is not
 * linux.
 *
 * @throws ParseException
 *           if parsing fails
 */
@Test
public void testIt() throws ParseException {
  // now test only on linux platform
  String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + System.getProperty("os.name") + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  Configuration conf = NutchConfiguration.create();
  final int rounds = 10;
  // each round invokes the external command twice: once per content type
  for (int round = 0; round < rounds; round++) {
    // external parser that does 'cat'
    String catType = "application/vnd.nutch.example.cat";
    content.setContentType(catType);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertEquals(expectedText, parse.getText());

    // external parser that does 'md5sum'
    String md5Type = "application/vnd.nutch.example.md5sum";
    content.setContentType(md5Type);
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}