当前位置: 首页>>代码示例>>Java>>正文


Java ParseUtil类代码示例

本文整理汇总了Java中org.apache.nutch.parse.ParseUtil的典型用法代码示例。如果您正苦于以下问题：Java ParseUtil类的具体用法？Java ParseUtil怎么用？Java ParseUtil使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


ParseUtil类属于org.apache.nutch.parse包,在下文中一共展示了ParseUtil类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testMetaHTMLParsing

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Checks that, for every sample document, the value stored under
 * {@code Metadata.LANGUAGE} in the parse metadata matches the expected
 * language identifier at the same index.
 */
@Test
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // docs and metalanguages are walked in lockstep, one expectation per doc.
    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
      Content page = getContent(docs[docIndex]);
      Parse parsed = parseUtil.parse(page).get(page.getUrl());
      Assert.assertEquals(
          metalanguages[docIndex],
          (String) parsed.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception e) {
    // Any exception is reported as a test failure after dumping the trace.
    e.printStackTrace(System.out);
    Assert.fail(e.toString());
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestHTMLLanguageParser.java

示例2: pageTest

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Reads the given file fully, parses its bytes as {@code text/html} content
 * and asserts the license-related entries of the resulting parse metadata.
 *
 * @param file     file whose bytes are used as the page content
 * @param url      used both as the content URL and as its base URL
 * @param license  expected value of the "License-Url" metadata entry
 * @param location expected value of the "License-Location" metadata entry
 * @param type     expected value of the "Work-Type" metadata entry
 * @throws Exception if reading the file or parsing the content fails
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {

  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // Close the stream even when read() throws, so the file handle is not leaked.
    in.close();
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content = new Content(url, url, bytes, contentType, new Metadata(),
      conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:25,代码来源:TestCCParseFilter.java

示例3: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches each sample file through a {@code file:} URL, parses it with
 * {@code ParseUtil.parse} and compares the whitespace-normalized text with
 * the expected text at the same index.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();

    parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Collapse runs of blanks, tabs and line breaks so layout differences
    // in the extracted text do not affect the comparison.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    // assertEquals reports both strings on mismatch, unlike assertTrue(equals).
    Assert.assertEquals(sampleTexts[i], text);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestSWFParser.java

示例4: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks the
 * "width" and "height" entries of the parse data.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int fileIndex = 0; fileIndex < sampleFiles.length; fileIndex++) {
    String sampleUrl =
        "file:" + sampleDir + fileSeparator + sampleFiles[fileIndex];

    // A fresh configuration is created for every sample file.
    Configuration conf = NutchConfiguration.create();
    Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    Content fetched = fileProtocol
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();
    Parse parsed = new ParseUtil(conf)
        .parseByExtensionId("parse-tika", fetched)
        .get(fetched.getUrl());

    Assert.assertEquals("121", parsed.getData().getMeta("width"));
    Assert.assertEquals("48", parsed.getData().getMeta("height"));
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestImageMetadata.java

示例5: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks that
 * the extracted text contains the expected text.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());

    int index = parse.getText().indexOf(expectedText);
    // indexOf returns -1 when absent; a match at offset 0 is still a match.
    Assert.assertTrue(index >= 0);
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:22,代码来源:TestPdfParser.java

示例6: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-zip" extension and checks that the
 * extracted text equals the expected text.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(
        content.getUrl());
    // assertEquals reports both strings on mismatch, unlike assertTrue(equals).
    Assert.assertEquals(expectedText, parse.getText());
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:20,代码来源:TestZipParser.java

示例7: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension against the shared content, alternating
 * between the 'cat' and 'md5sum' example content types. Does nothing except
 * print a notice when the "os.name" system property is not "linux".
 *
 * @throws ParseException if reported while parsing the content
 */
public void testIt() throws ParseException {
  // Only run on Linux; on any other OS report and return.
  String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is "+System.getProperty("os.name")+".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  Configuration conf = NutchConfiguration.create();
  // Ten rounds, two external-command invocations per round.
  for (int round = 0; round < 10; round++) {
    // External parser that does 'cat': text must equal the expected text.
    content.setContentType("application/vnd.nutch.example.cat");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertEquals(expectedText, parse.getText());

    // External parser that does 'md5sum': text must start with the checksum.
    content.setContentType("application/vnd.nutch.example.md5sum");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:27,代码来源:TestExtParser.java

示例8: testMetaHTMLParsing

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Checks that, for every sample document, the value stored under
 * {@code Metadata.LANGUAGE} in the parse metadata matches the expected
 * language identifier at the same index.
 */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());
    // docs and metalanguages are walked in lockstep, one expectation per doc.
    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
      Content page = getContent(docs[docIndex]);
      Parse parsed = parseUtil.parse(page).get(page.getUrl());
      assertEquals(
          metalanguages[docIndex],
          (String) parsed.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception e) {
    // Any exception is reported as a test failure after dumping the trace.
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:20,代码来源:TestHTMLLanguageParser.java

示例9: pageTest

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Reads the given file fully, parses its bytes as {@code text/html} content
 * and asserts the license-related entries of the resulting parse metadata.
 *
 * @param file     file whose bytes are used as the page content
 * @param url      used both as the content URL and as its base URL
 * @param license  expected value of the "License-Url" metadata entry
 * @param location expected value of the "License-Location" metadata entry
 * @param type     expected value of the "Work-Type" metadata entry
 * @throws Exception if reading the file or parsing the content fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // Close the stream even when read() throws, so the file handle is not leaked.
    in.close();
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:26,代码来源:TestCCParseFilter.java

示例10: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches each sample file through a {@code file:} URL, parses it with
 * {@code ParseUtil.parse} and checks that the whitespace-normalized text
 * equals the expected text at the same index.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  for (int fileIndex = 0; fileIndex < sampleFiles.length; fileIndex++) {
    String sampleUrl =
        "file:" + sampleDir + fileSeparator + sampleFiles[fileIndex];

    Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    Content fetched = fileProtocol
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();

    Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());

    // Collapse runs of blanks, tabs and line breaks before comparing.
    String normalized = parsed.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[fileIndex].equals(normalized));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:20,代码来源:TestSWFParser.java

示例11: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-tika" extension and checks that
 * the extracted text contains the expected text.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    int index = parse.getText().indexOf(expectedText);
    // indexOf returns -1 when absent; a match at offset 0 is still a match.
    assertTrue(index >= 0);
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:19,代码来源:TestPdfParser.java

示例12: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Fetches the sample file through a {@code file:} URL, parses it and checks
 * the "metatag.description" and "metatag.keywords" parse metadata entries
 * against the expected values. Any exception fails the test.
 */
public void testIt() {
  Configuration conf = NutchConfiguration.create();
  String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    Content fetched = fileProtocol
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();

    Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());

    // Compare the extracted meta tags with the expected values.
    Metadata meta = parsed.getData().getParseMeta();
    assertEquals(description, meta.get("metatag.description"));
    assertEquals(keywords, meta.get("metatag.keywords"));
  } catch (Exception e) {
    // Report the exception as a test failure after dumping the trace.
    e.printStackTrace();
    fail(e.toString());
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:23,代码来源:TestMetatagParser.java

示例13: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Parses each sample file with the "parse-zip" extension and checks that the
 * extracted text equals the expected text.
 *
 * @throws ProtocolException if reported while obtaining the protocol or content
 * @throws ParseException    if reported while parsing the content
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  for (int fileIndex = 0; fileIndex < sampleFiles.length; fileIndex++) {
    String sampleUrl =
        "file:" + sampleDir + fileSeparator + sampleFiles[fileIndex];

    Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    Content fetched = fileProtocol
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();
    Parse parsed = new ParseUtil(conf)
        .parseByExtensionId("parse-zip", fetched)
        .get(fetched.getUrl());

    assertTrue(parsed.getText().equals(expectedText));
  }
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:17,代码来源:TestZipParser.java

示例14: configure

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * <p>
 * Stores the job configuration and builds the URL filters, scoring filters,
 * parse utility and fetcher-scope URL normalizers from it, then reads the
 * fetch interval setting.
 * </p>
 * 
 * @param job
 *          The job configuration.
 */
public void configure(JobConf job) {
  this.jobConf = job;

  // All helpers are constructed from the same configuration object.
  this.urlFilters = new URLFilters(job);
  this.scfilters = new ScoringFilters(job);
  this.parseUtil = new ParseUtil(job);
  this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);

  // 2592000 is the second argument handed to getInt for this key.
  interval = job.getInt("db.fetch.interval.default", 2592000);
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:21,代码来源:ArcSegmentCreator.java

示例15: testIt

import org.apache.nutch.parse.ParseUtil; //导入依赖的package包/类
/**
 * Runs the "parse-ext" extension against the shared content, alternating
 * between the 'cat' and 'md5sum' example content types. Does nothing except
 * print a notice when the "os.name" system property is not "linux".
 *
 * @throws ParseException if reported while parsing the content
 */
@Test
public void testIt() throws ParseException {
  // Only run on Linux; on any other OS report and return.
  String osName = System.getProperty("os.name");
  if (!osName.equalsIgnoreCase("linux")) {
    System.err.println("Current OS is " + System.getProperty("os.name") + ".");
    System.err.println("No test is run on OS other than linux.");
    return;
  }

  Configuration conf = NutchConfiguration.create();
  // Ten rounds, two external-command invocations per round.
  for (int round = 0; round < 10; round++) {
    // External parser that does 'cat': text must equal the expected text.
    content.setContentType("application/vnd.nutch.example.cat");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertEquals(expectedText, parse.getText());

    // External parser that does 'md5sum': text must start with the checksum.
    content.setContentType("application/vnd.nutch.example.md5sum");
    parse = new ParseUtil(conf)
        .parseByExtensionId("parse-ext", content)
        .get(content.getUrl());
    Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
  }
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:31,代码来源:TestExtParser.java


注:本文中的org.apache.nutch.parse.ParseUtil类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。