当前位置: 首页>>代码示例>>Java>>正文


Java CommonExtractors类代码示例

本文整理汇总了Java中de.l3s.boilerpipe.extractors.CommonExtractors的典型用法代码示例。如果您正苦于以下问题：Java CommonExtractors类的具体用法？Java CommonExtractors怎么用？Java CommonExtractors使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


CommonExtractors类属于de.l3s.boilerpipe.extractors包,在下文中一共展示了CommonExtractors类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
/**
 * Downloads the pages linked from saved search-result HTML files and stores both the raw HTML
 * and the boilerpipe-extracted article text.
 *
 * <p>Ids are taken from the first tab-separated column of {@code data/query.tsv}. For each id,
 * {@code data/googlerp/<id>.html} is scanned with {@code pattern} (not declared in this method;
 * assumed to be a field of the enclosing class whose capture group 1 is the target URL). Each
 * match is numbered with a two-digit counter and written to
 * {@code data/context/<id>-<nn>.html} and {@code data/context/<id>-<nn>.txt}. Matches whose two
 * output files already exist, or whose URL contains {@code "wikihow"} or {@code "google"}, are
 * skipped. Downloads run on a fixed pool of 10 threads.
 *
 * @param args unused
 * @throws InterruptedException if interrupted while waiting for the downloads to finish
 * @throws IOException if the query file or one of the search-result files cannot be read
 */
public static void main(String[] args) throws InterruptedException, IOException {
  List<String> lines = Files.readLines(new File("data/query.tsv"), Charsets.UTF_8);
  Set<String> ids = new HashSet<>();
  for (String line : lines) {
    ids.add(line.split("\t")[0]);
  }
  BoilerpipeExtractor extractor = CommonExtractors.ARTICLE_EXTRACTOR;
  ExecutorService es = Executors.newFixedThreadPool(10);
  try {
    System.out.println(ids.size());
    DecimalFormat df = new DecimalFormat("00");
    for (String id : ids) {
      String googleHtml = Files.toString(new File("data/googlerp", id + ".html"), Charsets.UTF_8);
      Matcher matcher = pattern.matcher(googleHtml);
      int count = 0;
      while (matcher.find()) {
        // The counter advances for every match, including skipped ones, so file names stay
        // stable across runs.
        count++;
        String docName = id + "-" + df.format(count);
        // check existence
        File docHtmlFile = new File("data/context", docName + ".html");
        File docTextFile = new File("data/context", docName + ".txt");
        if (docHtmlFile.exists() && docTextFile.exists()) {
          continue;
        }
        // get url
        String url = matcher.group(1);
        if (url.contains("wikihow") || url.contains("google")) {
          continue;
        }
        es.execute(() -> {
          System.out.println(id + " " + url);
          // download url
          try {
            String docHtml = Request.Get(url).connectTimeout(2000).socketTimeout(2000).execute()
                    .returnContent().asString();
            Files.write(docHtml, docHtmlFile, Charsets.UTF_8);
            String docText = extractor.getText(docHtml);
            Files.write(docText, docTextFile, Charsets.UTF_8);
          } catch (Exception e) {
            // Best effort: one failed page must not stop the others. Print which id/url
            // failed, because stack traces from concurrent tasks cannot be attributed otherwise.
            System.err.println("Failed to process " + id + " " + url + ": " + e);
            e.printStackTrace();
          }
        });
      }
    }
  } finally {
    // Always stop accepting work, even when reading an input file throws; otherwise the
    // pool's worker threads would keep the JVM alive indefinitely.
    es.shutdown();
  }
  if (!es.awaitTermination(5, TimeUnit.MINUTES)) {
    System.out.println("Timeout occurs for one or some concept retrieval service.");
  }
}
 
开发者ID:ziy,项目名称:pkb,代码行数:48,代码来源:ContextExtractor.java

示例2: downloadSearchResult

import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
/**
 * Downloads the pages linked from saved search-result HTML files and stores both the raw HTML
 * and the boilerpipe-extracted article text.
 *
 * <p>Ids are taken from the first tab-separated column of
 * {@code data/e2e-apkbc-suggested-query.tsv}. For each id,
 * {@code data/e2e-googlerp/<id>.html} is scanned for
 * {@code <a href="..." onmousedown="} anchors; each match is numbered with a two-digit counter
 * and written to {@code data/e2e-context/<id>-<nn>.html} and
 * {@code data/e2e-context/<id>-<nn>.txt}. Matches whose two output files already exist, or whose
 * URL contains {@code "wikihow"} or {@code "google"}, are skipped. Downloads run on a fixed pool
 * of 10 threads.
 *
 * @throws IOException if the query file or one of the search-result files cannot be read
 * @throws InterruptedException if interrupted while waiting for the downloads to finish
 * @throws BoilerpipeProcessingException declared for callers; not thrown by the visible code
 * @throws SAXException declared for callers; not thrown by the visible code
 * @throws URISyntaxException declared for callers; not thrown by the visible code
 */
public static void downloadSearchResult() throws IOException, BoilerpipeProcessingException,
        SAXException, URISyntaxException, InterruptedException {
  List<String> lines = Files.readLines(new File("data/e2e-apkbc-suggested-query.tsv"),
          Charsets.UTF_8);
  Set<String> ids = new HashSet<>();
  for (String line : lines) {
    ids.add(line.split("\t")[0]);
  }
  Pattern pattern = Pattern.compile("<a href=\"([^>\"]*)\" onmousedown=\"");
  BoilerpipeExtractor extractor = CommonExtractors.ARTICLE_EXTRACTOR;
  ExecutorService es = Executors.newFixedThreadPool(10);
  try {
    System.out.println(ids.size());
    DecimalFormat df = new DecimalFormat("00");
    for (String id : ids) {
      String googleHtml = Files.toString(new File("data/e2e-googlerp", id + ".html"),
              Charsets.UTF_8);
      Matcher matcher = pattern.matcher(googleHtml);
      int count = 0;
      while (matcher.find()) {
        // The counter advances for every match, including skipped ones, so file names stay
        // stable across runs.
        count++;
        String docName = id + "-" + df.format(count);
        // check existence
        File docHtmlFile = new File("data/e2e-context", docName + ".html");
        File docTextFile = new File("data/e2e-context", docName + ".txt");
        if (docHtmlFile.exists() && docTextFile.exists()) {
          continue;
        }
        // get url
        String url = matcher.group(1);
        if (url.contains("wikihow") || url.contains("google")) {
          continue;
        }
        es.execute(() -> {
          System.out.println(id + " " + url);
          // download url
          try {
            String docHtml = Request.Get(url).connectTimeout(2000).socketTimeout(2000).execute()
                    .returnContent().asString();
            Files.write(docHtml, docHtmlFile, Charsets.UTF_8);
            String docText = extractor.getText(docHtml);
            Files.write(docText, docTextFile, Charsets.UTF_8);
          } catch (Exception e) {
            // Best effort: one failed page must not stop the others. Print which id/url
            // failed, because stack traces from concurrent tasks cannot be attributed otherwise.
            System.err.println("Failed to process " + id + " " + url + ": " + e);
            e.printStackTrace();
          }
        });
      }
    }
  } finally {
    // Always stop accepting work, even when reading an input file throws; otherwise the
    // pool's worker threads would keep the JVM alive indefinitely.
    es.shutdown();
  }
  // NOTE(review): this waits only 5 seconds, while each request allows 2000 ms connect plus
  // 2000 ms socket timeouts, so queued downloads may still be running when this method
  // returns. Confirm the short wait is intended.
  if (!es.awaitTermination(5, TimeUnit.SECONDS)) {
    System.out.println("Timeout occurs for one or some concept retrieval service.");
  }
}
 
开发者ID:ziy,项目名称:pkb,代码行数:52,代码来源:AutomaticProceduralKnowledgeBaseConstructor.java

示例3: prepare

import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
/**
 * Initializes this bolt's state and starts its background threads.
 *
 * <p>Sets up the logger, the work queues, a pooled HTTP client with its request configuration,
 * and the extractor/estimator instances. It then starts one emitter thread and
 * {@code numOfFetchers} fetcher threads.
 *
 * @param conf topology configuration; not read by this method
 * @param context topology context; not read by this method
 * @param collector collector stored in {@code _collector} and handed to the emitter
 */
public void prepare(@SuppressWarnings("rawtypes") Map conf, TopologyContext context,
		OutputCollector collector) {

	_logger = Logger.getLogger(ArticleExtractionBolt.class);
	_collector = collector;

	// Queues: _queue is passed to the fetchers, _tupleQueue to the emitter.
	_queue = new LinkedBlockingQueue<WebPage>();
	_tupleQueue = new LinkedBlockingQueue<Object>();

	// Pooled connections: numOfFetchers in total, at most 10 per route.
	_cm = new PoolingHttpClientConnectionManager();
	_cm.setMaxTotal(numOfFetchers);
	_cm.setDefaultMaxPerRoute(10);

	_httpclient = HttpClients.custom().setConnectionManager(_cm).build();

	// Socket and connect timeouts for HTTP requests, both 30000.
	_requestConfig = RequestConfig.custom()
			.setSocketTimeout(30000)
			.setConnectTimeout(30000)
			.build();

	// Both fields use the article extractor. Original author's note: switching _extractor to
	// CommonExtractors.CANOLA_EXTRACTOR increases recall of the returned media items but
	// decreases precision.
	_articleExtractor = CommonExtractors.ARTICLE_EXTRACTOR;
	_extractor = CommonExtractors.ARTICLE_EXTRACTOR;

	_imageExtractor = ImageExtractor.INSTANCE;

	// Quality estimator of the article extraction process
	_estimator = SimpleEstimator.INSTANCE;

	// All fields above are assigned before any thread is started.
	_emitter = new Thread(new Emitter(_collector, _tupleQueue));
	_emitter.start();

	_fetchers = new ArrayList<Thread>(numOfFetchers);
	for (int started = 0; started < numOfFetchers; started++) {
		Thread worker = new Thread(new HttpFetcher(_queue));
		worker.start();
		_fetchers.add(worker);
	}
}
 
开发者ID:socialsensor,项目名称:storm-focused-crawler,代码行数:46,代码来源:ArticleExtractionBolt.java


注:本文中的de.l3s.boilerpipe.extractors.CommonExtractors类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。