本文整理汇总了Java中de.l3s.boilerpipe.extractors.CommonExtractors类的典型用法代码示例。如果您正苦于以下问题:Java CommonExtractors类的具体用法?Java CommonExtractors怎么用?Java CommonExtractors使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
CommonExtractors类属于de.l3s.boilerpipe.extractors包,在下文中一共展示了CommonExtractors类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
public static void main(String[] args) throws InterruptedException, IOException {
    // Collect the distinct query ids from the first TSV column of data/query.tsv.
    Set<String> queryIds = new HashSet<>();
    for (String row : Files.readLines(new File("data/query.tsv"), Charsets.UTF_8)) {
        queryIds.add(row.split("\t")[0]);
    }
    BoilerpipeExtractor articleExtractor = CommonExtractors.ARTICLE_EXTRACTOR;
    ExecutorService pool = Executors.newFixedThreadPool(10);
    System.out.println(queryIds.size());
    DecimalFormat rankFormat = new DecimalFormat("00");
    for (String queryId : queryIds) {
        // Scan the saved Google result page for this query and pull out result links.
        String serp = Files.toString(new File("data/googlerp", queryId + ".html"), Charsets.UTF_8);
        Matcher linkMatcher = pattern.matcher(serp);
        int rank = 0;
        while (linkMatcher.find()) {
            rank++;
            File htmlOut = new File("data/context", queryId + "-" + rankFormat.format(rank) + ".html");
            File textOut = new File("data/context", queryId + "-" + rankFormat.format(rank) + ".txt");
            // Skip results that were already downloaded and extracted in a previous run.
            if (htmlOut.exists() && textOut.exists()) {
                continue;
            }
            String resultUrl = linkMatcher.group(1);
            // Ignore wikihow pages and Google-internal links.
            if (resultUrl.contains("wikihow") || resultUrl.contains("google")) {
                continue;
            }
            pool.execute(() -> {
                System.out.println(queryId + " " + resultUrl);
                try {
                    // Fetch the page, persist the raw HTML, then the boilerplate-free article text.
                    String pageHtml = Request.Get(resultUrl).connectTimeout(2000).socketTimeout(2000)
                        .execute().returnContent().asString();
                    Files.write(pageHtml, htmlOut, Charsets.UTF_8);
                    Files.write(articleExtractor.getText(pageHtml), textOut, Charsets.UTF_8);
                } catch (Exception e) {
                    // Best-effort crawl: report the failure and keep going with other downloads.
                    e.printStackTrace();
                }
            });
        }
    }
    pool.shutdown();
    if (!pool.awaitTermination(5, TimeUnit.MINUTES)) {
        System.out.println("Timeout occurs for one or some concept retrieval service.");
    }
}
示例2: downloadSearchResult
import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
/**
 * Downloads the pages linked from previously-saved Google result pages and stores,
 * per result, the raw HTML plus the Boilerpipe-extracted article text.
 *
 * <p>Reads query ids from the first TSV column of {@code data/e2e-apkbc-suggested-query.tsv},
 * scrapes result links out of {@code data/e2e-googlerp/<id>.html}, and writes
 * {@code data/e2e-context/<id>-NN.html} / {@code .txt}. Results already present on disk,
 * and wikihow/google links, are skipped. Downloads run on a 10-thread pool; individual
 * failures are logged and do not abort the run.
 *
 * @throws IOException          if reading the query file or a saved SERP fails
 * @throws InterruptedException if interrupted while awaiting pool termination
 */
public static void downloadSearchResult() throws IOException, BoilerpipeProcessingException,
    SAXException, URISyntaxException, InterruptedException {
    List<String> lines = Files.readLines(new File("data/e2e-apkbc-suggested-query.tsv"),
        Charsets.UTF_8);
    // Distinct query ids from the first TSV column.
    Set<String> ids = new HashSet<>();
    for (String line : lines) {
        ids.add(line.split("\t")[0]);
    }
    Pattern pattern = Pattern.compile("<a href=\"([^>\"]*)\" onmousedown=\"");
    BoilerpipeExtractor extractor = CommonExtractors.ARTICLE_EXTRACTOR;
    ExecutorService es = Executors.newFixedThreadPool(10);
    System.out.println(ids.size());
    DecimalFormat df = new DecimalFormat("00");
    for (String id : ids) {
        String googleHtml = Files.toString(new File("data/e2e-googlerp", id + ".html"),
            Charsets.UTF_8);
        Matcher matcher = pattern.matcher(googleHtml);
        int count = 0;
        while (matcher.find()) {
            count++;
            // Skip results already downloaded and extracted in a previous run.
            File docHtmlFile = new File("data/e2e-context", id + "-" + df.format(count) + ".html");
            File docTextFile = new File("data/e2e-context", id + "-" + df.format(count) + ".txt");
            if (docHtmlFile.exists() && docTextFile.exists()) {
                continue;
            }
            // First capture group is the result URL; ignore wikihow and Google-internal links.
            String url = matcher.group(1);
            if (url.contains("wikihow") || url.contains("google")) {
                continue;
            }
            es.execute(() -> {
                System.out.println(id + " " + url);
                try {
                    // Fetch with short timeouts, persist raw HTML, then the extracted text.
                    String docHtml = Request.Get(url).connectTimeout(2000).socketTimeout(2000).execute()
                        .returnContent().asString();
                    Files.write(docHtml, docHtmlFile, Charsets.UTF_8);
                    String docText = extractor.getText(docHtml);
                    Files.write(docText, docTextFile, Charsets.UTF_8);
                } catch (Exception e) {
                    // Best-effort: log and continue with the remaining downloads.
                    e.printStackTrace();
                }
            });
        }
    }
    es.shutdown();
    // Fix: wait 5 MINUTES, matching the companion crawler above; the previous 5-SECOND
    // wait was far too short for a 10-thread pool of network downloads (each request can
    // block up to ~4s on connect + socket timeouts alone) and silently cut the crawl off.
    if (!es.awaitTermination(5, TimeUnit.MINUTES)) {
        System.out.println("Timeout occurs for one or some concept retrieval service.");
    }
}
示例3: prepare
import de.l3s.boilerpipe.extractors.CommonExtractors; //导入依赖的package包/类
// Storm bolt lifecycle hook: initializes logging, HTTP connection pooling, the
// Boilerpipe extractors, and the background emitter/fetcher threads used by this bolt.
public void prepare(@SuppressWarnings("rawtypes") Map conf, TopologyContext context,
OutputCollector collector) {
_logger = Logger.getLogger(ArticleExtractionBolt.class);
_collector = collector;
// Work queues shared with the fetcher and emitter threads started below.
_queue = new LinkedBlockingQueue<WebPage>();
_tupleQueue = new LinkedBlockingQueue<Object>();
// Pooled connection manager: one connection per fetcher overall, max 10 per route.
_cm = new PoolingHttpClientConnectionManager();
_cm.setMaxTotal(numOfFetchers);
_cm.setDefaultMaxPerRoute(10);
_httpclient = HttpClients.custom()
.setConnectionManager(_cm)
.build();
// Set timeout parameters for Http requests (30s socket and connect timeouts).
_requestConfig = RequestConfig.custom()
.setSocketTimeout(30000)
.setConnectTimeout(30000)
.build();
// NOTE(review): _articleExtractor and _extractor are both set to ARTICLE_EXTRACTOR —
// possibly intentional (so _extractor can be swapped to CANOLA below), but confirm
// whether both fields are actually needed.
_articleExtractor = CommonExtractors.ARTICLE_EXTRACTOR;
_extractor = CommonExtractors.ARTICLE_EXTRACTOR;
// The use of Canola extractor increases recall of the returned media items but decreases precision.
//_extractor = CommonExtractors.CANOLA_EXTRACTOR;
_imageExtractor = ImageExtractor.INSTANCE;
// Quality estimator of the article extraction process
_estimator = SimpleEstimator.INSTANCE;
// Single emitter thread drains _tupleQueue and emits via the collector.
_emitter = new Thread(new Emitter(_collector, _tupleQueue));
_emitter.start();
// One HttpFetcher thread per configured fetcher slot, all consuming _queue.
_fetchers = new ArrayList<Thread>(numOfFetchers);
for(int i=0;i<numOfFetchers; i++) {
Thread fetcher = new Thread(new HttpFetcher(_queue));
fetcher.start();
_fetchers.add(fetcher);
}
}