本文整理汇总了Java中de.l3s.boilerpipe.extractors.ArticleExtractor类的典型用法代码示例。如果您正苦于以下问题:Java ArticleExtractor类的具体用法?Java ArticleExtractor怎么用?Java ArticleExtractor使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
ArticleExtractor类属于de.l3s.boilerpipe.extractors包,在下文中一共展示了ArticleExtractor类的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getPlaintext
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Extracts the article text from an HTML string using {@code ArticleExtractor}.
 *
 * <p>If anything is thrown during extraction, the throwable and its causes are
 * logged at SEVERE level (one log entry per link of the cause chain, at most
 * nine entries) and a fallback string is returned instead.
 *
 * @param htmltext the HTML markup to extract from
 * @return the extracted text, or an error notice followed by the original
 *         {@code htmltext} when extraction fails
 */
@Override
public String getPlaintext(final String htmltext) {
    try {
        return ArticleExtractor.getInstance().getText(htmltext);
    } catch (Throwable failure) {
        // Walk down the cause chain; the depth counter caps the walk at nine entries.
        Throwable current = failure;
        int depth = 1;
        while (current != null && depth < 10) {
            final String message = String.format(
                    "Failed to get plaintext from while '%s' (%d %s:%s).",
                    StringUtils.abbreviate(htmltext, 100),
                    depth,
                    current.getClass().getName(),
                    current.getMessage());
            LOG.log(Level.SEVERE, message, current);
            current = current.getCause();
            depth++;
        }
        return "Failed to get plaintext content \n" + htmltext;
    }
}
示例2: testBoilerpipe
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Runs the {@code _html} fixture through {@code ArticleExtractor} and then
 * {@code DefaultExtractor}, printing each result to standard output between
 * BEGIN/END banner lines. Makes no assertions; the output is for manual inspection.
 *
 * @throws IOException declared by the test signature
 * @throws BoilerpipeProcessingException if either extractor fails
 */
@Test
public void testBoilerpipe() throws IOException, BoilerpipeProcessingException {
    // Each extraction happens before its banner is printed, so a failing
    // extractor leaves no dangling BEGIN line in the output.
    final String articleText = ArticleExtractor.getInstance().getText(_html);
    System.out.println("=== BEGIN BOILERPIPE OUTPUT 1 ===");
    System.out.println(articleText);
    System.out.println("=== END BOILERPIPE OUTPUT 1 ===");

    final String defaultText = DefaultExtractor.getInstance().getText(_html);
    System.out.println("=== BEGIN BOILERPIPE OUTPUT 2 ===");
    System.out.println(defaultText);
    System.out.println("=== END BOILERPIPE OUTPUT 2 ===");
}
示例3: receive
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Builds the message handler for this actor.
 *
 * <p>For every {@code ParseHtmlArticle} message, the article text is extracted
 * from the message's {@code htmlString} and sent back to the sender wrapped in
 * an {@code ArticleBody} together with the message's {@code uri}.
 *
 * @return the partial function produced by the receive builder
 */
public PartialFunction receive() {
    return ReceiveBuilder
            .match(ParseHtmlArticle.class, request -> {
                final String articleText = ArticleExtractor.INSTANCE.getText(request.htmlString);
                sender().tell(new ArticleBody(request.uri, articleText), self());
            })
            .build();
}
示例4: fetch
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Get a page from the Internet and clean it.
 *
 * <p>The first four characters of {@code key} are stripped before the remainder
 * is handed to the fetcher; the fetched bytes are then decoded and reduced to
 * the article text by {@code ArticleExtractor}.
 *
 * @param key the lookup key; everything after its first four characters is
 *            passed to the fetcher
 * @return the extracted article text, or an empty string if fetching or
 *         extraction fails
 */
private String fetch(String key) {
    try {
        byte[] payload = fetcher.fetch(key.substring(4)).getContent();
        // Decode with an explicit charset so the result does not depend on the
        // JVM's platform default encoding.
        // NOTE(review): UTF-8 is assumed here; if the fetch result exposes the
        // response charset, prefer that instead.
        InputStreamReader isr = new InputStreamReader(
                new ByteArrayInputStream(payload),
                java.nio.charset.StandardCharsets.UTF_8);
        return ArticleExtractor.INSTANCE.getText(isr);
    } catch (BaseFetchException | BoilerpipeProcessingException e) {
        // Best effort: report the failure together with its cause and fall back
        // to an empty result so callers can continue.
        System.err.println("Can't connect to " + key + ": " + e);
        return "";
    }
}
示例5: ArticleFetcher
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Creates a fetcher bound to one store path and one URL string.
 *
 * <p>A fresh {@code ArticleExtractor} and a highlighting {@code HTMLHighlighter}
 * are created for each instance.
 *
 * @param storePath value stored in the {@code storePath} field
 * @param urlstr    value stored in the {@code urlstr} field
 */
public ArticleFetcher(String storePath, String urlstr) {
    this.storePath = storePath;
    this.urlstr = urlstr;

    //TODO pass in these two objects when threadsafe
    article_extractor = new ArticleExtractor();
    html_highlighter = HTMLHighlighter.newHighlightingInstance();
}
示例6: ArticleHandeler
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Creates a handler that keeps its data under {@code storePath}.
 *
 * <p>Sets the {@code sun.net.http.allowRestrictedHeaders} system property to
 * {@code "true"}, records the store path, and attempts to create the store
 * directory (a failure to create it is not reported).
 *
 * @param storePath path of the directory to create and remember
 */
public ArticleHandeler(String storePath){
    System.setProperty("sun.net.http.allowRestrictedHeaders", "true");
    this.storePath = storePath;
    new File(storePath).mkdir();
    // NOTE(review): the previous version built an ArticleExtractor and an
    // HTMLHighlighter into local variables that were never used. They have been
    // removed as dead code; if this class has fields meant to hold them, assign
    // those fields here instead.
}
示例7: extract
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Extracts text from the page at the given URL.
 *
 * <p>When the URL string contains {@code "youtube"}, a
 * {@code KeepEverythingWithMinKWordsExtractor} configured with
 * {@code YOUTUBE_MIN_WORDS_TRESHOLD} is tried first and its result is returned
 * if it is non-empty. In every other case the text comes from
 * {@code ArticleExtractor}.
 *
 * @param url the address of the page to extract
 * @return the extracted text
 * @throws MalformedURLException if {@code url} cannot be parsed as a URL
 * @throws BoilerpipeProcessingException if extraction fails
 */
static public String extract(String url) throws MalformedURLException, BoilerpipeProcessingException {
    final URL pageUrl = new URL(url);

    final boolean mentionsYoutube = url.contains("youtube");
    if (mentionsYoutube) {
        final String youtubeText =
                new KeepEverythingWithMinKWordsExtractor(YOUTUBE_MIN_WORDS_TRESHOLD).getText(pageUrl);
        if (!youtubeText.isEmpty()) {
            return youtubeText;
        }
        // Empty result: fall through to the generic article extractor.
    }

    return ArticleExtractor.INSTANCE.getText(pageUrl);
}
示例8: run
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Waits for the pending HTML, extracts its article text into
 * {@code post.text_content}, and hands {@code post} to {@code return_future}.
 *
 * <p>If waiting or extraction fails, the exception is logged at SEVERE level
 * and {@code post.text_content} is set to an error marker containing the
 * exception message; {@code post} is still delivered.
 */
@Override
public void run() {
    try {
        final String html = request_future.get();
        post.text_content = ArticleExtractor.getInstance().getText(html);
    } catch (InterruptedException | ExecutionException | BoilerpipeProcessingException ex) {
        if (ex instanceof InterruptedException) {
            // Catching InterruptedException clears the interrupt flag; restore it
            // so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        Logger.getLogger(BoilerpipeHandler.class.getName()).log(Level.SEVERE, null, ex);
        post.text_content = "<execution error: " + ex.getMessage() + ">";
    }
    return_future.content(post);
}
示例9: check
import de.l3s.boilerpipe.extractors.ArticleExtractor; //导入依赖的package包/类
/**
 * Checks, for every GDELT event found under {@code BASE_DIR}, whether the
 * countries extracted from the event's source article include the country
 * codes of both of the event's actors, and logs the overall success rate.
 *
 * <p>Article text is taken from a file-system cache when present; otherwise the
 * page is fetched, reduced to article text with {@code ArticleExtractor}, and
 * cached. Texts shorter than 100 characters are skipped and count neither as a
 * success nor as a failure. Any exception while handling a single event is
 * logged as a warning and that event is skipped.
 *
 * @throws Exception if loading the events fails
 */
public void check() throws Exception {
    FileSystemCache cache = new FileSystemCache("gdelt-articles");
    List<GdeltEvent> events = GdeltCsv.allEvents(BASE_DIR);
    //TODO: run through events grabbing source text, running that through CLIFF, and checking results
    int mentionedSuccesses = 0;
    int mentionedFailures = 0;
    for (GdeltEvent event : events) {
        logger.debug("-------------------------------------------------------------------------------------------");
        logger.debug("Checking event "+event);
        try {
            URL url = event.getSourceUrl();
            // The URL string doubles as the cache key; compute it once.
            String urlKey = url.toString();
            String text;
            if (cache.contains(urlKey)) {
                text = cache.get(urlKey);
                logger.debug("  Fetched from cache:"+urlKey);
            } else {
                HTMLDocument htmlDoc = HTMLFetcher.fetch(url);
                TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
                text = ArticleExtractor.INSTANCE.getText(doc);
                cache.put(urlKey, text);
                logger.debug("Fetched from web:"+urlKey);
            }
            if (text.length() < 100) {
                logger.debug("  Skipping because it is too short");
                continue;   //assume we didn't fetch/extract it right
            }
            logger.debug(text);
            ExtractedEntities entities = ParseManager.extractAndResolve(text, true);
            List<CountryCode> countries = entities.getUniqueCountries();
            if (countries.contains(event.getActor1().getCountryCodeObj())
                    && countries.contains(event.getActor2().getCountryCodeObj())) {
                mentionedSuccesses++;
            } else {
                logger.error("  We found "+countries+" - GDELT Says:"+event.getActor1().getCountryCodeObj()+" and "+event.getActor2().getCountryCodeObj());
                mentionedFailures++;
            }
        } catch (Exception e) {
            // Best effort: one bad article must not abort the whole run.
            logger.warn("  Skipping url "+event.getSourceUrl()+" because "+e.toString());
        }
    }
    int checkedArticles = mentionedSuccesses + mentionedFailures;
    if (checkedArticles == 0) {
        // Without this guard the rate below would be 0/0 and be logged as NaN.
        logger.info("Checked 0 Articles - no mentions success rate to report");
        return;
    }
    double aboutnessSuccess = (double) mentionedSuccesses / (double) checkedArticles;
    logger.info("Checked "+checkedArticles+" Articles - Mentions success rate: "+aboutnessSuccess);
}