本文整理汇总了Java中de.l3s.boilerpipe.BoilerpipeProcessingException类的典型用法代码示例。如果您正苦于以下问题：Java BoilerpipeProcessingException类的具体用法？Java BoilerpipeProcessingException怎么用？Java BoilerpipeProcessingException使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
BoilerpipeProcessingException类属于de.l3s.boilerpipe包,在下文中一共展示了BoilerpipeProcessingException类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Marks short text blocks whose text matches any pattern in
 * {@code PATTERNS_SHORT} as content and tags them with
 * {@code DefaultLabels.ARTICLE_METADATA}.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block matched a pattern
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // Only blocks with at most 10 words are candidates.
        if (block.getNumWords() <= 10) {
            final String blockText = block.getText();
            for (Pattern pattern : PATTERNS_SHORT) {
                final boolean hit = pattern.matcher(blockText).find();
                if (hit) {
                    modified = true;
                    block.setIsContent(true);
                    block.addLabel(DefaultLabels.ARTICLE_METADATA);
                }
            }
        }
    }
    return modified;
}
示例2: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Walks the text blocks from the end of the document towards the start and
 * demotes content blocks labelled {@code DefaultLabels.HEADING} to
 * non-content. The walk stops at the first content block that is not a
 * heading; non-content blocks are skipped.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was demoted
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    final ListIterator<TextBlock> cursor = blocks.listIterator(blocks.size());
    boolean modified = false;
    while (cursor.hasPrevious()) {
        final TextBlock current = cursor.previous();
        if (!current.isContent()) {
            continue;
        }
        if (!current.hasLabel(DefaultLabels.HEADING)) {
            // First real (non-heading) content block from the end: done.
            break;
        }
        current.setIsContent(false);
        modified = true;
    }
    return modified;
}
示例3: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when {@code equalLabels}
 * reports that both carry the same labels. A merged block is removed from
 * the document's block list.
 *
 * @param doc the document whose text blocks are fused
 * @return {@code true} if at least one pair of blocks was merged;
 *         {@code false} also when the document has fewer than two blocks
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    // Start at the second block; the first one is the initial merge target.
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    while (rest.hasNext()) {
        final TextBlock candidate = rest.next();
        if (!equalLabels(anchor.getLabels(), candidate.getLabels())) {
            anchor = candidate;
            continue;
        }
        anchor.mergeNext(candidate);
        rest.remove();
        merged = true;
    }
    return merged;
}
示例4: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when both report the same
 * text density. A merged block is removed from the document's block list.
 *
 * @param doc the document whose text blocks are fused
 * @return {@code true} if at least one pair of blocks was merged;
 *         {@code false} also when the document has fewer than two blocks
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    while (rest.hasNext()) {
        final TextBlock candidate = rest.next();
        // Densities are compared with exact equality (==), not a tolerance.
        if (anchor.getTextDensity() == candidate.getTextDensity()) {
            anchor.mergeNext(candidate);
            rest.remove();
            merged = true;
        } else {
            anchor = candidate;
        }
    }
    return merged;
}
示例5: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Promotes every non-content text block that carries at least one of the
 * configured {@code labels} to content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was promoted
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean promoted = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent()) {
            continue;
        }
        for (String candidate : labels) {
            if (block.hasLabel(candidate)) {
                block.setIsContent(true);
                promoted = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return promoted;
}
示例6: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Demotes content blocks with fewer than {@code minWords} words to
 * non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was demoted
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean demoted = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // The word count is only consulted for blocks currently marked as content.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demoted = true;
        }
    }
    return demoted;
}
示例7: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Demotes every content text block that carries at least one of the
 * configured {@code labels} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was demoted
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean demoted = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (!block.isContent()) {
            continue;
        }
        for (String candidate : labels) {
            if (block.hasLabel(candidate)) {
                block.setIsContent(false);
                demoted = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return demoted;
}
示例8: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Removes all non-content text blocks from the document, except blocks that
 * carry the configured {@code labelToKeep}.
 *
 * <p>When {@code labelToKeep} is {@code null}, every non-content block is
 * removed. The previous implementation checked for a configured label but
 * then always tested {@code DefaultLabels.TITLE}, so any other configured
 * label was silently ignored; the configured label is now the one tested.
 * Behavior is unchanged when {@code labelToKeep} is {@code null} or equal to
 * {@code DefaultLabels.TITLE}.
 *
 * <p>NOTE(review): assumes {@code labelToKeep} has the label type accepted
 * by {@code TextBlock.hasLabel} - confirm against the field declaration.
 *
 * @param doc the document whose text blocks are filtered
 * @return {@code true} if at least one block was removed
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean hasChanges = false;
    for (Iterator<TextBlock> it = doc.getTextBlocks().iterator(); it.hasNext();) {
        final TextBlock tb = it.next();
        if (tb.isContent()) {
            continue;
        }
        // Spare boilerplate blocks that carry the label this filter was configured to keep.
        if (labelToKeep != null && tb.hasLabel(labelToKeep)) {
            continue;
        }
        it.remove();
        hasChanges = true;
    }
    return hasChanges;
}
示例9: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Demotes content blocks for which {@code getNumFullTextWords} returns
 * fewer than {@code minWords} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was demoted
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean demoted = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // The full-text word count is only computed for blocks currently marked as content.
        if (block.isContent() && getNumFullTextWords(block) < minWords) {
            block.setIsContent(false);
            demoted = true;
        }
    }
    return demoted;
}
示例10: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Marks every block from the detected end of text onwards as non-content.
 *
 * <p>The end of text is the first block labelled
 * {@code DefaultLabels.INDICATES_END_OF_TEXT} that is reached after at least
 * {@code minNumWords} full-text words (as counted by
 * {@code getNumFullTextWords} over content blocks, including the current
 * one) have been seen. That block and all following blocks are demoted.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if an end of text was found, i.e. at least one block
 *         was set to non-content
 * @throws BoilerpipeProcessingException declared by the method signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    int contentWords = 0;
    // Latches to true at the end-of-text block and never resets.
    boolean pastEnd = false;
    for (TextBlock current : doc.getTextBlocks()) {
        final boolean marksEnd = current
                .hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        if (current.isContent()) {
            contentWords += getNumFullTextWords(current);
        }
        pastEnd = pastEnd || (marksEnd && contentWords >= minNumWords);
        if (pastEnd) {
            current.setIsContent(false);
        }
    }
    return pastEnd;
}
示例11: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Runs the full chain of filters and classifiers over the document, in a
 * fixed order, and reports whether any of them changed it.
 *
 * <p>Every step is always executed: the results are combined with the
 * non-short-circuiting boolean OR so that an earlier {@code true} never
 * skips a later filter.
 *
 * @param doc the document to process
 * @return {@code true} if at least one step reported a change
 * @throws BoilerpipeProcessingException if any step throws it
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
示例12: testBoilerpipe
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Extracts text from {@code _html} with {@code ArticleExtractor} and then
 * with {@code DefaultExtractor}, printing each result to standard output
 * between BEGIN/END marker lines. Makes no assertions on the output.
 *
 * @throws IOException declared by the method signature
 * @throws BoilerpipeProcessingException if either extractor fails
 */
@Test
public void testBoilerpipe() throws IOException, BoilerpipeProcessingException{
    final String articleText = ArticleExtractor.getInstance().getText(_html);
    System.out.println("=== BEGIN BOILERPIPE OUTPUT 1 ===");
    System.out.println(articleText);
    System.out.println("=== END BOILERPIPE OUTPUT 1 ===");

    final String defaultText = DefaultExtractor.getInstance().getText(_html);
    System.out.println("=== BEGIN BOILERPIPE OUTPUT 2 ===");
    System.out.println(defaultText);
    System.out.println("=== END BOILERPIPE OUTPUT 2 ===");
}
示例13: main
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Entry point: calls {@code generateFeatureFiles} with fixed paths under
 * {@code data/} (one corpus file, two feature files and one ids file).
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) throws IOException, InterruptedException,
        XPathExpressionException, ParserConfigurationException, SAXException,
        ClassNotFoundException, URISyntaxException, BoilerpipeProcessingException {
    final String corpusPath = "data/classify-apkbc-corpus.tsv";
    final String summaryFeaturesPath = "data/classify-apkbc-mallet-summary.features";
    final String explanationFeaturesPath = "data/classify-apkbc-mallet-explanation.features";
    final String idsPath = "data/classify-apkbc-mallet.ids";
    generateFeatureFiles(corpusPath, summaryFeaturesPath, explanationFeaturesPath, idsPath);
}
示例14: extractHtml
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Parses HTML from {@code record} and copies its title and text into
 * {@code doc}; when {@code boilingEnabled} is set, also runs
 * {@code DefaultExtractor} and stores the extracted content via
 * {@code doc.setBoiled}.
 *
 * <p>The input is read through a {@code BoundedInputStream} constructed with
 * {@code maxDocSize}. All U+FFFF characters are stripped from the text and
 * the boiled content before they are stored.
 *
 * @param record the raw HTML input stream
 * @param doc the target document that receives title, text and boiled content
 * @throws TextExtractionException wrapping any {@code SAXException},
 *         {@code BoilerpipeProcessingException},
 *         {@code IllegalArgumentException} or
 *         {@code ArrayIndexOutOfBoundsException} raised while extracting
 */
private void extractHtml(InputStream record, Document doc) throws TextExtractionException {
    try {
        final BoundedInputStream bounded = new BoundedInputStream(record, maxDocSize);
        final InputSource source = new InputSource(bounded);
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();

        doc.setTitle(parsed.getTitle());

        final String fullText = parsed.getText(true, true);
        doc.setText(fullText.replace("\uFFFF", ""));

        if (boilingEnabled) {
            DefaultExtractor.INSTANCE.process(parsed);
            final String boiled = parsed.getContent();
            doc.setBoiled(boiled.replace("\uFFFF", ""));
        }
    } catch (SAXException | BoilerpipeProcessingException | IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
        throw new TextExtractionException(e);
    }
}
示例15: process
import de.l3s.boilerpipe.BoilerpipeProcessingException; //导入依赖的package包/类
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, parses the
 * retrieved HTML into a {@link TextDocument}, runs the specified
 * {@link BoilerpipeExtractor} on it and then delegates to
 * {@code process(doc, is)} with the processed document and a second
 * {@link InputSource} of the original HTML.
 *
 * @param url the location of the HTML document to fetch
 * @param extractor the extractor applied to the parsed document
 * @return A List of enclosed links
 * @throws IOException declared by the method signature
 * @throws BoilerpipeProcessingException if the extractor fails
 * @throws SAXException declared by the method signature
 */
public List<String> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);
    final InputSource parseSource = fetched.toInputSource();
    final TextDocument doc = new BoilerpipeSAXInput(parseSource).getTextDocument();
    extractor.process(doc);
    // A second InputSource is obtained for the delegate call instead of reusing the parsed one.
    return process(doc, fetched.toInputSource());
}