本文整理汇总了Java中de.l3s.boilerpipe.document.TextDocument类的典型用法代码示例。如果您正苦于以下问题：Java TextDocument类的具体用法？Java TextDocument怎么用？Java TextDocument使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
TextDocument类属于de.l3s.boilerpipe.document包，在下文中一共展示了TextDocument类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Marks short text blocks that match one of the {@code PATTERNS_SHORT} patterns as
 * content and tags them with {@code DefaultLabels.ARTICLE_METADATA}.
 *
 * <p>Blocks with more than 10 words are skipped without being inspected.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block matched at least one pattern
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean anyMatch = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.getNumWords() > 10) {
            continue;
        }
        final String blockText = block.getText();
        for (Pattern pattern : PATTERNS_SHORT) {
            if (!pattern.matcher(blockText).find()) {
                continue;
            }
            // Every matching pattern re-applies the same flag and label to the block.
            anyMatch = true;
            block.setIsContent(true);
            block.addLabel(DefaultLabels.ARTICLE_METADATA);
        }
    }
    return anyMatch;
}
示例2: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Walks the text blocks from the end of the document towards the start and turns
 * trailing content blocks labelled {@code DefaultLabels.HEADING} into non-content.
 *
 * <p>Non-content blocks are stepped over. The walk stops at the first content block
 * that does not carry the heading label.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    final ListIterator<TextBlock> cursor = blocks.listIterator(blocks.size());
    boolean changes = false;
    while (cursor.hasPrevious()) {
        final TextBlock block = cursor.previous();
        if (!block.isContent()) {
            continue;
        }
        if (!block.hasLabel(DefaultLabels.HEADING)) {
            // First real (non-heading) content block seen from the end: nothing more to strip.
            break;
        }
        block.setIsContent(false);
        changes = true;
    }
    return changes;
}
示例3: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when {@code equalLabels} accepts the two
 * blocks' label collections, removing the merged block from the document's list.
 *
 * <p>Documents with fewer than two blocks are returned unchanged.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged and removed
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    boolean merged = false;
    while (rest.hasNext()) {
        final TextBlock candidate = rest.next();
        if (!equalLabels(anchor.getLabels(), candidate.getLabels())) {
            // Labels differ: the candidate becomes the new merge target.
            anchor = candidate;
            continue;
        }
        anchor.mergeNext(candidate);
        rest.remove();
        merged = true;
    }
    return merged;
}
示例4: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when both report exactly the same text
 * density (compared with {@code ==}), removing the merged block from the document's list.
 *
 * <p>Documents with fewer than two blocks are returned unchanged.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged and removed
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    while (rest.hasNext()) {
        final TextBlock candidate = rest.next();
        if (anchor.getTextDensity() == candidate.getTextDensity()) {
            anchor.mergeNext(candidate);
            rest.remove();
            merged = true;
        } else {
            // Densities differ: the candidate becomes the new merge target.
            anchor = candidate;
        }
    }
    return merged;
}
示例5: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes non-content text blocks to content when they carry any of the configured
 * {@code labels}.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent()) {
            continue;
        }
        for (String label : labels) {
            if (block.hasLabel(label)) {
                block.setIsContent(true);
                changes = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return changes;
}
示例6: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes content blocks whose word count is below {@code minWords} to non-content.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // Blocks that are already non-content are never inspected for word count.
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            changes = true;
        }
    }
    return changes;
}
示例7: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes content text blocks to non-content when they carry any of the configured
 * {@code labels}.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (!block.isContent()) {
            continue;
        }
        for (String label : labels) {
            if (block.hasLabel(label)) {
                block.setIsContent(false);
                changes = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return changes;
}
示例8: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Removes every non-content text block from the document, except blocks that carry
 * {@code labelToKeep} when that field is non-null.
 *
 * <p>The check uses the configured {@code labelToKeep} rather than the hard-coded
 * {@code DefaultLabels.TITLE}, so an instance configured with a different label keeps
 * the blocks it was asked to keep. Instances configured with {@code null} or with
 * {@code DefaultLabels.TITLE} behave exactly as before.
 *
 * <p>NOTE(review): assumes {@code labelToKeep} is a label string accepted by
 * {@code TextBlock.hasLabel} — confirm against the field declaration.
 *
 * @param doc the document whose text blocks are filtered in place
 * @return {@code true} if at least one block was removed
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    List<TextBlock> textBlocks = doc.getTextBlocks();
    boolean hasChanges = false;

    for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
        TextBlock tb = it.next();
        if (tb.isContent()) {
            continue;
        }
        // Spare boilerplate blocks that carry the configured label, if one is set.
        if (labelToKeep != null && tb.hasLabel(labelToKeep)) {
            continue;
        }
        it.remove();
        hasChanges = true;
    }

    return hasChanges;
}
示例9: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes content blocks to non-content when {@code getNumFullTextWords} reports fewer
 * than {@code minWords} words for them.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // The word count is only computed for blocks currently flagged as content.
        if (block.isContent() && getNumFullTextWords(block) < minWords) {
            block.setIsContent(false);
            changes = true;
        }
    }
    return changes;
}
示例10: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Marks every block from the end-of-text marker onwards as non-content.
 *
 * <p>The marker is the first block labelled {@code DefaultLabels.INDICATES_END_OF_TEXT}
 * that is reached once the running total of {@code getNumFullTextWords} over content
 * blocks (including the current one) is at least {@code minNumWords}. The marker block
 * itself and all later blocks are switched to non-content.
 *
 * @param doc the document whose text blocks are examined
 * @return {@code true} if the marker was found, i.e. at least one block was processed
 *         after it (regardless of that block's previous content flag)
 * @throws BoilerpipeProcessingException declared by the signature; nothing in this body
 *         throws it explicitly
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    boolean pastEndOfText = false;
    int contentWords = 0;

    for (TextBlock block : doc.getTextBlocks()) {
        final boolean endMarker = block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        if (block.isContent()) {
            contentWords += getNumFullTextWords(block);
        }
        if (!pastEndOfText && endMarker && contentWords >= minNumWords) {
            pastEndOfText = true;
        }
        if (pastEndOfText) {
            changes = true;
            block.setIsContent(false);
        }
    }
    return changes;
}
示例11: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Runs the fixed sequence of filters below over the document, in order, and reports
 * whether any of them returned {@code true}.
 *
 * <p>Results are combined without short-circuiting, so every stage runs even after an
 * earlier one has already reported a change.
 *
 * @param doc the document handed to every stage
 * @return {@code true} if at least one stage returned {@code true}
 * @throws BoilerpipeProcessingException if one of the stages throws it
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
示例12: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Extracts the article from an HTML document, keeping its basic HTML structure.
 *
 * <p>The document is parsed into a {@link TextDocument}, run through {@code extractor},
 * highlighted with an extracting {@code HTMLHighlighter} (highlight-only output), and
 * finally passed to {@code removeNotAllowedTags} together with {@code docUri}.
 *
 * @param htmlDoc the HTML document to extract the article from
 * @param docUri the URI of the document, for resolving the relative anchors in the
 *        document to absolute anchors
 * @param extractor the extractor applied to the parsed document
 * @return the result of {@code removeNotAllowedTags} on the highlighted HTML, or
 *         {@code null} if parsing, extraction or highlighting throws any exception
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
    final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance();
    hh.setOutputHighlightOnly(true);

    final String text;
    try {
        final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(doc);
        // A second InputSource is requested for highlighting rather than reusing the
        // one handed to the parser.
        final InputSource is = htmlDoc.toInputSource();
        text = hh.process(doc, is);
    } catch (Exception ex) {
        // Best-effort contract kept for existing callers: any failure yields null.
        return null;
    }

    return removeNotAllowedTags(text, docUri);
}
示例13: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses the media (picture, video) out of an HTML document given as a string.
 *
 * <p>The string is wrapped in an {@code HTMLDocument}, parsed into a {@link TextDocument},
 * run through {@code extractor}, and then handed to the
 * {@code process(TextDocument, InputSource)} overload, whose result is returned as is.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the list produced by the overload (documented as having size 0 if no media
 *         is found), or {@code null} if parsing, extraction or the overload throws any
 *         exception
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);

    try {
        final TextDocument tdoc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(tdoc);
        // A second InputSource is requested for the media pass rather than reusing the
        // one handed to the parser.
        final InputSource is = htmlDoc.toInputSource();
        return process(tdoc, is);
    } catch (Exception e) {
        // Best-effort contract kept for existing callers: any failure yields null.
        return null;
    }
}
示例14: extractHtml
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses an HTML record and copies its title, full text and, when {@code boilingEnabled}
 * is set, its extracted content into {@code doc}.
 *
 * <p>The input is read through a {@code BoundedInputStream} limited by {@code maxDocSize}.
 * Every {@code U+FFFF} character is stripped from the text and the boiled content before
 * they are stored.
 *
 * @param record the raw HTML input
 * @param doc the target that receives title, text and (optionally) boiled content
 * @throws TextExtractionException wrapping any {@code SAXException},
 *         {@code BoilerpipeProcessingException}, {@code IllegalArgumentException} or
 *         {@code ArrayIndexOutOfBoundsException} raised while parsing or extracting
 */
private void extractHtml(InputStream record, Document doc) throws TextExtractionException {
    try {
        final InputSource source = new InputSource(new BoundedInputStream(record, maxDocSize));
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();

        doc.setTitle(parsed.getTitle());
        doc.setText(parsed.getText(true, true).replace("\uFFFF", ""));

        if (!boilingEnabled) {
            return;
        }
        // The extractor runs only after the unfiltered text has been captured above.
        DefaultExtractor.INSTANCE.process(parsed);
        doc.setBoiled(parsed.getContent().replace("\uFFFF", ""));
    } catch (SAXException | BoilerpipeProcessingException | IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
        throw new TextExtractionException(e);
    }
}
示例15: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the retrieved HTML
 * using the specified {@link BoilerpipeExtractor}, and hands the processed
 * {@link TextDocument} together with the original HTML document to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return the list produced by the overload, described by the original documentation
 *         as a list of enclosed links
 * @throws IOException propagated from the fetch, parse or overload calls below
 * @throws BoilerpipeProcessingException propagated from the parse, extract or overload
 *         calls below
 * @throws SAXException propagated from the parse or overload calls below
 */
public List<String> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument htmlDoc = HTMLFetcher.fetch(url);

    final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource())
            .getTextDocument();
    extractor.process(doc);

    // A second InputSource is requested for the overload rather than reusing the one
    // handed to the parser.
    final InputSource is = htmlDoc.toInputSource();
    return process(doc, is);
}