本文整理汇总了Java中org.apache.pdfbox.text.PDFTextStripper.setEndPage方法的典型用法代码示例。如果您正苦于以下问题：Java PDFTextStripper.setEndPage方法的具体用法？Java PDFTextStripper.setEndPage怎么用？Java PDFTextStripper.setEndPage使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.pdfbox.text.PDFTextStripper的用法示例。
在下文中一共展示了PDFTextStripper.setEndPage方法的7个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Runs a {@code TitleExtractor} over the first page of the PDF and stores the title it reports.
 *
 * <p>The file named by {@code getFileNamePathWithExtension()} is loaded, page 1 is stripped
 * with position sorting enabled, and the extractor's {@code getTitle()} value is handed to
 * {@code setTitle(...)}. The stripped text itself is discarded.
 *
 * @return {@code true} whenever the method returns normally; failures are reported as
 *         exceptions, never as {@code false}
 * @throws IOException if the PDF cannot be loaded or processed
 */
private boolean process() throws IOException {
    TitleExtractor stripper = new TitleExtractor();
    try (PDDocument document = PDDocument.load(new File(this.getFileNamePathWithExtension()));
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1, not 0.
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        // The written text is thrown away; presumably TitleExtractor records the title
        // while the page is being processed - confirm against TitleExtractor.
        stripper.writeText(document, dummy);
        setTitle(stripper.getTitle());
    }
    return true;
}
示例2: process
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Runs an {@code AuthorExtractor} over the first page of the PDF and stores what it reports.
 *
 * <p>The file named by {@code getFileNamePathWithExtension()} is loaded, page 1 is stripped
 * with position sorting enabled, and the extractor's author names, affiliations and contacts
 * are copied into this object through the matching setters. The stripped text is discarded.
 *
 * @return {@code true} whenever the method returns normally; failures are reported as
 *         exceptions, never as {@code false}
 * @throws IOException if the PDF cannot be loaded or processed
 */
private boolean process() throws IOException {
    AuthorExtractor stripper = new AuthorExtractor();
    try (PDDocument document = PDDocument.load(new File(this.getFileNamePathWithExtension()));
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1, not 0.
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        // The written text is thrown away; presumably AuthorExtractor gathers its data
        // while the page is being processed - confirm against AuthorExtractor.
        stripper.writeText(document, dummy);
        setAuthorNames(stripper.getAuthorNames());
        setAuthorAffiliations(stripper.getAuthorAffiliations());
        setAuthorContacts(stripper.getAuthorContacts());
    }
    return true;
}
示例3: extractFromPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
* Reads the PDF file and extracts the content and table of contents to text files.
*/
/**
 * Reads the PDF file and extracts the content and table of contents to text files.
 *
 * <p>The text from page 15 onward is written to {@code MCV_TXT}; the text of pages 3-14 is
 * written to {@code MCV_TXT_TOC}. Page numbers are 1-based, as PDFTextStripper expects.
 *
 * @throws IOException if the PDF cannot be read or either text file cannot be written
 */
public static void extractFromPDF() throws IOException {
    // try-with-resources guarantees the document is closed even when stripping or writing fails.
    try (PDDocument document = PDDocument.load(new File(MCV_PDF))) {
        PDFTextStripper stripper = new PDFTextStripper();

        // Extract the main contents: only the start page is set, so stripping runs to the end.
        stripper.setStartPage(15);
        String text = stripper.getText(document);
        // NOTE(review): getBytes() uses the platform default charset - confirm that is intended.
        Files.write(Paths.get(MCV_TXT), text.getBytes());

        // Extract the table of contents
        stripper.setStartPage(3);
        stripper.setEndPage(14);
        text = stripper.getText(document);
        Files.write(Paths.get(MCV_TXT_TOC), text.getBytes());
    }
}
示例4: findSubwords
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Collects every occurrence of {@code searchTerm} inside the text chunks of a single page.
 *
 * <p>The page is stripped with position sorting enabled. Each chunk passed to
 * {@code writeString} is echoed to standard output and scanned for the term; overlapping
 * occurrences are reported because the scan resumes one character after each hit. A match
 * is only found when it lies entirely within one chunk.
 *
 * @param document   the document to search
 * @param page       1-based page number; used as both start and end page of the stripper
 * @param searchTerm the text to look for; an empty term yields no hits
 * @return the matched sub-sequences in the order they were encountered
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();
    // An empty term matches at every index and String.indexOf never returns -1 for it,
    // so the scan loop below would never terminate. Treat it as "nothing to find".
    if (searchTerm != null && searchTerm.isEmpty())
    {
        return hits;
    }
    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);
            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();
            int fromIndex = 0;
            int index;
            while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
            {
                hits.add(word.subSequence(index, index + searchTerm.length()));
                // Advance by one character only, so overlapping matches are also found.
                fromIndex = index + 1;
            }
            super.writeString(text, textPositions);
        }
    };
    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the call drives writeString, which fills hits.
    stripper.getText(document);
    return hits;
}
示例5: extractFromPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
* Reads the PDF file and extracts the SIMS contents to the internal sorted map.
*/
/**
 * Reads the PDF file and extracts the SIMS contents to the internal sorted map.
 *
 * <p>Only page {@code SIMS_PAGE} is read. The first {@code SIMS_SKIP} lines and all blank
 * lines are ignored. A line starting with {@code "S."} opens a new entry; any other line is
 * appended to the entry in progress. Each entry is stored in {@code entryMap} with the text
 * before its first space as key and the remainder as value.
 *
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public void extractFromPDF() throws IOException {
    // Reinitialize the sorted map
    entryMap = new TreeMap<String, String>();
    // Extract the SIMS contents into a list of strings
    logger.debug("Starting PDF extraction from " + SIMS_PDF);
    String text;
    // try-with-resources guarantees the document is closed even when extraction fails.
    try (PDDocument document = PDDocument.load(new File(SIMS_PDF))) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(SIMS_PAGE);
        stripper.setEndPage(SIMS_PAGE);
        text = stripper.getText(document);
    }
    // The stripper ends lines with the platform line separator, so accept both LF and CRLF.
    String[] lines = text.split("\\r?\\n");
    int lineIndex = 0;
    StringBuilder currentLine = new StringBuilder();
    for (String line : lines) {
        String trimmedLine = line.trim();
        if ((lineIndex++ < SIMS_SKIP) || trimmedLine.isEmpty()) {
            continue; // First lines are titles
        }
        // If line does not start with 'S.', it is a continuation of the previous line
        if (!trimmedLine.startsWith("S.")) {
            currentLine.append(" ").append(trimmedLine);
            continue;
        }
        // If line starts with 'S.', it is a new item. Store previous item if there is one.
        putSimsEntry(currentLine);
        currentLine = new StringBuilder(trimmedLine);
    }
    // The last item has no following 'S.' line to trigger its storage, so flush it here.
    putSimsEntry(currentLine);
    logger.debug("End of PDF extraction");
}

/** Stores one accumulated entry in entryMap: code before the first space, text after it. */
private void putSimsEntry(StringBuilder entry) {
    if (entry.length() == 0) {
        return;
    }
    int codeEnd = entry.indexOf(" ");
    if (codeEnd < 0) {
        // No space at all: the whole entry is the code and it carries no description.
        entryMap.put(entry.toString(), "");
        return;
    }
    entryMap.put(entry.substring(0, codeEnd), entry.substring(codeEnd + 1));
}
示例6: findpages
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Searches a PDF page by page for pages that contain every keyword, and saves those pages
 * to a new PDF.
 *
 * <p>Matching is case-insensitive substring matching against the extracted page text. The
 * result PDF is written to {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"},
 * even when no page was collected.
 *
 * <p>{@code falseCounter} is a field declared outside this method. A matching page is only
 * recorded while {@code falseCounter > 1}, and the counter is incremented on every match.
 * NOTE(review): this skips the earliest matches - confirm that is intended.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page for it to match
 * @param fileCounter       number used to build the result file name
 * @return one result per recorded page, carrying the page text, the file path and the
 *         1-based page number
 * @throws IOException if the PDF cannot be read or the result PDF cannot be saved
 */
public List<SearchResult> findpages(String path,
        List<String> searchKeywordList, int fileCounter) throws IOException {
    List<SearchResult> list = new ArrayList<SearchResult>();
    List<PDPage> pageList = new ArrayList<PDPage>();
    File file = new File(path);
    // Both documents are closed on every exit path; the result document is closed first,
    // while the source document that owns the copied pages is still open.
    try (PDDocument doc = PDDocument.load(file);
            PDDocument finalDocument = new PDDocument()) {
        PDFTextStripper reader = new PDFTextStripper();
        int pageCount = doc.getNumberOfPages();
        // PDFTextStripper page numbers are 1-based, PDDocument.getPage indexes are 0-based.
        for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
            reader.setStartPage(pageNo);
            reader.setEndPage(pageNo);
            // Extract once per page instead of once per keyword.
            String pageText = reader.getText(doc);
            String loweredText = pageText.toLowerCase();
            boolean hasKeywords = true;
            for (String keyword : searchKeywordList) {
                if (!loweredText.contains(keyword.toLowerCase())) {
                    hasKeywords = false;
                    break;
                }
            }
            if (hasKeywords) {
                if (falseCounter > 1) {
                    SearchResult result = new PageResult();
                    result.setFileContent(pageText);
                    result.setFilePath(path);
                    result.setPageNumber(pageNo);
                    list.add(result);
                    pageList.add(doc.getPage(pageNo - 1));
                }
                falseCounter++;
            }
        }
        for (PDPage page : pageList) {
            finalDocument.addPage(page);
        }
        finalDocument
                .save(ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf");
    }
    logger.info("Result Saved");
    return list;
}
示例7: findpages
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Searches a PDF page by page for pages that contain every keyword, and saves those pages
 * to a new PDF when at least one was found.
 *
 * <p>Matching is case-insensitive substring matching against the extracted page text. The
 * result PDF is written to {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"}.
 *
 * <p>{@code falseCounter} and {@code validResult} are fields declared outside this method.
 * {@code falseCounter} is incremented for every matching page. {@code validResult} is set
 * when a page is added to the result document and reset to {@code false} after saving.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page for it to match
 * @param fileCounter       number used to build the result file name
 * @return one result per matching page, carrying the page text, the file path and the
 *         1-based page number
 * @throws IOException if the PDF cannot be read or the result PDF cannot be saved
 */
public List<SearchResult> findpages(String path,
        List<String> searchKeywordList, int fileCounter) throws IOException {
    List<SearchResult> list = new ArrayList<SearchResult>();
    List<PDPage> pageList = new ArrayList<PDPage>();
    File file = new File(path);
    // Both documents are closed on every exit path, including when nothing is saved; the
    // result document is closed first, while the source document is still open.
    try (PDDocument doc = PDDocument.load(file);
            PDDocument finalDocument = new PDDocument()) {
        PDFTextStripper reader = new PDFTextStripper();
        int pageCount = doc.getNumberOfPages();
        // PDFTextStripper page numbers are 1-based, PDDocument.getPage indexes are 0-based.
        for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
            reader.setStartPage(pageNo);
            reader.setEndPage(pageNo);
            // Extract once per page instead of once per keyword.
            String pageText = reader.getText(doc);
            String loweredText = pageText.toLowerCase();
            boolean hasKeywords = true;
            for (String keyword : searchKeywordList) {
                if (!loweredText.contains(keyword.toLowerCase())) {
                    hasKeywords = false;
                    break;
                }
            }
            if (hasKeywords) {
                SearchResult result = new PageResult();
                result.setFileContent(pageText);
                result.setFilePath(path);
                result.setPageNumber(pageNo);
                list.add(result);
                pageList.add(doc.getPage(pageNo - 1));
                falseCounter++;
            }
        }
        for (PDPage page : pageList) {
            finalDocument.addPage(page);
            validResult = true;
        }
        if (validResult) {
            finalDocument.save(ConfigCBSI.getResultPdfPath() + fileCounter
                    + ".pdf");
            logger.info("Result Saved");
            validResult = false;
        }
    }
    return list;
}