本文整理汇总了Java中org.apache.pdfbox.text.PDFTextStripper.getText方法的典型用法代码示例。如果您正苦于以下问题:Java PDFTextStripper.getText方法的具体用法?Java PDFTextStripper.getText怎么用?Java PDFTextStripper.getText使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.pdfbox.text.PDFTextStripper的用法示例。
在下文中一共展示了PDFTextStripper.getText方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testNoToUnicodeTest2
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 * <p>
 * The test passes the resources of every page to <code>removeToUnicodeMaps</code>,
 * extracts the text of the whole document, prints it and writes it to
 * <code>test-2_NoToUnicode.txt</code> in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // The document is declared as a resource so it is closed even if extraction or writing fails.
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = PDDocument.load(resource)    )
    {
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}
示例2: main
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Loads the file "PDF File.pdf" (relative path), extracts its text and prints it
 * to standard output. Any failure is reported by printing the stack trace.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {
    // try-with-resources closes the document even when text extraction throws
    try (PDDocument document = PDDocument.load(new File("PDF File.pdf"))) {
        PDFTextStripper stripper = new PDFTextStripper();
        String documentText = stripper.getText(document);
        System.out.println(documentText);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
示例3: readPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the text of the PDF read from the given stream, then closes both
 * the loaded document and the stream.
 *
 * @param is stream providing the PDF data; it is closed by this method
 * @return the text extracted from the document
 * @throws Exception if loading, text extraction or closing fails
 */
private static String readPdf(InputStream is) throws Exception {
    // Both resources are released on every path (document first, then the stream),
    // including when loading or text extraction throws.
    try (InputStream in = is; PDDocument doc = PDDocument.load(in)) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
}
示例4: testPdfFromStringTo
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Builds a PDF from an HTML string containing a non-ASCII character, parses the
 * resulting bytes with PDFBox and checks that the extracted text contains it.
 *
 * @throws Exception if building, saving, parsing or text extraction fails
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN an html template containing special characters that java stores in utf-16 internally
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder+"/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    // The parser source and the document are closed explicitly; a PDDocument built
    // from a bare COSDocument is not handed the parser source, so the source is
    // closed here as well.
    try (RandomAccessBufferedFileInputStream source =
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))) {
        PDFParser parser = new PDFParser(source);

        // that is a valid PDF (otherwise an IOException occurs)
        parser.parse();

        try (PDDocument document = new PDDocument(parser.getDocument())) {
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            String pdfText = pdfTextStripper.getText(document);

            assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
        }
    }
}
示例5: extractFromPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Reads the PDF file and extracts the content and table of contents to text files.
 * <p>
 * The main contents (page 15 onwards) are written to MCV_TXT, the table of
 * contents (pages 3 to 14) to MCV_TXT_TOC.
 * </p>
 *
 * @throws IOException if the PDF cannot be read or a text file cannot be written
 */
public static void extractFromPDF() throws IOException {

    // try-with-resources closes the document even if extraction or writing fails
    try (PDDocument document = PDDocument.load(new File(MCV_PDF))) {
        PDFTextStripper stripper = new PDFTextStripper();

        // Extract the main contents
        stripper.setStartPage(15);
        String text = stripper.getText(document);
        // NOTE(review): getBytes() uses the platform default charset - confirm this is intended
        Files.write(Paths.get(MCV_TXT), text.getBytes());

        // Extract the table of contents
        stripper.setStartPage(3);
        stripper.setEndPage(14);
        text = stripper.getText(document);
        Files.write(Paths.get(MCV_TXT_TOC), text.getBytes());
    }
}
示例6: readPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the text of the PDF read from the given stream.
 * <p>
 * The loaded document is closed before returning; the stream itself is not
 * closed by this method.
 * </p>
 *
 * @param is stream providing the PDF data
 * @return the text extracted from the document
 * @throws Exception if loading or text extraction fails
 */
public static String readPdf(InputStream is) throws Exception {
    // Close the document once the text has been extracted, also on failure
    try (PDDocument document = PDDocument.load(is)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(document);
    }
}
示例7: extractPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param path location of the PDF file
 * @return the text extracted from the document
 * @throws IOException if the file cannot be loaded or text extraction fails
 */
public static String extractPdf(String path) throws IOException {
    // Loading an existing document; try-with-resources closes it even if extraction throws
    try (PDDocument document = PDDocument.load(new File(path))) {
        // Instantiate PDFTextStripper class
        PDFTextStripper pdfStripper = new PDFTextStripper();

        // Retrieving text from PDF document
        return pdfStripper.getText(document);
    }
}
示例8: findSubwords
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Collects every occurrence of a search term on one page of a document.
 * <p>
 * Each text chunk handed to <code>writeString</code> is printed for debugging and
 * searched for the term; overlapping occurrences are reported too because the
 * search resumes one character after the start of the previous hit.
 * </p>
 *
 * @param document   the document to search
 * @param page       the page to search, passed to both setStartPage and setEndPage
 * @param searchTerm the text to look for; a null or empty term yields no hits
 * @return the sub-sequences of text positions matching the term, in the order found
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();

    // An empty term matches at every index, so indexOf would never return -1 and
    // the search loop below would not terminate; a null term cannot match anything.
    if (searchTerm == null || searchTerm.isEmpty())
    {
        return hits;
    }

    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            int fromIndex = 0;
            int index;
            while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
            {
                hits.add(word.subSequence(index, index + searchTerm.length()));
                // advance by one only, so overlapping matches are found as well
                fromIndex = index + 1;
            }
            super.writeString(text, textPositions);
        }
    };

    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the call drives the writeString callback.
    stripper.getText(document);
    return hits;
}
示例9: testPnL_500010_0314
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpedUNwVTR3RVlRTFE/view?usp=sharing">
 * PnL_500010_0314.pdf
 * </a>
 * <p>
 * Indeed, the <code>PDFTextStripper</code> is not even informed about those undecipherable
 * text sections. Essentially the underlying method `PDFTextStreamEngine.showGlyph` filters
 * all unmappable glyphs from composite fonts.
 * </p>
 * <p>
 * The test extracts the text of the whole document, prints it and writes it to
 * <code>PnL_500010_0314.txt</code> in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void testPnL_500010_0314() throws IOException
{
    // The document is declared as a resource so it is closed even if extraction or writing fails.
    try (   InputStream resource = getClass().getResourceAsStream("PnL_500010_0314.pdf");
            PDDocument document = PDDocument.load(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* PnL_500010_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PnL_500010_0314.txt").toPath(), Collections.singleton(text));
    }
}
示例10: testBal_532935_0314
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpebm96U05FcWFsSXM/view?usp=sharing">
 * Bal_532935_0314.pdf
 * </a>
 * <p>
 * The issue here is caused by PDFBox guessing an encoding. The underlying method
 * `PDFTextStreamEngine.showGlyph` does this for all unmappable glyphs from simple
 * fonts.
 * </p>
 * <p>
 * The test extracts the text of the whole document, prints it and writes it to
 * <code>Bal_532935_0314.txt</code> in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void testBal_532935_0314() throws IOException
{
    // The document is declared as a resource so it is closed even if extraction or writing fails.
    try (   InputStream resource = getClass().getResourceAsStream("Bal_532935_0314.pdf");
            PDDocument document = PDDocument.load(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* Bal_532935_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "Bal_532935_0314.txt").toPath(), Collections.singleton(text));
    }
}
示例11: test03WpEnterpriseBlackBerryCompeteDatasheet_041612FinalDraft
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * <a href="http://stackoverflow.com/questions/38975091/pdfbox-gettext-not-returning-all-of-the-visible-text">
 * PDFBox getText not returning all of the visible text
 * </a>
 * <br>
 * <a href="https://dl.dropboxusercontent.com/u/14898138/03%20WP%20Enterprise%20BlackBerry%20Compete%20Datasheet_041612%20FINAL%20DRAFT.pdf">
 * 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf
 * </a>
 * <p>
 * There is some 'writing' actually done using vector graphics, not text,
 * but aside from that all is accounted for.
 * </p>
 * <p>
 * The test extracts the text of the whole document, prints it and writes it to
 * a text file of the same base name in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void test03WpEnterpriseBlackBerryCompeteDatasheet_041612FinalDraft() throws IOException
{
    // The document is declared as a resource so it is closed even if extraction or writing fails.
    try (   InputStream resource = getClass().getResourceAsStream("03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf");
            PDDocument document = PDDocument.load(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.txt").toPath(), Collections.singleton(text));
    }
}
示例12: testNoTemplateInError
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * <a href="https://stackoverflow.com/questions/47515609/invalid-block-type-while-using-pdfbox-2-0-8">
 * Invalid block type while using pdfbox 2.0.8
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/xjeksj0cay4x3vo/NoTemplateInError.pdf?dl=0">
 * NoTemplateInError.pdf
 * </a>
 * <p>
 * The issue cannot be reproduced.
 * </p>
 * <p>
 * The test extracts the text of the whole document, prints it and writes it to
 * <code>NoTemplateInError.txt</code> in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void testNoTemplateInError() throws IOException
{
    // The document is declared as a resource so it is closed even if extraction or writing fails.
    try (   InputStream resource = getClass().getResourceAsStream("NoTemplateInError.pdf");
            PDDocument document = PDDocument.load(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* NoTemplateInError.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "NoTemplateInError.txt").toPath(), Collections.singleton(text));
    }
}
示例13: extractFromPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Reads the PDF file and extracts the SIMS contents to the internal sorted map.
 * <p>
 * Only page SIMS_PAGE is read. The first SIMS_SKIP lines and all blank lines are
 * ignored. A line starting with 'S.' begins a new item; any other line is appended
 * to the current item. Each item is stored in the map with the text before its
 * first space as key and the remainder as value.
 * </p>
 *
 * @throws IOException if the PDF cannot be loaded or text extraction fails
 */
public void extractFromPDF() throws IOException {

    // Reinitialize the sorted map
    entryMap = new TreeMap<String, String>();

    // Extract the SIMS contents into a list of strings
    logger.debug("Starting PDF extraction from " + SIMS_PDF);
    String text;
    // try-with-resources closes the document even if extraction fails
    try (PDDocument document = PDDocument.load(new File(SIMS_PDF))) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(SIMS_PAGE);
        stripper.setEndPage(SIMS_PAGE);
        text = stripper.getText(document);
    }

    // Accept both "\r\n" and "\n": the separator written by the stripper is not fixed to "\r\n"
    String[] lines = text.split("\\r?\\n");
    int lineIndex = 0;
    StringBuilder currentLine = new StringBuilder();
    for (String line : lines) {
        String trimmedLine = line.trim();
        // First lines are titles; lineIndex is incremented for every line, blank or not
        if ((lineIndex++ < SIMS_SKIP) || (trimmedLine.length() == 0)) {
            continue;
        }
        // If line does not start with 'S.', it is a continuation of the previous line
        if (!trimmedLine.startsWith("S.")) {
            currentLine.append(" ").append(trimmedLine);
            continue;
        }
        // If line starts with 'S.', it is a new item. Store previous item if there is one.
        storeSimsEntry(currentLine);
        currentLine = new StringBuilder(trimmedLine);
    }
    // The last item is not followed by another 'S.' line, so it has to be stored explicitly
    storeSimsEntry(currentLine);

    logger.debug("End of PDF extraction");
}

/**
 * Stores one accumulated item in the sorted map; does nothing for an empty item.
 *
 * @param item the item text, made of a code, a space and a label
 */
private void storeSimsEntry(StringBuilder item) {
    if (item.length() == 0) {
        return;
    }
    // The code is everything before the first space
    int codeEnd = item.indexOf(" ");
    if (codeEnd < 0) {
        // No space at all: keep the whole text as code with an empty label instead of failing
        entryMap.put(item.toString(), "");
    } else {
        entryMap.put(item.substring(0, codeEnd), item.substring(codeEnd + 1));
    }
}
示例14: extractTextContent
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the text of a PDF document using PDFBox and hands it, line by line,
 * to <code>addLine</code>.
 *
 * @param result the item passed to <code>addLine</code> together with each line
 * @param pdf    the document whose text is extracted
 * @return the sum of the values returned by <code>addLine</code>, or 0 when the
 *         extracted text is empty
 * @throws IOException if text extraction fails
 */
private int extractTextContent(ParserResultItem result, PDDocument pdf) throws IOException {
    final String extracted = new PDFTextStripper().getText(pdf);
    if (StringUtils.isEmpty(extracted)) {
        return 0;
    }

    int total = 0;
    for (final String current : StringUtils.splitLines(extracted)) {
        total += addLine(result, current);
    }
    return total;
}
示例15: getNotesPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the explanatory notes from the official PDF publication of the UK SIC classification.
 *
 * <i>Note:</i> This method is unfinished: the notes are collected in local lists
 * and printed for inspection, but nothing is stored or returned yet.
 *
 * @throws IOException if the PDF cannot be loaded or text extraction fails
 */
@SuppressWarnings("unused")
private void getNotesPDF() throws IOException {

    String rawText;
    // try-with-resources closes the document even if text extraction fails
    try (PDDocument document = PDDocument.load(new File(LOCAL_FOLDER + SIC_NOTES_FILE))) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Extract the main contents and saves it to a file (this is only for debugging purposes)
        stripper.setStartPage(59);
        rawText = stripper.getText(document);
        //Files.write(Paths.get(LOCAL_FOLDER + "sic-notes.txt"), rawText.getBytes());
    }

    // Read the string containing the raw text line by line and try to make sense of it
    String noteLine = null;
    List<String> currentNote = null;
    int lineNumber = 0;
    boolean ignore = true; // Lines 1 and 2 must be ignored
    try (BufferedReader noteReader = new BufferedReader(new StringReader(rawText))) {
        while ((noteLine = noteReader.readLine()) != null) {
            lineNumber++;
            // Ignore the lines that correspond to the headers of the PDF pages
            // The pattern for the first page is irregular, so we explicitly eliminate the line containing the section letter by its number
            if (lineNumber == 52) ignore = true;
            // For all other pages, the pattern is - line beginning or ending with 'Explanatory Notes', then line with page number, then line with section letter
            if ((noteLine.startsWith("Explanatory Notes")) || (noteLine.endsWith("Explanatory Notes"))) ignore = true; // Checked: we don't eliminate note lines with this test
            if (ignore) {
                //System.out.println("Ignored line " + lineNumber + " - " + noteLine);
                if ((lineNumber == 2) || (lineNumber == 52) || (noteLine.length() == 1)) ignore = false; // Resume reading after current line
                continue;
            }
            // Find the lines that are item titles
            String code = null;
            if (noteLine.startsWith("Section")) {
                System.out.println(noteLine);
                code = noteLine.substring(8, 9); // Checked: first test identifies exactly the section lines
            }
            // A too loose test like 'begins with two digits' (^\\d{2}.+$) misses vicious cases, so we have to be more precise
            // Examples of pathological cases at line numbers 322, 1496, 1827, 3088, 6598, 7098, 8534, 8535, 8622...
            if (noteLine.matches("^\\d{2}.+$")) { // We still use a catch-fall test for optimization
                if ((noteLine.matches("^\\d{2} .+$")) && (lineNumber != 322)) code = noteLine.substring(0, 2); // Checked: the test identifies exactly the division lines
                else if (noteLine.matches("^\\d{2}\\.\\d .+$")) code = noteLine.substring(0, 4); // Checked: the test identifies exactly the group lines
                else if (noteLine.matches("^\\d{2}\\.\\d{2} .+$")) code = noteLine.substring(0, 5); // Checked: the test identifies exactly the classes lines
                else if (noteLine.matches("^\\d{2}\\.\\d{2}/\\d .+$")) code = noteLine.substring(0, 7); // Checked: the test identifies exactly the subclasses lines
            }
            if (code != null) { // Start of a note for item identified by 'code'
                currentNote = new ArrayList<String>();
                System.out.println(lineNumber + " - " + code + " - '" + noteLine.substring(code.length() + 1) + "'");
            } else {
                if (currentNote != null) currentNote.add(noteLine); // We could avoid the null test since we jumped directly to the first title
            }
        }
    }
}