本文整理匯總了Java中org.apache.pdfbox.util.PDFTextStripper類的典型用法代碼示例。如果您正苦於以下問題:Java PDFTextStripper類的具體用法?Java PDFTextStripper怎麼用?Java PDFTextStripper使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
PDFTextStripper類屬於org.apache.pdfbox.util包,在下文中一共展示了PDFTextStripper類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: readPDFDocument
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Reads the PDF referenced by the field {@code f}, extracts its text and appends
 * each line of that text to the field {@code lines}.
 * <p>
 * The input stream and the PDF document are released on every path, including
 * failures. Any exception is reported to the user through an error dialog and its
 * stack trace is printed; nothing propagates to the caller.
 */
private void readPDFDocument() {
    try (FileInputStream fs = new FileInputStream(f)) {
        PDFParser parser = new PDFParser(fs);
        parser.parse();
        // Per the PDFBox 1.8 API, closing the PDDocument also closes the
        // COSDocument it wraps, so a single resource covers both.
        try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            String text = pdfStripper.getText(pdDoc);
            for (String line : text.split(System.lineSeparator())) {
                lines.add(line);
            }
        }
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Fehler in readPDFDocument",
                "Fehler", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
示例2: transformTextAndCheck
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Transforms the given text to PDF, extracts the text from the resulting PDF
 * again and asserts that the cleaned round-trip text equals {@code checkText}.
 *
 * @param text      the source text to transform
 * @param encoding  name of the charset used to build the content reader
 * @param checkText the text expected after the round trip
 * @throws IOException if writing, loading or text extraction fails
 */
private void transformTextAndCheck(String text, String encoding, String checkText)
        throws IOException
{
    // Get a reader for the text
    ContentReader reader = buildContentReader(text, Charset.forName(encoding));

    // And a temp writer
    File out = TempFileProvider.createTempFile("AlfrescoTest_", ".pdf");
    ContentWriter writer = new FileContentWriter(out);
    writer.setMimetype("application/pdf");

    // Transform to PDF
    transformer.transform(reader, writer);

    // Read back in the PDF and check it
    StringWriter textWriter = new StringWriter();
    PDDocument doc = PDDocument.load(out);
    try
    {
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.writeText(doc, textWriter);
    }
    finally
    {
        // Release the document even when text extraction fails.
        doc.close();
    }

    String roundTrip = clean(textWriter.toString());
    assertEquals(
            "Incorrect text in PDF when starting from text in " + encoding,
            checkText, roundTrip
    );
}
示例3: pdftoText
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Extracts the text of a single page from a PDF file.
 *
 * @param fileName path of the PDF file to read
 * @param pageno   page to extract; used as both start and end page of the
 *                 stripper (PDFBox page numbers are 1-based)
 * @return the text extracted from the requested page
 * @throws IOException           if the file cannot be read or parsed
 * @throws CryptographyException kept in the signature for existing callers; the
 *                               decryption code is currently disabled
 */
static String pdftoText(String fileName, int pageno) throws IOException, CryptographyException
{
    // try-with-resources guarantees the stream is closed even if parsing fails.
    try (FileInputStream input = new FileInputStream(new File(fileName)))
    {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        // Per the PDFBox 1.8 API, closing the PDDocument also closes the
        // underlying COSDocument, so no separate close is needed.
        try (PDDocument pdDoc = new PDDocument(parser.getDocument()))
        {
            /*pdDoc.decrypt("");
            pdDoc.setAllSecurityToBeRemoved(true);*/
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setStartPage(pageno);
            pdfStripper.setEndPage(pageno);
            return pdfStripper.getText(pdDoc);
        }
    }
}
示例4: testPdfFromStringTo
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Verifies that a page added from an HTML string containing a non-ASCII
 * character yields a parseable PDF whose extracted text contains that string.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // THEN - the document is closed even when the assertion fails
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    }
}
示例5: testMultiplePages
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Verifies that a PDF built from several pages (HTML strings and one URL) can be
 * parsed and that the text of the last page appears in the extracted text.
 */
@Test
public void testMultiplePages() throws Exception {
    // GIVEN four pages, the third one fetched from a URL
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 1</h1></html>", PageType.htmlAsString);
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 2</h1></html>", PageType.htmlAsString);
    pdf.addPage("http://www.google.com", PageType.url);
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 4</h1></html>", PageType.htmlAsString);

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // THEN - the document is closed even when the assertion fails
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the fourth page name", pdfText, containsString("Page 4"));
    }
}
示例6: pdftoText
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Parses a PDF from the given stream and returns its extracted text.
 *
 * @param is    stream delivering the PDF bytes
 * @param stats when {@code true}, the extracted text is also passed to
 *              {@code vc.addAll}
 * @return the text extracted from the document
 * @throws IOException if parsing or text extraction fails
 */
public String pdftoText(InputStream is, boolean stats) throws IOException {
    COSDocument lowLevelDocument = null;
    PDDocument highLevelDocument = null;
    try {
        PDFParser pdfParser = new PDFParser(is);
        pdfParser.parse();
        lowLevelDocument = pdfParser.getDocument();

        PDFTextStripper textStripper = new PDFTextStripper();
        highLevelDocument = new PDDocument(lowLevelDocument);
        String extracted = textStripper.getText(highLevelDocument);

        if (stats) {
            vc.addAll(extracted);
        }
        return extracted;
    } finally {
        // Release whatever was opened, low-level document first.
        if (lowLevelDocument != null) {
            lowLevelDocument.close();
        }
        if (highLevelDocument != null) {
            highLevelDocument.close();
        }
    }
}
示例7: extractWordLocations
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Extracts the text of the document and appends, after every written string, a
 * marker of the form {@code [startX - endX / y]} built from the direction-adjusted
 * coordinates of the string's first and last text positions.
 *
 * @param document the document to extract from
 * @return the extracted text with the coordinate markers embedded
 * @throws IOException if text extraction fails
 */
String extractWordLocations(PDDocument document) throws IOException
{
    PDFTextStripper locatingStripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            // Emit the text itself first, then the location marker.
            super.writeString(text, textPositions);

            TextPosition head = textPositions.get(0);
            TextPosition tail = textPositions.get(textPositions.size() - 1);

            float startX = head.getXDirAdj();
            float endX = tail.getXDirAdj() + tail.getWidthDirAdj();
            float y = head.getYDirAdj();

            writeString(String.format("[%s - %s / %s]", startX, endX, y));
        }
    };

    locatingStripper.setSortByPosition(true);
    return locatingStripper.getText(document);
}
示例8: process
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Extracts the text of every page of {@code pdf} and writes it, page by page,
 * to {@code output}.
 * <p>
 * A temporary scratch file is used while loading the document. The scratch
 * file, the document and the writer are each released in their own
 * {@code finally} step, so a failure while opening or closing one of them no
 * longer leaks the others. The method is best effort: I/O failures are not
 * reported to the caller.
 *
 * @param pdf    the PDF file to read
 * @param output the text file to (over)write
 */
public void process(File pdf, File output) {
    try {//Kudos for closing: http://stackoverflow.com/questions/156508/closing-a-java-fileinputstream
        File tmpfile = File.createTempFile(String.format("txttmp-%s", UUID.randomUUID().toString()), null);
        try {
            RandomAccessFile raf = new RandomAccessFile(tmpfile, "rw");
            try (PDDocument pdDoc = PDDocument.loadNonSeq(pdf, raf);
                 FileWriter writer = new FileWriter(output)) {
                PDFTextStripper stripper = new PDFTextStripper();
                int numberOfPages = pdDoc.getNumberOfPages();
                // Page numbers are 1-based; extract one page at a time and
                // flush so partial output survives a later failure.
                for (int page = 1; page <= numberOfPages; page++) {
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    writer.write(stripper.getText(pdDoc));
                    writer.flush();
                }
            } finally {
                // Runs after the document and writer have been closed.
                raf.close();
            }
        } finally {
            // Always remove the scratch file, even if loading failed.
            tmpfile.delete();
        }
    } catch (IOException ignored) {
        // Deliberately best effort: the original logging call is disabled.
        // log.warn(String.format("Failed to create txt for file: %s", pdf.getName()), ioe);
    }
}
示例9: testPdfWithXvfb
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Verifies that a PDF generated through a wrapper configured with Xvfb
 * parameters can be parsed and that its extracted text contains "Google".
 */
@Test
public void testPdfWithXvfb() throws Exception {
    // GIVEN a wrapper configuration that carries the Xvfb parameters
    XvfbConfig xc = new XvfbConfig();
    xc.addParams(new Param("--auto-servernum"), new Param("--server-num=1"));
    WrapperConfig wc = new WrapperConfig();
    wc.setXvfbConfig(xc);
    Pdf pdf = new Pdf(wc);
    pdf.addPage("http://www.google.com", PageType.url);
    pdf.saveAs("output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // THEN - the document is closed even when the assertion fails
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should be generated", pdfText, containsString("Google"));
    }
}
示例10: doExtractText
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Extracts the text of the {@link PDDocument} carried in the body of the incoming
 * message. An encrypted document is first opened with the decryption material
 * taken from the {@code DECRYPTION_MATERIAL_HEADER_NAME} header.
 *
 * @param exchange the exchange whose in-message body holds the document
 * @return the text extracted from the document
 * @throws IllegalArgumentException if the document is encrypted and the
 *                                  decryption material header is missing
 */
private String doExtractText(Exchange exchange) throws IOException, CryptographyException, InvalidPasswordException, BadSecurityHandlerException {
    LOG.debug("Got {} operation, going to extract text from provided pdf.", pdfConfiguration.getOperation());

    PDDocument pdf = exchange.getIn().getBody(PDDocument.class);
    if (pdf.isEncrypted()) {
        pdf.openProtection(requireDecryptionMaterial(exchange));
    }

    return new PDFTextStripper().getText(pdf);
}

/** Returns the decryption material header of the in-message, failing if it is absent. */
private DecryptionMaterial requireDecryptionMaterial(Exchange exchange) {
    DecryptionMaterial material = exchange.getIn().getHeader(DECRYPTION_MATERIAL_HEADER_NAME,
            DecryptionMaterial.class);
    if (material != null) {
        return material;
    }
    String message = String.format("%s header is expected for %s operation "
                    + "on encrypted document",
            DECRYPTION_MATERIAL_HEADER_NAME,
            pdfConfiguration.getOperation());
    throw new IllegalArgumentException(message);
}
示例11: testPdfCreation
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Sends a text body to {@code direct:start} and verifies that the message
 * received by the result endpoint is a {@link ByteArrayOutputStream} holding a
 * one-page PDF whose extracted text contains the text that was sent.
 */
@Test
public void testPdfCreation() throws Exception {
    final String expectedText = "expectedText";
    template.sendBody("direct:start", expectedText);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            byte[] pdfBytes = ((ByteArrayOutputStream) body).toByteArray();
            // try-with-resources closes the document even when an assertion fails.
            try (PDDocument doc = PDDocument.load(new ByteArrayInputStream(pdfBytes))) {
                PDFTextStripper pdfTextStripper = new PDFTextStripper();
                String text = pdfTextStripper.getText(doc);
                assertEquals(1, doc.getNumberOfPages());
                assertThat(text, containsString(expectedText));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
}
示例12: testExtractAsMudit
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Prints the text PDFBox extracts from the first page of {@code Airtel.pdf}.
 * <p>
 * Background:
 * <a href="http://stackoverflow.com/questions/37566288/pdfbox-is-not-giving-right-output">
 * PDFBOX is not giving right output
 * </a>, sample file
 * <a href="https://www.dropbox.com/s/bsm4zgv5v0mvj7v/Airtel.pdf?dl=0">
 * Airtel.pdf
 * </a>.
 * </p>
 * <p>
 * The extraction returns next to nothing. According to the original analysis of
 * the file, nearly all visible "text" is not produced by text drawing
 * instructions; the character outlines are defined as paths and filled instead,
 * so short of OCR there is little that can be extracted.
 * </p>
 */
@Test
public void testExtractAsMudit() throws COSVisitorException, IOException
{
    try (   InputStream resource = getClass().getResourceAsStream("Airtel.pdf");
            PDDocument pdf = PDDocument.load(resource)  )
    {
        // Restrict extraction to the first page only.
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        String firstPageText = stripper.getText(pdf);

        System.out.println("\n'Airtel.pdf', extract as Mudit:");
        System.out.println(firstPageText);
        System.out.println("***********************************");
    }
}
示例13: extractNoSpaces
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Extracts the text of the document while dropping every text position whose
 * character is {@code null} or trims to the empty string, before the stripper
 * processes it.
 *
 * @param document the document to extract from
 * @return the text produced by the stripper from the remaining text positions
 * @throws IOException if text extraction fails
 */
String extractNoSpaces(PDDocument document) throws IOException
{
    PDFTextStripper filteringStripper = new PDFTextStripper()
    {
        @Override
        protected void processTextPosition(TextPosition text)
        {
            String glyph = text.getCharacter();
            if (glyph == null || glyph.trim().length() == 0)
            {
                // Skip null or blank-only positions.
                return;
            }
            super.processTextPosition(text);
        }
    };

    filteringStripper.setSortByPosition(true);
    return filteringStripper.getText(document);
}
示例14: testPdfFromStringTo
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Verifies that a page added from an HTML string containing a non-ASCII
 * character yields a parseable PDF whose extracted text contains that string.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN a html template containing special characters that java stores in utf-16 internally
    Pdf pdf = new Pdf();
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // THEN - the document is closed even when the assertion fails
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    }
}
示例15: testMultiplePages
import org.apache.pdfbox.util.PDFTextStripper; //導入依賴的package包/類
/**
 * Verifies that a PDF built from several pages (HTML strings and one URL) can be
 * parsed and that the text of the last page appears in the extracted text.
 */
@Test
public void testMultiplePages() throws Exception {
    // GIVEN four pages, the third one fetched from a URL
    Pdf pdf = new Pdf();
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 1</h1></html>");
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 2</h1></html>");
    pdf.addPageFromUrl("http://www.google.com");
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 4</h1></html>");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // THEN - the document is closed even when the assertion fails
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String pdfText = pdfTextStripper.getText(document);
        Assert.assertThat("document should contain the fourth page name", pdfText, containsString("Page 4"));
    }
}