本文整理汇总了Java中org.apache.pdfbox.text.PDFTextStripper类的典型用法代码示例。如果您正苦于以下问题:Java PDFTextStripper类的具体用法?Java PDFTextStripper怎么用?Java PDFTextStripper使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
PDFTextStripper类属于org.apache.pdfbox.text包,在下文中一共展示了PDFTextStripper类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Runs a {@code TitleExtractor} over the first page of the PDF located at
 * {@code getFileNamePathWithExtension()} and stores the result via {@code setTitle}.
 *
 * @return {@code true} once extraction has completed without an exception
 * @throws IOException if the file cannot be loaded or its text cannot be processed
 */
private boolean process() throws IOException {
    // Declared with its concrete type so no downcast is needed to read the title afterwards.
    TitleExtractor stripper = new TitleExtractor();
    // try-with-resources closes both resources and keeps the primary exception
    // if close() fails as well (a close() in finally would have replaced it).
    try (PDDocument document = PDDocument.load(new File(this.getFileNamePathWithExtension()));
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
        stripper.setSortByPosition(true);
        // PDFBox page numbers are 1-based; the range 1..1 selects the first page only.
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        // The written text is discarded; the extractor gathers the title while processing.
        stripper.writeText(document, dummy);
        setTitle(stripper.getTitle());
    }
    return true;
}
示例2: testNoToUnicodeTest2
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 * <p>
 * The extracted text is printed to standard output and written to
 * <code>test-2_NoToUnicode.txt</code> in <code>RESULT_FOLDER</code>.
 * </p>
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // The document is a second resource so that it is closed together with the stream.
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = PDDocument.load(resource) )
    {
        // Strip the ToUnicode maps from the resources of every page before extracting.
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}
示例3: main
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Prints the complete text of the file {@code "PDF File.pdf"} to standard output.
 * Any failure is reported as a stack trace instead of being propagated.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // try-with-resources guarantees the document is closed, also when extraction fails.
    try (PDDocument document = PDDocument.load(new File("PDF File.pdf"))) {
        PDFTextStripper stripper = new PDFTextStripper();
        String documentText = stripper.getText(document);
        System.out.println(documentText);
    } catch (Exception e) {
        // Demo entry point: report the problem and exit normally.
        e.printStackTrace();
    }
}
示例4: extractPdfText
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Returns the text content of a PDF supplied as a byte array.
 *
 * @param pdfData the raw bytes of the PDF
 * @return the text produced by a default {@code PDFTextStripper}
 * @throws IOException if the bytes cannot be loaded or the text cannot be extracted
 */
private static String extractPdfText(byte[] pdfData) throws IOException {
    final ByteArrayInputStream rawBytes = new ByteArrayInputStream(pdfData);
    final PDDocument parsed = PDDocument.load(rawBytes);
    try {
        final PDFTextStripper extractor = new PDFTextStripper();
        final String allText = extractor.getText(parsed);
        return allText;
    } finally {
        // Release the document whether or not extraction succeeded.
        parsed.close();
    }
}
开发者ID:jonashackt,项目名称:cxf-spring-cloud-netflix-docker,代码行数:12,代码来源:WeatherBackendApplicationTests.java
示例5: readPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Reads a PDF from the given stream and returns its text.
 * The stream is closed by this method, on success and on failure alike.
 *
 * @param is stream positioned at the start of the PDF data
 * @return the text produced by a default {@code PDFTextStripper}
 * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
 */
private static String readPdf(InputStream is) throws Exception {
    // Both resources are closed in reverse order even when getText throws;
    // the previous straight-line close calls were skipped on any exception.
    try (InputStream in = is; PDDocument doc = PDDocument.load(in)) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
}
示例6: process
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Runs an {@code AuthorExtractor} over the first page of the PDF located at
 * {@code getFileNamePathWithExtension()} and stores the author names,
 * affiliations and contacts it reports.
 *
 * @return {@code true} once extraction has completed without an exception
 * @throws IOException if the file cannot be loaded or its text cannot be processed
 */
private boolean process() throws IOException {
    // Declared with its concrete type so no downcasts are needed to read the results.
    AuthorExtractor stripper = new AuthorExtractor();
    // try-with-resources closes both resources and keeps the primary exception
    // if close() fails as well (a close() in finally would have replaced it).
    try (PDDocument document = PDDocument.load(new File(this.getFileNamePathWithExtension()));
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
        stripper.setSortByPosition(true);
        // PDFBox page numbers are 1-based; the range 1..1 selects the first page only.
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        // The written text is discarded; the extractor gathers its data while processing.
        stripper.writeText(document, dummy);
        setAuthorNames(stripper.getAuthorNames());
        setAuthorAffiliations(stripper.getAuthorAffiliations());
        setAuthorContacts(stripper.getAuthorContacts());
    }
    return true;
}
示例7: createTitleBlockForPartIterationTest
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Generates a title block for {@code partIteration}, loads the resulting bytes as a PDF
 * and checks that the extracted text contains the user login, the part number and the
 * part revision description.
 */
@Test
public void createTitleBlockForPartIterationTest() throws Exception {
    PartTitleBlockData partTitleBlockData = new PartTitleBlockData(partIteration, new Locale("en"));
    byte[] titleBlock = new TitleBlockWriter(partTitleBlockData).createTitleBlock();

    String text;
    // try-with-resources closes the document even if text extraction throws.
    try (PDDocument loadedDocument = PDDocument.load(titleBlock)) {
        Assert.assertNotNull(loadedDocument);
        text = new PDFTextStripper().getText(loadedDocument);
    }

    Assert.assertFalse(text.isEmpty());
    Assert.assertTrue(text.contains(user.getLogin()));
    Assert.assertTrue(text.contains(partIteration.getNumber()));
    Assert.assertTrue(text.contains(partIteration.getPartRevision().getDescription()));
}
示例8: testPdfFromStringTo
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Builds a PDF from an HTML string containing a non-ASCII character, parses the
 * generated bytes and checks that the extracted text still contains that string.
 */
@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder+"/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    String pdfText;
    // The random-access source is not owned by a PDDocument built from a bare COSDocument,
    // so it is closed explicitly here, after the document itself has been closed.
    try (RandomAccessBufferedFileInputStream source =
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))) {
        PDFParser parser = new PDFParser(source);

        // that is a valid PDF (otherwise an IOException occurs)
        parser.parse();

        try (PDDocument document = new PDDocument(parser.getDocument())) {
            PDFTextStripper pdfTextStripper = new PDFTextStripper();
            pdfText = pdfTextStripper.getText(document);
        }
    }

    assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
}
示例9: extractFromPDF
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Reads the PDF file at {@code MCV_PDF} and extracts the content and table of contents to text files.
 * <p>
 * Pages 15 to the end are written to {@code MCV_TXT}; pages 3 to 14 are written to
 * {@code MCV_TXT_TOC}. Both files are encoded as UTF-8.
 * </p>
 *
 * @throws IOException if the PDF cannot be read or an output file cannot be written
 */
public static void extractFromPDF() throws IOException {
    // try-with-resources closes the document even if extraction or writing fails.
    try (PDDocument document = PDDocument.load(new File(MCV_PDF))) {
        PDFTextStripper stripper = new PDFTextStripper();

        // Extract the main contents
        stripper.setStartPage(15);
        String text = stripper.getText(document);
        // Explicit charset: the no-argument getBytes() depends on the platform default.
        Files.write(Paths.get(MCV_TXT), text.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // Extract the table of contents
        stripper.setStartPage(3);
        stripper.setEndPage(14);
        text = stripper.getText(document);
        Files.write(Paths.get(MCV_TXT_TOC), text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }
}
示例10: PDF
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
private PDF(String name, byte[] content) {
this.content = content;
try (InputStream inputStream = new ByteArrayInputStream(content)) {
try (PDDocument pdf = PDDocument.load(inputStream)) {
this.text = new PDFTextStripper().getText(pdf);
this.numberOfPages = pdf.getNumberOfPages();
this.author = pdf.getDocumentInformation().getAuthor();
this.creationDate = pdf.getDocumentInformation().getCreationDate();
this.creator = pdf.getDocumentInformation().getCreator();
this.keywords = pdf.getDocumentInformation().getKeywords();
this.producer = pdf.getDocumentInformation().getProducer();
this.subject = pdf.getDocumentInformation().getSubject();
this.title = pdf.getDocumentInformation().getTitle();
this.encrypted = pdf.isEncrypted();
PDSignature signature = pdf.getLastSignatureDictionary();
this.signed = signature != null;
this.signerName = signature == null ? null : signature.getName();
this.signatureTime = signature == null ? null : signature.getSignDate();
}
}
catch (Exception e) {
throw new IllegalArgumentException("Invalid PDF file: " + name, e);
}
}
示例11: readPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Reads a PDF from the given stream and returns its text.
 * This method does not itself close {@code is}; only the loaded document is closed.
 *
 * @param is stream positioned at the start of the PDF data
 * @return the text produced by a default {@code PDFTextStripper}
 * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
 */
public static String readPdf(InputStream is) throws Exception {
    // The document was previously never closed; try-with-resources releases it on every path.
    try (PDDocument document = PDDocument.load(is)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        return textStripper.getText(document);
    }
}
示例12: extractPdf
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param path file system path of the PDF
 * @return the text produced by a default {@code PDFTextStripper}
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public static String extractPdf(String path) throws IOException {
    File file = new File(path);
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument document = PDDocument.load(file)) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(document);
    }
}
示例13: createTitleBlockForDocumentIterationTest
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Generates a title block for {@code documentIteration}, loads the resulting bytes as a PDF
 * and checks that the extracted text contains the user login, the document id and the
 * document revision description.
 */
@Test
public void createTitleBlockForDocumentIterationTest() throws Exception {
    DocumentTitleBlockData documentTitleData = new DocumentTitleBlockData(documentIteration, new Locale("en"));
    byte[] titleBlock = new TitleBlockWriter(documentTitleData).createTitleBlock();

    String text;
    // try-with-resources closes the document even if text extraction throws.
    try (PDDocument loadedDocument = PDDocument.load(titleBlock)) {
        Assert.assertNotNull(loadedDocument);
        text = new PDFTextStripper().getText(loadedDocument);
    }

    Assert.assertFalse(text.isEmpty());
    Assert.assertTrue(text.contains(user.getLogin()));
    Assert.assertTrue(text.contains(documentIteration.getId()));
    Assert.assertTrue(text.contains(documentIteration.getDocumentRevision().getDescription()));
}
示例14: findSubwords
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * Collects every occurrence of {@code searchTerm} on a single page, including
 * occurrences inside longer words and occurrences that overlap each other.
 * <p>
 * The search runs separately on each text chunk the stripper passes to
 * {@code writeString}, so a match never spans two chunks. Each chunk is also
 * echoed to standard output.
 * </p>
 *
 * @param document   the document to search
 * @param page       the page number to search, used as both start and end page
 * @param searchTerm the text to look for; an empty term yields no hits
 * @return the matching sub-sequences in the order they were found
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();

    // String.indexOf("", from) never returns -1, so an empty term would keep the
    // scan loop below running (and adding hits) forever.
    if (searchTerm != null && searchTerm.isEmpty())
    {
        return hits;
    }

    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            int fromIndex = 0;
            int index;
            while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
            {
                hits.add(word.subSequence(index, index + searchTerm.length()));
                // Advance by one character only, so overlapping matches are reported too.
                fromIndex = index + 1;
            }
            super.writeString(text, textPositions);
        }
    };

    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the call only drives the writeString callback.
    stripper.getText(document);
    return hits;
}
示例15: testPnL_500010_0314
import org.apache.pdfbox.text.PDFTextStripper; //导入依赖的package包/类
/**
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpedUNwVTR3RVlRTFE/view?usp=sharing">
 * PnL_500010_0314.pdf
 * </a>
 * <p>
 * Indeed, the <code>PDFTextStripper</code> is not even informed about those undecipherable
 * text sections. Essentially the underlying method <code>PDFTextStreamEngine.showGlyph</code> filters
 * all unmappable glyphs from composite fonts.
 * </p>
 * <p>
 * The extracted text is printed to standard output and written to
 * <code>PnL_500010_0314.txt</code> in <code>RESULT_FOLDER</code>.
 * </p>
 */
@Test
public void testPnL_500010_0314() throws IOException
{
    // The document is a second resource so that it is closed together with the stream.
    try (   InputStream resource = getClass().getResourceAsStream("PnL_500010_0314.pdf");
            PDDocument document = PDDocument.load(resource) )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* PnL_500010_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PnL_500010_0314.txt").toPath(), Collections.singleton(text));
    }
}