本文整理汇总了 Java 中 org.apache.pdfbox.pdfparser.PDFParser 类的典型用法代码示例。如果您正苦于以下问题：Java PDFParser 类的具体用法？Java PDFParser 怎么用？Java PDFParser 使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
PDFParser类属于org.apache.pdfbox.pdfparser包,在下文中一共展示了PDFParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: readPDFDocument
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Extracts the text of the PDF referenced by the field {@code f} and appends
 * it, one entry per line, to the field {@code lines}.
 *
 * <p>Any failure is reported to the user through an error dialog and the
 * stack trace is printed; no exception escapes this method.
 */
private void readPDFDocument() {
    // try-with-resources releases the file handle even when construction or
    // parsing fails.
    try (FileInputStream fs = new FileInputStream(f)) {
        PDFParser parser = new PDFParser(fs);
        parser.parse();
        PDDocument pdDoc = new PDDocument(parser.getDocument());
        try {
            String text = new PDFTextStripper().getText(pdDoc);
            for (String line : text.split(System.lineSeparator())) {
                lines.add(line);
            }
        } finally {
            // The original never closed the document; closing it also
            // releases the underlying COSDocument.
            pdDoc.close();
        }
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "Fehler in readPDFDocument",
                "Fehler", JOptionPane.ERROR_MESSAGE);
        e.printStackTrace();
    }
}
示例2: pdftoText
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Extracts the text of a single page from a PDF file.
 *
 * <p>No decryption is attempted here; the document is read as-is.
 *
 * @param fileName path of the PDF file to read
 * @param pageno   page number to extract; used as both the start and the end
 *                 page of the text stripper
 * @return the text the stripper produced for that page
 * @throws IOException            if the file cannot be opened, parsed or read
 * @throws CryptographyException  kept in the signature for source
 *                                compatibility with existing callers
 */
static String pdftoText(String fileName,int pageno) throws IOException, CryptographyException
{
    FileInputStream input = new FileInputStream(new File(fileName));
    try
    {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument pdDoc = new PDDocument(parser.getDocument());
        try
        {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setStartPage(pageno);
            pdfStripper.setEndPage(pageno);
            return pdfStripper.getText(pdDoc);
        }
        finally
        {
            // Closing the PDDocument also closes the COSDocument it wraps,
            // and now happens even when text extraction throws.
            pdDoc.close();
        }
    }
    finally
    {
        // The original never closed the stream it opened.
        input.close();
    }
}
示例3: rotate180
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Rotates every page of the stored PDF by 180 degrees and saves it back to
 * the same file.
 *
 * <p>The document is looked up from the {@code document} request parameter
 * and the file is resolved under the {@code DOCUMENT_DIR} property. For each
 * page the cached version is deleted via
 * {@code ManageDocumentAction.deleteCacheVersion} using a 1-based page index.
 *
 * @return always {@code null}
 * @throws Exception if the file cannot be read, parsed or written
 */
public ActionForward rotate180(ActionMapping mapping, ActionForm form, HttpServletRequest request, HttpServletResponse response) throws Exception {
    Document doc = documentDao.getDocument(request.getParameter("document"));
    String docdownload = oscar.OscarProperties.getInstance().getProperty("DOCUMENT_DIR");
    String path = docdownload + doc.getDocfilename();

    FileInputStream input = new FileInputStream(path);
    try {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument pdf = parser.getPDDocument();
        try {
            int x = 1;
            for (Object p : pdf.getDocumentCatalog().getAllPages()) {
                PDPage pg = (PDPage) p;
                // A page without an explicit rotation entry is treated as 0.
                Integer current = pg.getRotation();
                int r = (current != null ? current : 0);
                pg.setRotation((r + 180) % 360);
                ManageDocumentAction.deleteCacheVersion(doc, x);
                x++;
            }
            pdf.save(path);
        } finally {
            // Previously skipped whenever rotation or saving threw.
            pdf.close();
        }
    } finally {
        input.close();
    }
    return null;
}
示例4: rotate90
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Rotates every page of the stored PDF by 90 degrees and saves it back to
 * the same file.
 *
 * <p>The document is looked up from the {@code document} request parameter
 * and the file is resolved under the {@code DOCUMENT_DIR} property. For each
 * page the cached version is deleted via
 * {@code ManageDocumentAction.deleteCacheVersion} using a 1-based page index.
 *
 * @return always {@code null}
 * @throws Exception if the file cannot be read, parsed or written
 */
public ActionForward rotate90(ActionMapping mapping, ActionForm form, HttpServletRequest request, HttpServletResponse response) throws Exception {
    Document doc = documentDao.getDocument(request.getParameter("document"));
    String docdownload = oscar.OscarProperties.getInstance().getProperty("DOCUMENT_DIR");
    String path = docdownload + doc.getDocfilename();

    FileInputStream input = new FileInputStream(path);
    try {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument pdf = parser.getPDDocument();
        try {
            int x = 1;
            for (Object p : pdf.getDocumentCatalog().getAllPages()) {
                PDPage pg = (PDPage) p;
                // A page without an explicit rotation entry is treated as 0.
                Integer current = pg.getRotation();
                int r = (current != null ? current : 0);
                pg.setRotation((r + 90) % 360);
                ManageDocumentAction.deleteCacheVersion(doc, x);
                x++;
            }
            pdf.save(path);
        } finally {
            // Previously skipped whenever rotation or saving threw.
            pdf.close();
        }
    } finally {
        input.close();
    }
    return null;
}
示例5: testPdfFromStringTo
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Renders an HTML string containing a non-ASCII character to PDF and checks
 * that the character survives in the extracted text.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN an HTML template containing a non-ASCII character ("ü")
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // parse() throws an IOException when the bytes are not a valid PDF
    parser.parse();

    // THEN the extracted text contains the original string
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // The parsed document was previously left open.
        document.close();
    }
}
示例6: testMultiplePages
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Builds a PDF from three HTML strings and one URL and checks that the text
 * of the last HTML page is present in the result.
 */
@Test
public void testMultiplePages() throws Exception {
    // GIVEN four pages: three inline HTML strings and one remote URL
    Pdf pdf = new Pdf();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 1</h1></html>", PageType.htmlAsString);
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 2</h1></html>", PageType.htmlAsString);
    pdf.addPage("http://www.google.com", PageType.url);
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Page 4</h1></html>", PageType.htmlAsString);

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // parse() throws an IOException when the bytes are not a valid PDF
    parser.parse();

    // THEN the text of the fourth page is present
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the fourth page name", pdfText, containsString("Page 4"));
    } finally {
        // The parsed document was previously left open.
        document.close();
    }
}
示例7: pdftoText
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Parses a PDF from the given stream and returns its text.
 *
 * @param is    stream supplying the PDF bytes; it is not closed here
 * @param stats when {@code true}, the extracted text is also passed to
 *              {@code vc.addAll}
 * @return the text extracted from the document
 * @throws IOException if parsing, extraction or closing fails
 */
public String pdftoText(InputStream is, boolean stats) throws IOException {
    COSDocument rawDocument = null;
    PDDocument document = null;
    try {
        final PDFParser pdfParser = new PDFParser(is);
        pdfParser.parse();
        rawDocument = pdfParser.getDocument();

        final PDFTextStripper stripper = new PDFTextStripper();
        document = new PDDocument(rawDocument);
        final String extracted = stripper.getText(document);

        if (!stats) {
            return extracted;
        }
        vc.addAll(extracted);
        return extracted;
    } finally {
        // Same release order as before: low-level document first, then the wrapper.
        if (null != rawDocument) {
            rawDocument.close();
        }
        if (null != document) {
            document.close();
        }
    }
}
示例8: testPdfFromStringTo
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Renders an HTML string containing a non-ASCII character to PDF, saves it
 * into a temporary folder, and checks that the character survives in the
 * text extracted from the generated bytes.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN an HTML template containing a non-ASCII character ("ü")
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>", PageType.htmlAsString);
    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder+"/output.pdf");

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    // Both the random-access source and the document are now closed; the
    // original left them open.
    try (RandomAccessBufferedFileInputStream source =
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes))) {
        PDFParser parser = new PDFParser(source);
        // parse() throws an IOException when the bytes are not a valid PDF
        parser.parse();

        // THEN the extracted text contains the original string
        try (PDDocument document = new PDDocument(parser.getDocument())) {
            String pdfText = new PDFTextStripper().getText(document);
            assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
        }
    }
}
示例9: testPdfWithXvfb
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Generates a PDF from a URL with an Xvfb-wrapped configuration and checks
 * that the extracted text contains the expected word.
 */
@Test
public void testPdfWithXvfb() throws Exception {
    // GIVEN a wrapper configured to run under xvfb
    XvfbConfig xc = new XvfbConfig();
    xc.addParams(new Param("--auto-servernum"), new Param("--server-num=1"));
    WrapperConfig wc = new WrapperConfig();
    wc.setXvfbConfig(xc);
    Pdf pdf = new Pdf(wc);
    pdf.addPage("http://www.google.com", PageType.url);
    pdf.saveAs("output.pdf");

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // parse() throws an IOException when the bytes are not a valid PDF
    parser.parse();

    // THEN the extracted text contains the page content
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should be generated", pdfText, containsString("Google"));
    } finally {
        // The parsed document was previously left open.
        document.close();
    }
}
示例10: readThesaurus
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Reads {@code thesaurus.pdf} from the given directory and returns the
 * substances collected by a {@code ThesaurusPDFStripper}.
 *
 * <p>Text extraction starts at page 2 and runs to the last page.
 *
 * @param dir directory containing {@code thesaurus.pdf}
 * @return the stripper's {@code substances} after processing the document
 * @throws IOException if the file cannot be opened, parsed or read
 */
private List<SubstanceInteraction> readThesaurus(File dir) throws IOException {
    File file = new File(dir, "thesaurus.pdf");
    // The source was previously never closed, and the document was only
    // closed when extraction succeeded.
    try (RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file)) {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
            ThesaurusPDFStripper pdfStripper = new ThesaurusPDFStripper();
            pdfStripper.setStartPage(2);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());
            // The returned text is ignored; the stripper collects its
            // results in its substances field.
            pdfStripper.getText(pdDoc);
            return pdfStripper.substances;
        }
    }
}
示例11: parse
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Runs an {@code AdvancedPDFStripper} over {@code thesaurus.pdf} from the
 * test classpath root (page 2 to the last page) and prints the collected
 * substances.
 */
@Test
public void parse() throws IOException {
    File file = new File(MedicamentTest.class.getClassLoader().getResource(".").getFile(), "thesaurus.pdf");
    // Neither the source nor the document was closed before.
    try (RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file)) {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
            AdvancedPDFStripper pdfStripper = new AdvancedPDFStripper();
            pdfStripper.setStartPage(2);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());
            // The returned text is ignored; the stripper collects its
            // results in its substances field.
            pdfStripper.getText(pdDoc);
            System.out.println(pdfStripper.substances);
        }
    }
}
示例12: testPdfFromStringTo
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Renders an HTML string containing a non-ASCII character to PDF and checks
 * that the character survives in the extracted text.
 */
@Test
public void testPdfFromStringTo() throws Exception {
    // GIVEN an HTML template containing a non-ASCII character ("ü")
    Pdf pdf = new Pdf();
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Müller</h1></html>");

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // parse() throws an IOException when the bytes are not a valid PDF
    parser.parse();

    // THEN the extracted text contains the original string
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the creditorName", pdfText, containsString("Müller"));
    } finally {
        // The parsed document was previously left open.
        document.close();
    }
}
示例13: testMultiplePages
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Builds a PDF from three HTML strings and one URL and checks that the text
 * of the last HTML page is present in the result.
 */
@Test
public void testMultiplePages() throws Exception {
    // GIVEN four pages: three inline HTML strings and one remote URL
    Pdf pdf = new Pdf();
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 1</h1></html>");
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 2</h1></html>");
    pdf.addPageFromUrl("http://www.google.com");
    pdf.addPageFromString("<html><head><meta charset=\"utf-8\"></head><h1>Page 4</h1></html>");

    // WHEN the PDF is generated and parsed back
    byte[] pdfBytes = pdf.getPDF();
    PDFParser parser = new PDFParser(new ByteArrayInputStream(pdfBytes));
    // parse() throws an IOException when the bytes are not a valid PDF
    parser.parse();

    // THEN the text of the fourth page is present
    PDDocument document = new PDDocument(parser.getDocument());
    try {
        String pdfText = new PDFTextStripper().getText(document);
        Assert.assertThat("document should contain the fourth page name", pdfText, containsString("Page 4"));
    } finally {
        // The parsed document was previously left open.
        document.close();
    }
}
示例14: getFile
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Downloads the PDF at the given URL and stores its extracted text in the
 * field {@code text}.
 *
 * @param url location of the PDF to fetch
 * @return {@code true} if the document was fetched, parsed and closed
 *         without error; {@code false} if any exception occurred (the stack
 *         trace is printed)
 */
private boolean getFile(String url)
{
    // try-with-resources closes the connection's stream on every path; the
    // original never closed it.
    try (InputStream in = new URL(url).openConnection().getInputStream())
    {
        PDFParser p = new PDFParser(in);
        p.parse();
        PDDocument pdoc = new PDDocument(p.getDocument());
        try
        {
            PDFTextStripper pts = new PDFTextStripper();
            text = pts.getText(pdoc);
        }
        finally
        {
            // Previously skipped when text extraction threw.
            pdoc.close();
        }
        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}
示例15: shrinkMe
import org.apache.pdfbox.pdfparser.PDFParser; //导入依赖的package包/类
/**
 * Shrink a PDF.
 *
 * <p>Parses the file referenced by the field {@code input} and passes the
 * resources of every page to {@code scanResources}. The method takes no
 * parameters; it works on instance state:
 * <ul>
 *   <li>{@code input} - the PDF to read;</li>
 *   <li>{@code compQual} - compression quality (per the original
 *       documentation, 0 is smallest file and 1 is highest quality); a
 *       negative value is replaced with {@code compQualDefault}.</li>
 * </ul>
 *
 * @return the processed {@code PDDocument}; it is returned open, so the
 *         caller is responsible for closing it
 * @throws FileNotFoundException if {@code input} cannot be opened
 * @throws IOException if the file cannot be parsed or a page cannot be
 *         processed
 */
private PDDocument shrinkMe()
        throws FileNotFoundException, IOException {
    if (compQual < 0) {
        compQual = compQualDefault;
    }

    final PDDocument doc;
    final FileInputStream fis = new FileInputStream(input);
    try {
        final PDFParser parser = new PDFParser(fis);
        parser.parse();
        doc = parser.getPDDocument();
    } finally {
        // Release the file handle explicitly, including when parsing fails.
        // NOTE(review): assumes parse() has consumed the whole stream, as
        // with the stream-based parser this code targets - confirm.
        fis.close();
    }

    boolean succeeded = false;
    try {
        for (Object p : doc.getDocumentCatalog().getAllPages()) {
            if (!(p instanceof PDPage)) {
                continue;
            }
            scanResources(((PDPage) p).getResources(), doc);
        }
        succeeded = true;
        return doc;
    } finally {
        // Do not leak the document when page processing fails; on success
        // ownership passes to the caller.
        if (!succeeded) {
            doc.close();
        }
    }
}