本文整理汇总了Java中org.pdfbox.pdmodel.PDDocument类的典型用法代码示例。如果您正苦于以下问题:Java PDDocument类的具体用法?Java PDDocument怎么用?Java PDDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
PDDocument类属于org.pdfbox.pdmodel包,在下文中一共展示了PDDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getWordsToHighlight
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Runs an NRC_PDFHighlighter over this object's document and returns the
 * words the highlighter reports afterwards.
 *
 * @param highlightWords the words handed to the highlighter as search terms.
 * @return the array returned by the highlighter's getWordsToHighlight(), or
 *         null when an IOException was caught (its stack trace is printed).
 */
public String[] getWordsToHighlight(String[] highlightWords) {
    try {
        NRC_PDFHighlighter highlighter = new NRC_PDFHighlighter();
        PDDocument pdf = new PDDocument(document);
        // The generated XML is written to an in-memory sink that is never read here;
        // only the word list kept by the highlighter is returned.
        CharArrayWriter xmlSink = new CharArrayWriter();
        highlighter.generateXMLHighlight(pdf, highlightWords, xmlSink);
        return highlighter.getWordsToHighlight();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
示例2: generateXMLHighlight
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Generate an XML highlight document based on the PDF.
 *
 * The header is written first, then the PDF text is processed through
 * writeText, and finally the closing tags are written and the output flushed.
 *
 * @param pdDocument The PDF to find words in.
 * @param sWords The words to search for.
 * @param xmlOutput The writer that receives the resulting XML.
 *
 * @throws IOException If there is an error reading from the PDF, or writing to the XML.
 */
public void generateXMLHighlight(PDDocument pdDocument, String[] sWords, Writer xmlOutput ) throws IOException
{
    final String newline = System.getProperty("line.separator");

    // Store the per-run state in fields before the text pass starts.
    highlighterOutput = xmlOutput;
    searchedWords = sWords;
    foundWords = new Vector(); // initialization - vector filled in endPage()

    // color and mode are not implemented by the highlight spec, so the
    // " color=#..." and " mode=active " attributes are left out for now.
    String header = "<XML>" + newline + "<Body units=characters " +
        " version=2>" + newline + "<Highlight>";
    highlighterOutput.write(header);
    highlighterOutput.write(newline);

    // The extracted text goes to an in-memory UTF-16 writer kept in fields.
    textOS = new ByteArrayOutputStream();
    textWriter = new OutputStreamWriter( textOS, "UTF-16" );
    writeText(pdDocument, textWriter);

    String footer = "</Highlight>" + newline + "</Body>" + newline + "</XML>";
    highlighterOutput.write(footer);
    highlighterOutput.flush();
}
示例3: getPDFdocument
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Loads a PDF document from a stream, reporting a load failure instead of
 * propagating it.
 *
 * @param inputStream the stream passed to PDDocument.load.
 * @param contentHandler handler passed to addErrorTagToOutput when loading fails.
 * @return the loaded document, or null if loading threw an IOException.
 */
private PDDocument getPDFdocument(InputStream inputStream, ContentHandler contentHandler) {
    try {
        // We get the document from the input stream
        return PDDocument.load(inputStream);
    } catch (IOException loadFailure) {
        logger.error("PDFParser(InputStream)", loadFailure);
        // Write the failure into the output document, so we have a
        // chance to see what went wrong
        addErrorTagToOutput(contentHandler, loadFailure.toString());
        return null;
    }
}
示例4: textContentOf
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Extracts the text of a PDF held in memory.
 *
 * @param pdfData the raw bytes of the PDF.
 * @return the text produced by PDFTextStripper for the loaded document.
 * @throws IOException if loading, text extraction or closing the document fails.
 */
private static String textContentOf(byte[] pdfData) throws IOException {
    ByteArrayInputStream pdfBytes = new ByteArrayInputStream(pdfData);
    PDDocument loaded = PDDocument.load(pdfBytes);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(loaded);
    } finally {
        // Always release the document, even when extraction throws.
        loaded.close();
    }
}
示例5: getHighlightPositions
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Writes the XML highlight output for this object's document into a file and
 * returns the writer that was used.
 *
 * @param highlightWord the word passed to the highlighter.
 * @param filePath the file the UTF-8 XML output is written to.
 * @return the writer opened on filePath; it is not closed by this method.
 *         Null if an IOException occurred before the writer was created.
 *         If highlighting fails after that point, the already opened writer
 *         is still returned.
 */
public OutputStreamWriter getHighlightPositions(String highlightWord, File filePath) {
    OutputStreamWriter writer = null;
    try {
        NRC_PDFHighlighter highlighter = new NRC_PDFHighlighter();
        PDDocument pdf = new PDDocument(document);
        FileOutputStream fileOut = new FileOutputStream(filePath);
        writer = new OutputStreamWriter(fileOut, "UTF-8");
        highlighter.generateXMLHighlight(pdf, highlightWord, writer);
    } catch (IOException e) {
        e.printStackTrace();
    }
    // NOTE(review): the caller presumably closes the returned writer - confirm.
    return writer;
}
示例6: main
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Command line application.
 *
 * The first argument is the PDF to load; all remaining arguments are the
 * strings to highlight. The XML is written to standard output.
 *
 * @param args The command line arguments to the application.
 *
 * @throws IOException If there is an error generating the highlight file.
 */
public static void main(String[] args) throws IOException
{
    NRC_PDFHighlighter xmlExtractor = new NRC_PDFHighlighter();
    PDDocument doc = null;
    try
    {
        if( args.length < 2 )
        {
            // NOTE(review): execution continues below unless usage() terminates the JVM - confirm.
            usage();
        }

        // Everything after the file name is a word to highlight.
        int wordCount = args.length - 1;
        String[] highlightStrings = new String[ wordCount ];
        System.arraycopy( args, 1, highlightStrings, 0, wordCount );

        doc = PDDocument.load( args[0] );
        OutputStreamWriter stdout = new OutputStreamWriter( System.out );
        xmlExtractor.generateXMLHighlight( doc, highlightStrings, stdout );
    }
    finally
    {
        // Close the document whether or not highlighting succeeded.
        if( doc != null )
        {
            doc.close();
        }
    }
}
示例7: getText
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * This will return the text of a document. See writeText. <br />
 * NOTE: The document must not be encrypted when coming into this method.
 *
 * The text is collected in an in-memory writer; the list that is also handed
 * to writeText is discarded afterwards.
 *
 * @param doc The document to get the text from.
 *
 * @return The text of the PDF document.
 *
 * @throws IOException if the doc state is invalid or it is encrypted.
 */
public String getText (PDDocument doc) throws IOException
{
    StringWriter buffer = new StringWriter();
    List ft = new ArrayList();
    writeText( doc, buffer, ft );
    return buffer.toString();
}
示例8: startDocument
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * This method is available for subclasses of this class. It will be called before processing
 * of the document start.
 *
 * It guesses the title from the characters-by-article collection, writes the
 * header, and resets the page counter to zero.
 *
 * @param pdf The PDF document that is being processed.
 * @throws IOException If an IO error occurs.
 */
protected void startDocument(PDDocument pdf) throws IOException
{
    guessTitle( getCharactersByArticle().iterator() );
    writeHeader();
    pageNumber = 0;
}
示例9: endDocument
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Writes the closing body and xmlstream tags to the output and flushes it.
 *
 * @param pdf The PDF document that was processed (not used here).
 * @throws IOException If writing to or flushing the output fails.
 * @see PDFTextStripper#endDocument( PDDocument )
 */
public void endDocument(PDDocument pdf) throws IOException
{
    final String bodyClose = "</body>";
    final String streamClose = "</xmlstream>";

    output.write(bodyClose);
    output.write(streamClose);
    output.flush();
}
示例10: toPDDocument
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Loads this PDF as a PDDocument and decrypts it when a password is set.
 *
 * The source is chosen in this order: the in-memory byte array if present,
 * the resource as a File when it is a FileResource, otherwise the bytes read
 * from the resource through IOUtil.toBytes.
 *
 * @return the loaded (and, if a password is set, decrypted) document.
 * @throws CryptographyException if decryption fails.
 * @throws InvalidPasswordException if the password is rejected.
 * @throws IOException if the document cannot be read.
 */
public PDDocument toPDDocument() throws CryptographyException, InvalidPasswordException, IOException {
    PDDocument doc;
    if (barr != null) {
        doc = PDDocument.load(new ByteArrayInputStream(barr, 0, barr.length));
    } else if (resource instanceof FileResource) {
        doc = PDDocument.load((File) resource);
    } else {
        // barr is null on this path, so the length must come from the bytes
        // just read; using barr.length here threw a NullPointerException.
        byte[] resourceBytes = IOUtil.toBytes(resource);
        doc = PDDocument.load(new ByteArrayInputStream(resourceBytes, 0, resourceBytes.length));
    }
    if (password != null) {
        doc.decrypt(password);
    }
    return doc;
}
示例11: extractText
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Extracts the content of a PDF as HTML text using PDFText2HTML.
 *
 * @param doc the PDF wrapper the PDDocument is obtained from.
 * @param pageNumbers currently unused; the whole document is always processed.
 * @return the text written by the stripper, as a String.
 * @throws IOException if reading the document, writing the text or closing the document fails.
 * @throws CryptographyException if the document cannot be decrypted.
 * @throws InvalidPasswordException if the document password is rejected.
 */
public static Object extractText(PDFDocument doc, Set<Integer> pageNumbers) throws IOException, CryptographyException, InvalidPasswordException {
    // NOTE(review): assumes toPDDocument() returns a freshly loaded document
    // owned by this method, so it is safe to close it here - confirm.
    PDDocument pdDoc = doc.toPDDocument();
    try {
        PDFText2HTML stripper = new PDFText2HTML();
        StringWriter writer = new StringWriter();
        stripper.writeText(pdDoc, writer);
        return writer.toString();
    } finally {
        // Release the document; it was previously left open on every call.
        pdDoc.close();
    }
}
示例12: getDocumentInformation
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Reads the document information of a PDF, reporting any failure instead of
 * propagating it.
 *
 * @param doc the document to query.
 * @param contentHandler handler passed to addErrorTagToOutput when the lookup fails.
 * @return the document information, or null if any exception was thrown.
 */
private PDDocumentInformation getDocumentInformation(PDDocument doc, ContentHandler contentHandler) {
    try {
        return doc.getDocumentInformation();
    } catch (Exception failure) {
        logger.error(failure);
        addErrorTagToOutput(contentHandler, failure.toString());
        return null;
    }
}
示例13: addPageCountAttribute
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Adds the page count of the document as an attribute, if it is positive.
 *
 * When the page count cannot be read the error is logged and no attribute is
 * added.
 *
 * @param atts the attribute list that receives the ATT_PAGES attribute.
 * @param doc the document whose pages are counted.
 */
private void addPageCountAttribute(AttributesImpl atts, PDDocument doc) {
    int pages; // The number of pages in this document
    try {
        pages = doc.getPageCount();
    } catch (IOException e) {
        logger.error(e);
        return;
    }
    if (pages <= 0) {
        return;
    }
    atts.addAttribute("", ATT_PAGES, ATT_PAGES, ATT_CDATA, String.valueOf(pages));
}
示例14: endDocument
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Writes the closing body and html tags to the output.
 *
 * @param pdf The PDF document that was processed (not used here).
 * @throws IOException If writing to the output fails.
 * @see PDFTextStripper#endDocument( PDDocument )
 */
public void endDocument(PDDocument pdf) throws IOException
{
    final String closingTags = "</body></html>";
    output.write(closingTags);
}
示例15: getText
import org.pdfbox.pdmodel.PDDocument; //导入依赖的package包/类
/**
 * Runs writeText over a document and returns the contents of the list that
 * was handed to it. See writeText. <br />
 * NOTE: The document must not be encrypted when coming into this method.
 *
 * The text written by writeText goes to an in-memory writer that is discarded;
 * only the list contents are returned.
 *
 * @param doc The document to process.
 *
 * @return The elements of the list passed to writeText, as an Object[][]
 *         (presumably filled in by writeText - confirm against its implementation).
 *
 * @throws IOException if the doc state is invalid or it is encrypted.
 */
public Object[][] getText( PDDocument doc ) throws IOException
{
    StringWriter discardedText = new StringWriter();
    List ft = new ArrayList();
    writeText( doc, discardedText, ft );

    Object[][] rows = (Object[][]) ft.toArray( new Object[0][] );
    return rows;
}