当前位置: 首页>>代码示例>>Java>>正文


Java COSDocument类代码示例

本文整理汇总了Java中org.apache.pdfbox.cos.COSDocument的典型用法代码示例。如果您正苦于以下问题：Java COSDocument类的具体用法？Java COSDocument怎么用？Java COSDocument使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


COSDocument类属于org.apache.pdfbox.cos包,在下文中一共展示了COSDocument类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: readPDFDocument

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Extracts the text of the PDF referenced by {@code f} and appends each line
 * of that text to {@code lines}.
 *
 * <p>Any failure is reported to the user through an error dialog and the
 * stack trace is printed; no exception escapes this method.
 */
private void readPDFDocument() {
	// try-with-resources releases the file handle even when parsing or extraction fails.
	try (FileInputStream fs = new FileInputStream(f)) {
		PDFParser parser = new PDFParser(fs);
		parser.parse();
		COSDocument cosDoc = parser.getDocument();
		PDDocument pdDoc = new PDDocument(cosDoc);
		try {
			PDFTextStripper pdfStripper = new PDFTextStripper();
			String text = pdfStripper.getText(pdDoc);
			for (String line : text.split(System.lineSeparator())) {
				lines.add(line);
			}
		} finally {
			// PDDocument.close() also closes the COSDocument it wraps.
			pdDoc.close();
		}
	} catch (Exception e) {
		JOptionPane.showMessageDialog(null, "Fehler in readPDFDocument",
				"Fehler", JOptionPane.ERROR_MESSAGE);
		e.printStackTrace();
	}
}
 
开发者ID:Steffen93,项目名称:filterit,代码行数:22,代码来源:FileObject.java

示例2: pdftoText

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Extracts the text of a single page of a PDF file.
 *
 * <p>The document is not decrypted before extraction.
 *
 * @param fileName path of the PDF file to read
 * @param pageno   page to extract; passed unchanged as both the start and
 *                 the end page of the text stripper
 * @return the text of the requested page
 * @throws IOException if the file cannot be read or parsed
 * @throws CryptographyException kept in the signature for existing callers
 */
static String pdftoText(String fileName,int pageno) throws IOException, CryptographyException 
{
	FileInputStream input = new FileInputStream(new File(fileName));
	try {
		PDFParser parser = new PDFParser(input);
		parser.parse();
		COSDocument cosDoc = parser.getDocument();
		PDDocument pdDoc = new PDDocument(cosDoc);
		try {
			PDFTextStripper pdfStripper = new PDFTextStripper();
			pdfStripper.setStartPage(pageno);
			pdfStripper.setEndPage(pageno);
			return pdfStripper.getText(pdDoc);
		} finally {
			// Closing the PDDocument also closes the wrapped COSDocument,
			// so a separate cosDoc.close() is not needed.
			pdDoc.close();
		}
	} finally {
		// The original never closed the file stream.
		input.close();
	}
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:26,代码来源:PDFTextParser.java

示例3: pdftoText

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Parses a PDF from the given stream and returns its full text.
 *
 * @param is    stream providing the PDF bytes
 * @param stats when {@code true}, the extracted text is additionally passed
 *              to {@code vc.addAll}
 * @return the text extracted from the document
 * @throws IOException if parsing, extraction or closing fails
 */
public  String pdftoText(InputStream is, boolean stats) throws IOException {
    COSDocument lowLevelDoc = null;
    PDDocument highLevelDoc = null;
    try {
        final PDFParser pdfParser = new PDFParser(is);
        pdfParser.parse();
        lowLevelDoc = pdfParser.getDocument();
        final PDFTextStripper stripper = new PDFTextStripper();
        highLevelDoc = new PDDocument(lowLevelDoc);
        final String extracted = stripper.getText(highLevelDoc);
        if (!stats) {
            return extracted;
        }
        vc.addAll(extracted);
        return extracted;
    } finally {
        // Close order is COS document first, then the PD wrapper.
        if (null != lowLevelDoc) {
            lowLevelDoc.close();
        }
        if (null != highLevelDoc) {
            highLevelDoc.close();
        }
    }
}
 
开发者ID:judovana,项目名称:JavadocOfflineSearch,代码行数:24,代码来源:PdfAttempter.java

示例4: readThesaurus

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Reads {@code thesaurus.pdf} from the given directory and returns the
 * substances collected by the {@code ThesaurusPDFStripper} while it
 * processes pages 2 through the last page.
 *
 * @param dir directory containing {@code thesaurus.pdf}
 * @return the {@code substances} gathered by the stripper
 * @throws IOException if the file cannot be read or parsed
 */
private List<SubstanceInteraction> readThesaurus(File dir) throws IOException {
    File file = new File(dir, "thesaurus.pdf");

    RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file);
    try {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        COSDocument cosDoc = parser.getDocument();
        PDDocument pdDoc = new PDDocument(cosDoc);
        try {
            ThesaurusPDFStripper pdfStripper = new ThesaurusPDFStripper();
            pdfStripper.setStartPage(2);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());

            // The return value is ignored: the stripper collects its results
            // in its 'substances' field while processing the text.
            pdfStripper.getText(pdDoc);

            return pdfStripper.substances;
        } finally {
            // Runs on failure too; closing the PDDocument closes the COSDocument.
            pdDoc.close();
        }
    } finally {
        // The document was built from the bare COSDocument, so it does not own
        // the file source; close it explicitly to release the file handle.
        source.close();
    }
}
 
开发者ID:Ellixo,项目名称:MedicamentDB,代码行数:18,代码来源:InteractionService.java

示例5: parse

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Parses {@code thesaurus.pdf} from the test class path root with an
 * {@code AdvancedPDFStripper} (pages 2 to the end) and prints the
 * substances it collected.
 *
 * @throws IOException if the file cannot be read or parsed
 */
@Test
public void parse() throws IOException {
    File file = new File(MedicamentTest.class.getClassLoader().getResource(".").getFile(), "thesaurus.pdf");

    RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file);
    try {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        COSDocument cosDoc = parser.getDocument();
        PDDocument pdDoc = new PDDocument(cosDoc);
        try {
            AdvancedPDFStripper pdfStripper = new AdvancedPDFStripper();
            pdfStripper.setStartPage(2);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());

            pdfStripper.getText(pdDoc);

            System.out.println(pdfStripper.substances);
        } finally {
            // The original never closed the document.
            pdDoc.close();
        }
    } finally {
        // Release the file handle held by the random-access source.
        source.close();
    }
}
 
开发者ID:Ellixo,项目名称:MedicamentDB,代码行数:17,代码来源:PDFTest.java

示例6: findpages

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Searches a PDF page by page and returns one {@code Container} for every
 * page whose text contains the keyword.
 *
 * @param path          path of the PDF file to search
 * @param searchKeyword text to look for (case-sensitive {@code contains})
 * @return the matching pages, each carrying the page text, the file path
 *         and the 1-based page number; empty when nothing matches
 * @throws IOException if the file cannot be read or parsed
 */
public List<Container> findpages(String path, String searchKeyword) throws IOException 
{
	List<Container> list = new ArrayList<Container>();

	FileInputStream input = new FileInputStream(new File(path));
	try {
		PDFParser parser = new PDFParser(input);
		parser.parse();

		COSDocument cosDoc = parser.getDocument();
		PDDocument doc = new PDDocument(cosDoc);
		try {
			PDFTextStripper reader = new PDFTextStripper();
			int pageCount = doc.getNumberOfPages();

			// PDFTextStripper page numbers are 1-based, so start at 1; the
			// original started at 0, which is not a real page.
			for (int pageNo = 1; pageNo <= pageCount; pageNo++)
			{
				reader.setStartPage(pageNo);
				reader.setEndPage(pageNo);

				// Extract once and reuse; extraction is the expensive step.
				String pageText = reader.getText(doc);
				if (pageText.contains(searchKeyword))
				{
					Container container = new Container();
					container.setContent(pageText);
					container.setFilepath(path);
					container.setPageno(pageNo);
					list.add(container);
				}
			}
		} finally {
			// Closing the PDDocument also closes the wrapped COSDocument.
			doc.close();
		}
	} finally {
		input.close();
	}

	return list;
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:36,代码来源:PageByPageSearch.java

示例7: findpages

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Searches a PDF page by page for pages containing every keyword
 * (case-insensitive), collects the matching pages into a new PDF and saves
 * it as {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"}.
 *
 * <p>A matching page is only recorded once the {@code falseCounter} field
 * exceeds 1; the counter is incremented on every match. That gating is kept
 * exactly as in the original.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page
 * @param fileCounter       number used in the name of the result PDF
 * @return one result per recorded page, with 1-based page numbers
 * @throws IOException if the file cannot be read, parsed or saved
 */
public List<SearchResult> findpages(String path,
        List<String> searchKeywordList, int fileCounter) throws IOException {

    List<SearchResult> list = new ArrayList<SearchResult>();
    List<PDPage> pageList = new ArrayList<PDPage>();

    PDDocument finalDocument = new PDDocument();
    PDDocument doc = null;
    try {
        PDFParser parser;
        FileInputStream input = new FileInputStream(new File(path));
        try {
            // RandomAccessBuffer copies the whole stream into memory when it is
            // constructed (PDFBox 2.x), so the file stream can be closed right away.
            parser = new PDFParser(new RandomAccessBuffer(input));
        } finally {
            input.close();
        }
        parser.parse();

        COSDocument cosDoc = parser.getDocument();
        PDFTextStripper reader = new PDFTextStripper();
        doc = new PDDocument(cosDoc);

        int pageCount = doc.getNumberOfPages();
        // The stripper counts pages from 1, PDDocument.getPage() from 0.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            reader.setStartPage(pageNumber);
            reader.setEndPage(pageNumber);

            // Extract and lower-case once per page rather than once per keyword.
            String pageText = reader.getText(doc);
            String loweredText = pageText.toLowerCase();

            boolean hasKeywords = true;
            for (String keyword : searchKeywordList) {
                if (!loweredText.contains(keyword.toLowerCase())) {
                    hasKeywords = false;
                    break;
                }
            }

            if (hasKeywords) {
                if (falseCounter > 1) {
                    SearchResult result = new PageResult();
                    result.setFileContent(pageText);
                    result.setFilePath(path);
                    result.setPageNumber(pageNumber);
                    list.add(result);
                    // Convert to the 0-based index; the original passed the
                    // 1-based number, selecting the following page and
                    // overrunning on the last one.
                    pageList.add(doc.getPage(pageNumber - 1));
                }

                falseCounter++;
            }
        }

        for (PDPage page : pageList) {
            finalDocument.addPage(page);
        }

        // The source document stays open until after the save because the
        // result document references its pages.
        finalDocument
                .save(ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf");
    } finally {
        try {
            finalDocument.close();
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }
    logger.info("Result Saved");

    return list;

}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:63,代码来源:PageByPageSearch.java

示例8: buildWordMap

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Counts how often each word occurs in the text of a PDF file.
 *
 * <p>Punctuation and digits are stripped and the text is lower-cased before
 * it is split on single spaces. An {@link IOException} is printed and
 * results in an empty (or partially filled) map rather than being thrown.
 *
 * @param fileName path of the PDF file to read
 * @return map from word to number of occurrences
 */
public static Map<String, Integer> buildWordMap(String fileName) 
{
	Map<String, Integer> wordMap = new HashMap<String,Integer>();
	try 
	{
		FileInputStream input = new FileInputStream(new File(fileName));
		try
		{
			PDFParser parser = new PDFParser(input);
			parser.parse();

			COSDocument cosDoc = parser.getDocument();
			PDDocument doc = new PDDocument(cosDoc);
			try
			{
				PDFTextStripper reader = new PDFTextStripper();
				StringBuilder sb = new StringBuilder();
				int pageCount = doc.getNumberOfPages();
				// Stripper pages are 1-based: iterate 1..pageCount. The original
				// 0..pageCount-1 range never read the last page.
				for (int pageNo = 1; pageNo <= pageCount; pageNo++)
				{
					reader.setStartPage(pageNo);
					reader.setEndPage(pageNo);
					sb.append(reader.getText(doc));
				}

				String processedtext = sb.toString().replaceAll("\\p{Punct}|\\d", "").toLowerCase();
				for (String word : processedtext.split(" "))
				{
					Integer seen = wordMap.get(word);
					wordMap.put(word, seen == null ? 1 : seen + 1);
				}
			}
			finally
			{
				// Closing the PDDocument also closes the wrapped COSDocument.
				doc.close();
			}
		}
		finally
		{
			input.close();
		}
	} 
	catch (IOException e)
	{
		e.printStackTrace();
	}
	return wordMap;
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:45,代码来源:WordCount.java

示例9: getDocument

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * {@inheritDoc}
 *
 * <p>Delegates to the wrapped {@code document}.
 */
public final COSDocument getDocument() {
	final COSDocument underlying = document.getDocument();
	return underlying;
}
 
开发者ID:juliusHuelsmann,项目名称:paint,代码行数:7,代码来源:XDocument.java

示例10: parse

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Wraps the given COS document in a {@code PDDocument} and hands it to the
 * {@code PDDocument} overload of {@code parse}.
 *
 * @param doc the low-level document to parse
 * @throws IOException if the delegated parse fails
 */
void parse(COSDocument doc) throws IOException
{
    final PDDocument wrapped = new PDDocument(doc);
    parse(wrapped);
}
 
开发者ID:nemausus,项目名称:research-paper-parser,代码行数:5,代码来源:PDFParser.java

示例11: findpages

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Searches a PDF page by page for pages containing every keyword
 * (case-insensitive) and, when the {@code validResult} field is set, saves
 * the matching pages as
 * {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"}.
 *
 * <p>{@code falseCounter} is incremented for every matching page.
 * {@code validResult} is set as soon as at least one page was collected and
 * reset to {@code false} after saving.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page
 * @param fileCounter       number used in the name of the result PDF
 * @return one result per matching page, with 1-based page numbers
 * @throws IOException if the file cannot be read, parsed or saved
 */
public List<SearchResult> findpages(String path,
		List<String> searchKeywordList, int fileCounter) throws IOException {

	List<SearchResult> list = new ArrayList<SearchResult>();
	List<PDPage> pageList = new ArrayList<PDPage>();

	PDDocument finalDocument = new PDDocument();
	PDDocument doc = null;
	try {
		PDFParser parser;
		FileInputStream input = new FileInputStream(new File(path));
		try {
			// RandomAccessBuffer copies the whole stream into memory when it is
			// constructed (PDFBox 2.x), so the file stream can be closed right away.
			parser = new PDFParser(new RandomAccessBuffer(input));
		} finally {
			input.close();
		}
		parser.parse();

		COSDocument cosDoc = parser.getDocument();
		PDFTextStripper reader = new PDFTextStripper();
		doc = new PDDocument(cosDoc);

		int pageCount = doc.getNumberOfPages();
		for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
			// The stripper counts pages from 1 while getPage() is 0-based.
			// The original used one value for both, so the exported page was
			// the one after the matched text and the last page was never
			// searched.
			int pageNumber = pageIndex + 1;
			reader.setStartPage(pageNumber);
			reader.setEndPage(pageNumber);

			// Extract and lower-case once per page rather than once per keyword.
			String pageText = reader.getText(doc);
			String loweredText = pageText.toLowerCase();

			boolean hasKeywords = true;
			for (String keyword : searchKeywordList) {
				if (!loweredText.contains(keyword.toLowerCase())) {
					hasKeywords = false;
					break;
				}
			}

			if (hasKeywords) {
				SearchResult result = new PageResult();
				result.setFileContent(pageText);
				result.setFilePath(path);
				result.setPageNumber(pageNumber);
				list.add(result);
				pageList.add(doc.getPage(pageIndex));

				falseCounter++;
			}
		}

		for (PDPage page : pageList) {
			finalDocument.addPage(page);
			validResult = true;
		}

		if (validResult) {
			// The source document stays open until after the save because the
			// result document references its pages.
			finalDocument.save(ConfigCBSI.getResultPdfPath() + fileCounter
					+ ".pdf");
		}
	} finally {
		// Close both documents on every path; the original leaked
		// finalDocument when nothing was saved and never closed the source.
		try {
			finalDocument.close();
		} finally {
			if (doc != null) {
				doc.close();
			}
		}
	}

	if (validResult) {
		logger.info("Result Saved");
		validResult = false;
	}

	return list;

}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:71,代码来源:PageByPageSearch.java

示例12: readInteractions

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Reads {@code interactions.pdf} from the given directory, lets the
 * {@code InteractionPDFStripper} collect interactions from page 2 onwards,
 * then resolves the second family of every interaction.
 *
 * <p>For each interaction the normalized {@code famille2} is looked up among
 * the parsed interactions. When found, {@code id2} is set to the id of the
 * first entry of that family. When not found, a warning is logged and a
 * mirrored placeholder interaction is created under that family name; all
 * placeholders are added to the returned map at the end.
 *
 * @param dir directory containing {@code interactions.pdf}
 * @return the stripper's interaction map, extended with the placeholders
 * @throws IOException if the file cannot be read or parsed
 */
private Map<String, List<Interaction>> readInteractions(File dir) throws IOException {
    File file = new File(dir, "interactions.pdf");

    InteractionPDFStripper pdfStripper = new InteractionPDFStripper();

    RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file);
    try {
        PDFParser parser = new PDFParser(source);
        parser.parse();
        COSDocument cosDoc = parser.getDocument();
        PDDocument pdDoc = new PDDocument(cosDoc);
        try {
            pdfStripper.setStartPage(2);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());

            // The return value is ignored: the stripper fills its
            // 'interactions' field while processing the text.
            pdfStripper.getText(pdDoc);
        } finally {
            // Runs on failure too; closing the PDDocument closes the COSDocument.
            pdDoc.close();
        }
    } finally {
        // The document was built from the bare COSDocument, so it does not own
        // the file source; close it explicitly to release the file handle.
        source.close();
    }

    Map<String, List<Interaction>> interactions = pdfStripper.interactions;
    // Placeholders are collected separately so the map is not modified while
    // its values are being iterated.
    Map<String, List<Interaction>> newInteractions = new HashMap<>();

    for (List<Interaction> tmp : interactions.values()) {
        for (Interaction interaction : tmp) {
            String famille2 = normalize(interaction.getFamille2(), true);

            // special cases
            switch (famille2) {
                case "medicaments hyponatremiants":
                    famille2 = "hyponatremiants";
                    break;
            }

            List<Interaction> interactions2 = interactions.get(famille2);
            if (interactions2 == null) {
                LOG.warn("interaction " + interaction.getFamille2() + " inconnu");

                List<Interaction> placeholders = newInteractions.get(famille2);
                Interaction newInteraction;
                if (placeholders == null) {
                    placeholders = new ArrayList<>();
                    newInteractions.put(famille2, placeholders);

                    newInteraction = pdfStripper.createEmptyInteraction(-1, interaction.getFamille2());
                } else {
                    // Reuse the id already given to this unknown family.
                    newInteraction = pdfStripper.createEmptyInteraction(Integer.parseInt(placeholders.get(0).getId1()), interaction.getFamille2());
                }

                newInteraction.setId2(interaction.getId1());
                newInteraction.setFamille2(interaction.getFamille1());
                newInteraction.setDescription(interaction.getDescription());
                newInteraction.setConseil(interaction.getConseil());

                placeholders.add(newInteraction);

                interaction.setId2(newInteraction.getId1());
            } else {
                interaction.setId2(interactions2.get(0).getId1());
            }
        }
    }

    interactions.putAll(newInteractions);

    return interactions;

}
 
开发者ID:Ellixo,项目名称:MedicamentDB,代码行数:65,代码来源:InteractionService.java

示例13: getText

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Extracts the text of a low-level COS document by wrapping it in a
 * {@link PDDocument} and delegating to the {@code PDDocument} overload.
 *
 * @deprecated
 * @see PDFTextStripper#getText( PDDocument )
 * @param doc The document to extract the text from.
 * @return The document text.
 * @throws IOException If there is an error extracting the text.
 */
public String getText( COSDocument doc ) throws IOException
{
    final PDDocument wrapped = new PDDocument( doc );
    return getText( wrapped );
}
 
开发者ID:hemangandhi,项目名称:my-cv-site,代码行数:12,代码来源:FormattedReader.java

示例14: writeText

import org.apache.pdfbox.cos.COSDocument; //导入依赖的package包/类
/**
 * Writes the text of a low-level COS document to the given writer by
 * wrapping it in a {@link PDDocument} and delegating to the
 * {@code PDDocument} overload.
 *
 * @deprecated
 * @see PDFTextStripper#writeText( PDDocument, Writer )
 * @param doc The document to extract the text.
 * @param outputStream The stream to write the text to.
 * @throws IOException If there is an error extracting the text.
 */
public void writeText( COSDocument doc, Writer outputStream ) throws IOException
{
    final PDDocument wrapped = new PDDocument( doc );
    writeText( wrapped, outputStream );
}
 
开发者ID:hemangandhi,项目名称:my-cv-site,代码行数:12,代码来源:FormattedReader.java


注:本文中的org.apache.pdfbox.cos.COSDocument类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。