当前位置: 首页>>代码示例>>Java>>正文


Java PDFTextStripper.setStartPage方法代码示例

本文整理汇总了 Java 中 org.apache.pdfbox.text.PDFTextStripper.setStartPage 方法的典型用法代码示例。如果您想了解 PDFTextStripper.setStartPage 方法的具体用法、调用方式或使用示例,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 org.apache.pdfbox.text.PDFTextStripper 的用法示例。


在下文中一共展示了PDFTextStripper.setStartPage方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: process

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Runs a {@code TitleExtractor} over the first page of the PDF file named by
 * {@code getFileNamePathWithExtension()} and hands the extracted title to
 * {@code setTitle}.
 *
 * @return {@code true} when the extraction ran to completion; any failure
 *         surfaces as an exception instead of a {@code false} return
 * @throws IOException if the document cannot be loaded or processed
 */
private boolean process() throws IOException {
    TitleExtractor stripper = new TitleExtractor();
    PDDocument document = null;

    try {
        document = PDDocument.load(new File(this.getFileNamePathWithExtension()));

        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1 (not 0).
        stripper.setStartPage(1);
        stripper.setEndPage(1);

        // The written text is discarded; writeText is invoked only to drive the
        // extractor (presumably it collects the title while processing - confirm
        // against TitleExtractor).
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);

        setTitle(stripper.getTitle());
        return true;
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
 
开发者ID:malikalamgirian,项目名称:PDF2RDF,代码行数:27,代码来源:TitleExtractor.java

示例2: process

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Runs an {@code AuthorExtractor} over the first page of the PDF file named by
 * {@code getFileNamePathWithExtension()} and copies the extracted author names,
 * affiliations and contacts into this object through the corresponding setters.
 *
 * @return {@code true} when the extraction ran to completion; any failure
 *         surfaces as an exception instead of a {@code false} return
 * @throws IOException if the document cannot be loaded or processed
 */
private boolean process() throws IOException {
    AuthorExtractor stripper = new AuthorExtractor();
    PDDocument document = null;

    try {
        document = PDDocument.load(new File(this.getFileNamePathWithExtension()));

        stripper.setSortByPosition(true);
        // PDFTextStripper page numbers are 1-based, so the first page is 1 (not 0).
        stripper.setStartPage(1);
        stripper.setEndPage(1);

        // The written text is discarded; writeText is invoked only to drive the
        // extractor (presumably it collects the author data while processing -
        // confirm against AuthorExtractor).
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);

        setAuthorNames(stripper.getAuthorNames());
        setAuthorAffiliations(stripper.getAuthorAffiliations());
        setAuthorContacts(stripper.getAuthorContacts());
        return true;
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
 
开发者ID:malikalamgirian,项目名称:PDF2RDF,代码行数:29,代码来源:AuthorExtractor.java

示例3: extractFromPDF

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Reads the PDF file and extracts the content and table of contents to text files.
 * <p>
 * The main contents (from page 15 onwards) are written to {@code MCV_TXT}; the table of
 * contents (pages 3 to 14) is written to {@code MCV_TXT_TOC}.
 *
 * @throws IOException if the PDF cannot be read or one of the text files cannot be written
 */
public static void extractFromPDF() throws IOException {

	// try-with-resources guarantees the document is closed even when extraction or writing fails
	try (PDDocument document = PDDocument.load(new File(MCV_PDF))) {
		PDFTextStripper stripper = new PDFTextStripper();

		// Extract the main contents (no end page has been set yet, so the stripper's default applies)
		stripper.setStartPage(15);
		String text = stripper.getText(document);
		// NOTE(review): getBytes() uses the platform default charset - confirm this is intended
		Files.write(Paths.get(MCV_TXT), text.getBytes());

		// Extract the table of contents
		stripper.setStartPage(3);
		stripper.setEndPage(14);
		text = stripper.getText(document);
		Files.write(Paths.get(MCV_TXT_TOC), text.getBytes());
	}
}
 
开发者ID:FranckCo,项目名称:Stamina,代码行数:22,代码来源:MCVReader.java

示例4: findSubwords

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Collects every occurrence of {@code searchTerm} in the text chunks that the
 * stripper emits for a single page. Overlapping occurrences are all reported
 * because the search resumes one character after each hit.
 *
 * @param document   the document to search
 * @param page       the page number handed to the stripper as both start and end
 *                   page (PDFTextStripper page numbers are 1-based)
 * @param searchTerm the text to look for; a {@code null} or empty term yields no hits
 * @return the matching sub-sequences, in the order they were encountered
 * @throws IOException if the text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();

    // An empty term matches at every index, including the end of the string, so the
    // search loop below would never terminate. Treat "nothing to search for" as no hits.
    if (searchTerm == null || searchTerm.isEmpty())
    {
        return hits;
    }

    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            int fromIndex = 0;
            int index;
            while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
            {
                hits.add(word.subSequence(index, index + searchTerm.length()));
                // Advance by one (not by the term length) so overlapping hits are found too.
                fromIndex = index + 1;
            }
            super.writeString(text, textPositions);
        }
    };

    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; getText drives the writeString callback above.
    stripper.getText(document);
    return hits;
}
 
开发者ID:mkl-public,项目名称:testarea-pdfbox2,代码行数:31,代码来源:SearchSubword.java

示例5: extractFromPDF

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Reads the PDF file and extracts the SIMS contents to the internal sorted map.
 * <p>
 * The text of page {@code SIMS_PAGE} is split into lines. The first {@code SIMS_SKIP}
 * lines and all blank lines are ignored. A line starting with {@code "S."} opens a new
 * item; any other line is a continuation of the current item. For each item, the text
 * before the first space is the map key and the remainder is the value.
 *
 * @throws IOException if the PDF cannot be read
 */
public void extractFromPDF() throws IOException {

	// Reinitialize the sorted map
	entryMap = new TreeMap<String, String>();

	// Extract the SIMS contents as raw text; the document is closed even if extraction fails
	logger.debug("Starting PDF extraction from " + SIMS_PDF);
	String text;
	try (PDDocument document = PDDocument.load(new File(SIMS_PDF))) {
		PDFTextStripper stripper = new PDFTextStripper();
		stripper.setStartPage(SIMS_PAGE);
		stripper.setEndPage(SIMS_PAGE);
		text = stripper.getText(document);
	}

	// Accept both "\r\n" and "\n" so the parsing does not depend on the platform line separator
	String[] lines = text.split("\\r?\\n");
	int lineIndex = 0;
	StringBuilder currentLine = new StringBuilder();
	for (String line : lines) {
		String trimmedLine = line.trim();
		if ((lineIndex++ < SIMS_SKIP) || (trimmedLine.length() == 0)) continue; // First lines are titles
		// If line does not start with 'S.', it is a continuation of the previous line
		if (!trimmedLine.startsWith("S.")) {
			// A continuation met before any item has started has nothing to attach to: ignore it
			if (currentLine.length() > 0) currentLine.append(" ").append(trimmedLine);
			continue;
		}
		// If line starts with 'S.', it is a new item. Store previous item if there is one.
		storeSimsEntry(currentLine);
		currentLine = new StringBuilder(trimmedLine);
	}
	// The last item is not followed by another 'S.' line, so it must be stored explicitly
	storeSimsEntry(currentLine);
	logger.debug("End of PDF extraction");
}

/**
 * Stores one accumulated item in the entry map: the code is everything before the
 * first space, the label is everything after it. An item without any space is
 * stored with an empty label; an empty item is ignored.
 *
 * @param entry the accumulated text of one item (may be empty)
 */
private void storeSimsEntry(StringBuilder entry) {
	if (entry.length() == 0) return;
	int codeEnd = entry.indexOf(" ");
	if (codeEnd < 0) {
		entryMap.put(entry.toString(), "");
	} else {
		entryMap.put(entry.substring(0, codeEnd), entry.substring(codeEnd + 1));
	}
}
 
开发者ID:FranckCo,项目名称:Stamina,代码行数:39,代码来源:SIMSSimpleModelMaker.java

示例6: findpages

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Searches a PDF page by page for pages that contain every given keyword
 * (case-insensitive) and saves the collected pages to a new PDF named
 * {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"}.
 * <p>
 * A matching page is only collected when the {@code falseCounter} field is
 * greater than 1; the counter is incremented for every matching page.
 * NOTE(review): the intent of that threshold is not visible here - confirm.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page for it to match
 * @param fileCounter       number used to build the result file name
 * @return one result per collected page, carrying the page text, the file path
 *         and the 1-based page number
 * @throws IOException if the PDF cannot be read or the result cannot be saved
 */
public List<SearchResult> findpages(String path,
        List<String> searchKeywordList, int fileCounter) throws IOException {

    List<SearchResult> list = new ArrayList<SearchResult>();
    List<PDPage> pageList = new ArrayList<PDPage>();
    PDFTextStripper reader = new PDFTextStripper();

    PDDocument doc = PDDocument.load(new File(path));
    try {
        PDDocument finalDocument = new PDDocument();
        try {
            int pageCount = doc.getNumberOfPages();

            // PDFTextStripper page numbers are 1-based: iterate 1..pageCount inclusive.
            for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
                reader.setStartPage(pageNo);
                reader.setEndPage(pageNo);

                // Extract the page text once instead of once per keyword.
                String pageText = reader.getText(doc);
                String lowerCaseText = pageText.toLowerCase();

                boolean hasKeywords = true;
                for (String keyword : searchKeywordList) {
                    if (!lowerCaseText.contains(keyword.toLowerCase())) {
                        hasKeywords = false;
                        break;
                    }
                }

                if (hasKeywords) {
                    if (falseCounter > 1) {
                        SearchResult result = new PageResult();
                        result.setFileContent(pageText);
                        result.setFilePath(path);
                        result.setPageNumber(pageNo);
                        list.add(result);
                        // PDDocument.getPage takes a 0-based index, unlike the stripper.
                        pageList.add(doc.getPage(pageNo - 1));
                    }

                    falseCounter++;
                }
            }

            for (PDPage page : pageList) {
                finalDocument.addPage(page);
            }

            // The pages still belong to the source document, so it must stay open
            // until the result has been saved.
            finalDocument.save(ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf");
        } finally {
            finalDocument.close();
        }
    } finally {
        doc.close();
    }
    logger.info("Result Saved");

    return list;
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:63,代码来源:PageByPageSearch.java

示例7: getNotesPDF

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Extracts the explanatory notes from the official PDF publication of the UK SIC classification.
 * 
 * <i>Note:</i> This method is unfinished: the lines of each note are accumulated in a local
 * list that is not stored anywhere yet; titles are only printed to standard output.
 * 
 * @throws IOException if the PDF cannot be read
 */
@SuppressWarnings("unused")
private void getNotesPDF() throws IOException {

	// Extract the raw text from page 59 onwards; the document is closed even if extraction fails
	String rawText;
	try (PDDocument document = PDDocument.load(new File(LOCAL_FOLDER + SIC_NOTES_FILE))) {
		PDFTextStripper stripper = new PDFTextStripper();
		stripper.setStartPage(59);
		rawText = stripper.getText(document);
	}

	// Read the string containing the raw text line by line and try to make sense of it
	String noteLine = null;
	List<String> currentNote = null;
	int lineNumber = 0;
	boolean ignore = true; // Lines 1 and 2 must be ignored
	try (BufferedReader noteReader = new BufferedReader(new StringReader(rawText))) {
		while ((noteLine = noteReader.readLine()) != null) {
			lineNumber++;
			// Ignore the lines that correspond to the headers of the PDF pages
			// The pattern for the first page is irregular, so we explicitly eliminate the line containing the section letter by its number
			if (lineNumber == 52) ignore = true;
			// For all other pages, the pattern is - line beginning or ending with 'Explanatory Notes', then line with page number, then line with section letter
			if ((noteLine.startsWith("Explanatory Notes")) || (noteLine.endsWith("Explanatory Notes"))) ignore = true; // Checked: we don't eliminate note lines with this test

			if (ignore) {
				// A one-character line is taken to be the section letter that ends a page header
				if ((lineNumber == 2) || (lineNumber == 52) || (noteLine.length() == 1)) ignore = false; // Resume reading after current line
				continue;
			}

			// Find the lines that are item titles
			String code = null;
			if (noteLine.startsWith("Section")) {
				System.out.println(noteLine);
				code = noteLine.substring(8, 9); // Checked: first test identifies exactly the section lines
			}
			// A too loose test like 'begins with two digits' (^\\d{2}.+$) misses vicious cases, so we have to be more precise
			// Examples of pathological cases at line numbers 322, 1496, 1827, 3088, 6598, 7098, 8534, 8535, 8622...
			if (noteLine.matches("^\\d{2}.+$")) { // We still use a catch-fall test for optimization
				if ((noteLine.matches("^\\d{2} .+$")) && (lineNumber != 322)) code = noteLine.substring(0, 2); // Checked: the test identifies exactly the division lines
				else if (noteLine.matches("^\\d{2}\\.\\d .+$")) code = noteLine.substring(0, 4); // Checked: the test identifies exactly the group lines
				else if (noteLine.matches("^\\d{2}\\.\\d{2} .+$")) code = noteLine.substring(0, 5); // Checked: the test identifies exactly the classes lines
				else if (noteLine.matches("^\\d{2}\\.\\d{2}/\\d .+$")) code = noteLine.substring(0, 7); // Checked: the test identifies exactly the subclasses lines
			}
			if (code != null) { // Start of a note for item identified by 'code'
				currentNote = new ArrayList<String>();
				System.out.println(lineNumber + " - " + code + " - '" + noteLine.substring(code.length() + 1) + "'");
			} else {
				if (currentNote != null) currentNote.add(noteLine); // We could avoid the null test since we jumped directly to the first title
			}
		}
	}
}
 
开发者ID:FranckCo,项目名称:Stamina,代码行数:63,代码来源:SICModelMaker.java

示例8: findpages

import org.apache.pdfbox.text.PDFTextStripper; //导入方法依赖的package包/类
/**
 * Searches a PDF page by page for pages that contain every given keyword
 * (case-insensitive). When at least one page matched, the matching pages are
 * saved to a new PDF named
 * {@code ConfigCBSI.getResultPdfPath() + fileCounter + ".pdf"}.
 * <p>
 * The {@code falseCounter} field is incremented for every matching page; the
 * {@code validResult} field is raised while pages are collected and reset to
 * {@code false} once the result has been saved.
 *
 * @param path              path of the PDF file to search
 * @param searchKeywordList keywords that must all occur on a page for it to match
 * @param fileCounter       number used to build the result file name
 * @return one result per matching page, carrying the page text, the file path
 *         and the 1-based page number
 * @throws IOException if the PDF cannot be read or the result cannot be saved
 */
public List<SearchResult> findpages(String path,
		List<String> searchKeywordList, int fileCounter) throws IOException {

	List<SearchResult> list = new ArrayList<SearchResult>();
	List<PDPage> pageList = new ArrayList<PDPage>();
	PDFTextStripper reader = new PDFTextStripper();

	PDDocument doc = PDDocument.load(new File(path));
	try {
		PDDocument finalDocument = new PDDocument();
		try {
			int pageCount = doc.getNumberOfPages();

			// PDFTextStripper page numbers are 1-based: iterate 1..pageCount inclusive
			// so that the last page is searched as well.
			for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
				reader.setStartPage(pageNo);
				reader.setEndPage(pageNo);

				// Extract the page text once instead of once per keyword.
				String pageText = reader.getText(doc);
				String lowerCaseText = pageText.toLowerCase();

				boolean hasKeywords = true;
				for (String keyword : searchKeywordList) {
					if (!lowerCaseText.contains(keyword.toLowerCase())) {
						hasKeywords = false;
						break;
					}
				}

				if (hasKeywords) {
					SearchResult result = new PageResult();
					result.setFileContent(pageText);
					result.setFilePath(path);
					result.setPageNumber(pageNo);
					list.add(result);
					// PDDocument.getPage takes a 0-based index, unlike the stripper.
					pageList.add(doc.getPage(pageNo - 1));

					falseCounter++;
				}
			}

			for (PDPage page : pageList) {
				finalDocument.addPage(page);
				validResult = true;
			}

			if (validResult) {
				// The pages still belong to the source document, so it must stay
				// open until the result has been saved.
				finalDocument.save(ConfigCBSI.getResultPdfPath() + fileCounter
						+ ".pdf");
				logger.info("Result Saved");
				validResult = false;
			}
		} finally {
			// Close the result document whether or not anything was saved.
			finalDocument.close();
		}
	} finally {
		doc.close();
	}

	return list;
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:71,代码来源:PageByPageSearch.java


注:本文中的org.apache.pdfbox.text.PDFTextStripper.setStartPage方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。