Python pdfdocument.PDFDocument类代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument类的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument类的具体用法？Python PDFDocument怎么用？Python PDFDocument使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了PDFDocument类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: dumppdf

def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: writable object handed to the ``dump*`` helpers; a trailing
            newline is written to it unless ``codec`` is 'raw' or 'binary'.
        fname: path of the PDF file to read.
        objids: object ids to dump; may be empty.
        pagenos: zero-based page indices to dump; may be empty.
        password: password passed to ``doc.initialize``.
        dumpall: when true, every object is dumped via ``dumpallobjs``.
        codec: forwarded to the dump helpers; when set, the content streams
            of the selected pages are dumped instead of the page attributes.
        extractdir: unused here; kept for signature compatibility.

    When neither ``objids``, ``pagenos`` nor ``dumpall`` selects anything,
    the trailers are dumped instead.
    """
    # The context manager closes the file even when parsing or dumping
    # raises; the previous explicit close() was skipped on any error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')

开发者ID:coolioxlr，项目名称:ziply，代码行数:27，代码来源:dumppdf.py

示例2: parse_paragraphs

    def parse_paragraphs(self, text):
        """Collect the headlines of ``text`` and hand them to ``count_paragraphs``.

        For PDF input (``self.is_pdf``) the titles of the level-1 outline
        entries of ``self.paper_filename`` are used as headlines; otherwise
        every line of ``text`` that starts with ``'## '`` is a headline.
        ``count_paragraphs`` is only called when at least one headline exists.
        """
        lines = text.split('\n')

        if not self.is_pdf:
            # Markdown input: '## ' headings delimit the paragraphs
            headlines = [row for row in lines if row.startswith('## ')]
        else:
            headlines = []
            with open(self.paper_filename, 'rb') as pdf:
                document = PDFDocument(PDFParser(pdf))
                try:
                    for entry in document.get_outlines():
                        (level, title, _, _, _) = entry
                        if level == 1:
                            headlines.append(title)
                except PDFNoOutlines:
                    logging.info(
                        "No outline found -> skipping paragraph search..."
                    )

        if headlines:
            self.count_paragraphs(text, lines, headlines)

开发者ID:dahoo，项目名称:paper-gamification，代码行数:28，代码来源:tracker.py

示例3: with_pdf

def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, apply ``fn`` to it and return the result.

    Args:
        pdf_doc: path of the PDF file.
        fn: callable invoked as ``fn(doc, *args)`` when the document
            reports ``is_extractable``.
        pdf_pwd: password passed to ``doc.initialize``.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        The value returned by ``fn``, or ``None`` when the document is not
        extractable or an ``IOError`` occurred (e.g. the file is missing).
    """
    result = None
    try:
        # The context manager closes the file on every exit path; the
        # previous explicit close() was skipped whenever parsing or ``fn``
        # raised.
        with open(pdf_doc, "rb") as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
            # supply the password for initialization
            doc.initialize(pdf_pwd)

            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best-effort: a missing or unreadable file yields None
        return None
    return result

开发者ID:ichraibi，项目名称:pdfminer-layout-scanner，代码行数:25，代码来源:layout_scanner.py

示例4: extract_pdf

def extract_pdf(file):
    """
    Return the text content of the PDF read from ``file``.

    ``file`` is handed to ``PDFParser`` and ``PDFPage.get_pages`` as-is
    (presumably an open binary file object -- confirm against callers).
    Returns -1 when the document reports that it is not extractable.
    """
    document = PDFDocument(PDFParser(file))
    document.initialize("")
    if not document.is_extractable:
        return -1

    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, codec='utf-8',
                              showpageno=False, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    # An empty page-number set and maxpages=0 are passed through unchanged
    pages = PDFPage.get_pages(file, set(), maxpages=0, password="",
                              caching=True, check_extractable=True)
    for page in pages:
        page_interpreter.process_page(page)

    return text_buffer.getvalue()

开发者ID:stpddream，项目名称:DHSpaceCollector，代码行数:25，代码来源:fileio.py

示例5: parse

 def parse (self):
     """Parse ``self.pdf`` and store an XML-like dump of it on the instance.

     Sets ``self.xml`` (the dump), ``self.errors`` (``doc.errors``) and
     ``self.bytes_read`` (``parser.BYTES``).  When data follows the EOF
     marker (``doc.eof_distance > 3``) it is stored in ``self.bin_blob``.
     Objects that cannot be found are dumped raw as ``type="malformed"``
     and recorded through ``self.takenote``; any other failure is dumped
     as ``type="exception"``.
     """
     res = '<pdf>'
     visited = set()  # keep track of the objects already visited
     # The context manager closes the file even when parsing raises
     with open(self.pdf, 'rb') as fp:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 visited.add(objid)
                 try:
                     # Dump first, append afterwards: a failing dump must not
                     # leave a dangling '<object>' open tag in the output.
                     body = self.dump(doc.getobj(objid))
                     res += '<object id="' + str(objid) + '">\n' + body + '\n</object>\n\n'
                 except PDFObjectNotFound:
                     mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                     mal_obj = mal_obj.replace('<', '0x3C')
                     res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # str(e) instead of the Python-2-only e.message
                     res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, str(e))
     res += self.dumptrailers(doc)
     res += '</pdf>'
     self.xml = res
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
     return

开发者ID:toejamhoney，项目名称:thisneedsacoolname，代码行数:35，代码来源:xml_creator.py

示例6: extractembedded

def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file of the PDF ``fname`` into ``extractdir``.

    Every object listed in the document's xref tables is inspected; each
    dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is extracted.

    Args:
        fname: path of the PDF file to read.
        password: password passed to ``PDFDocument``.
        extractdir: directory the embedded files are written to.
        outfp, objids, pagenos, dumpall, codec: unused here; kept for
            signature compatibility.

    Raises:
        PDFValueError: a file specification does not reference an
            EmbeddedFile stream.
        IOError: the PDF cannot be opened or a target file already exists.
    """
    def extract1(obj):
        # Write the embedded file described by file specification ``obj``.
        # 'UF' may be absent; fall back to 'F' instead of raising KeyError
        # on the first lookup.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename,))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename,))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Closed by the context manager even when get_data() or write() fails
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # The input file was previously never closed
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return

开发者ID:toejamhoney，项目名称:peepdf-js_analyse，代码行数:32，代码来源:dumppdf.py

示例7: dumppdf

def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return a dump of selected parts of the PDF file ``fname`` as a string.

    Args:
        fname: path of the PDF file to read.
        objids: object ids to dump; may be empty.
        pagenos: zero-based page indices to dump; may be empty.
        password: password passed to ``PDFDocument``.
        dumpall: when true, the result of ``dumpallobjs`` is appended.
        codec: forwarded to the dump helpers; when set, the content streams
            of the selected pages are dumped instead of the page attributes.
        extractdir: unused here; kept for signature compatibility.

    Returns:
        The concatenated helper output, followed by a newline unless
        ``codec`` is 'raw' or 'binary'.  When nothing is selected, the
        trailers are dumped.
    """
    res = ""
    # The context manager closes the file even when parsing or dumping
    # raises; the previous explicit close() was skipped on any error.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                res += dumpxml(doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        res += dumpxml(stream_value(obj), codec=codec)
                else:
                    res += dumpxml(page.attrs)
        if dumpall:
            res += dumpallobjs(doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            res += dumptrailers(doc)
    if codec not in ('raw', 'binary'):
        res += '\n'
    return res

开发者ID:toejamhoney，项目名称:peepdf-js_analyse，代码行数:30，代码来源:dumppdf.py

示例8: main

def main(pdf_path='/home/chris/Documents/Literature/DFT Primer.pdf'):
    """Process every page of a PDF and print its outline entries.

    Args:
        pdf_path: path of the PDF file; defaults to the location that was
            previously hard-coded, so ``main()`` behaves as before.

    Returns:
        0 on completion.

    Raises:
        PDFTextExtractionNotAllowed: the document reports that it is not
            extractable.
    """
    # Open a PDF file.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))
        for (level, title, dest, a, se) in document.get_outlines():
            # Printed as a tuple, exactly like the former `print (level, title)`
            print((level, title))
    return 0

开发者ID:cmthompson，项目名称:weiss，代码行数:25，代码来源:Layout.py

示例9: pdf_to_text

def pdf_to_text(page_object):
    """Return the text of every text box / text line found in a PDF.

    ``page_object`` is handed to ``PDFParser`` as-is.  The result is a list
    with one ``get_text()`` string per ``LTTextBox`` or ``LTTextLine`` layout
    element, in page order.
    """
    pdf_parser = PDFParser(page_object)
    document = PDFDocument(pdf_parser)
    # Link parser and document, then initialise with an empty password
    pdf_parser.set_document(document)
    document.initialize('')

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    for page in PDFPage.create_pages(document):
        page_interpreter.process_page(page)
        # get_result() yields the layout of the page just processed
        collected.extend(
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    return collected

开发者ID:ConstanzaSchibber，项目名称:PythonClasses，代码行数:26，代码来源:PDFtoTxT.py

示例10: pdf2metadata

def pdf2metadata(fp):
    """Return the "Info" metadata (``doc.info``) of the PDF read from ``fp``.

    When the catalog has a 'Metadata' entry its raw data is decoded as
    well, but the value is not returned.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    catalog = document.catalog
    if 'Metadata' in catalog:
        # The raw XMP metadata; currently unused by callers of this function
        xmp_packet = resolve1(catalog['Metadata']).get_data()
    return document.info

开发者ID:lidingpku，项目名称:open-conference-data，代码行数:10，代码来源:lib_pdf.py

示例11: get_toc

def get_toc(pdf_path):
    """Return the outline of the PDF at ``pdf_path``.

    Returns:
        A list of ``(level, title)`` tuples, one per entry produced by
        ``document.get_outlines()``.
    """
    # The file was previously left open; the outline is materialised inside
    # the ``with`` block so it is read before the file is closed.
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        return [
            (level, title)
            for (level, title, _dest, _a, _structelem) in document.get_outlines()
        ]

开发者ID:erexhepa，项目名称:IF_COLOC_ENGINE，代码行数:10，代码来源:pdf_metadata.py

示例12: loadPDF

def loadPDF(library, file_name):
    """Add a paper to the library and print a preview of its first page.

    Prints up to the first four layout objects of the first page (text
    colourised through ``bcolors``, anything else as "PICTURE"), then looks
    the paper up via ``library.getPaper(file_name)`` and attaches the
    (currently always empty) author and citation id lists.

    Args:
        library: object providing ``getPaper(title)``.
        file_name: path of the PDF file; also used as the paper title.
    """
    authors = []       # list of authors
    citations = []     # list of authors that have been cited

    # The context manager closes the file; it was previously never closed.
    with open(file_name, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        password = ""
        document.initialize(password)
        # A non-extractable document is only reported, not rejected.
        if not document.is_extractable:
            print("CANT")
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for ii, page in enumerate(PDFPage.create_pages(document)):
            print('---------------------------------------------------------------------------------------------------')
            print("page number {}".format(ii))
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for jj, lt_obj in enumerate(layout._objs):
                if jj > 3:
                    break
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    # NOTE(review): on Python 3 encode() yields bytes, which
                    # cannot be concatenated with the str colour codes below
                    # -- confirm the target interpreter before porting.
                    cur_line = lt_obj.get_text().encode('ascii', 'ignore')
                    match = pattern_ignore.match(cur_line)
                    if match is None and len(cur_line) < 200:
                        print(bcolors.OKGREEN + " " + cur_line + bcolors.ENDC)
                    else:
                        print(bcolors.FAIL + " " + cur_line[0:150] + bcolors.ENDC)
                else:
                    print("PICTURE")
            # Only the first page is inspected
            break

    paper_title = file_name
    paper = library.getPaper(paper_title)
    paper.addAuthorIds(authors)
    paper.addCitationIds(citations)

开发者ID:exrhizo，项目名称:citation_graph，代码行数:55，代码来源:loadPDF.py

示例13: print_all_obj

def print_all_obj(filename):
    """Print the id and type of every object in the PDF's xref tables.

    Each object id is reported once, even when several xref tables list
    it.  The type is whatever ``get_obj_type`` returns for the object.
    """
    # open() replaces the Python-2-only file() builtin
    with open(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                # Same "id type" line the former print statement produced
                print('%s %s' % (objid, get_obj_type(doc.getobj(objid))))

开发者ID:kexplo，项目名称:extract_images_from_pdf，代码行数:11，代码来源:extimgpdf.py

示例14: proc

    def proc(self, pdfFp):
        """Read the metadata of a PDF document into this instance.

        Stores ``doc.info`` in ``self.info``, the catalog's 'Metadata'
        stream (converted by ``xmp_to_dict``) in ``self.metadata`` when the
        catalog has one, and ``pdfFp.getvalue()`` in ``self.raw_doc``.
        """
        pdf_parser = PDFParser(pdfFp)
        document = PDFDocument(pdf_parser)
        pdf_parser.set_document(document)
        document.initialize()

        self.info = document.info
        catalog = document.catalog
        if 'Metadata' in catalog:
            xmp_stream = resolve1(catalog['Metadata'])
            self.metadata = xmp_to_dict(xmp_stream.get_data())
        # pdfFp must offer getvalue() (e.g. an in-memory buffer)
        self.raw_doc = pdfFp.getvalue()

开发者ID:dunlevyt，项目名称:fda-docs-ix，代码行数:13，代码来源:fda-docs-ix.py

示例15: getDocumentInfoAndAnnotations

def getDocumentInfoAndAnnotations(pdfFile):
    """Collect the document info note and text comments of a PDF.

    Walks every page and inspects the page annotations that are indirect
    references (``PDFObjRef``) and carry a 'Subj' entry:

    * the first 'Sticky Note' supplies the document info,
    * every 'Comment on Text' is recorded as ``"<page number>:<contents>"``
      (page numbers start at 1),
    * anything else is only logged.

    When reading a page's annotations fails, the error is logged and the
    PDF is moved to the error directory.

    Args:
        pdfFile: path of the PDF file.

    Returns:
        A ``(docInfo, docAnnotations)`` tuple: ``docInfo`` is the sticky
        note contents or ``None``; ``docAnnotations`` is a list of strings.
        (Previously both values were computed but never returned.)
    """
    logger.info("Parsing pdf file " + pdfFile)
    docInfo = None
    docAnnotations = []
    # The context manager closes the file; it was previously never closed.
    with open(pdfFile, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        document.initialize('')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            if not page.annots:
                continue
            try:
                if isinstance(page.annots, list):
                    annots = page.annots
                else:
                    annots = page.annots.resolve()

                for annot in annots:
                    # Only indirect references are resolved and inspected
                    if not isinstance(annot, PDFObjRef):
                        continue
                    annot = annot.resolve()
                    if 'Subj' not in annot:
                        continue
                    if annot['Subj'] == 'Sticky Note' and docInfo is None:
                        logger.debug('DOC INFO ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                        docInfo = annot['Contents']
                    elif annot['Subj'] == 'Comment on Text':
                        logger.debug('COMMENT ON TEXT ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                        contents = annot['Contents']
                        docAnnotations.append(str(pageNum) + ':' + contents)
                    else:
                        logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] + ' Contents=' + annot['Contents'])

            except Exception as e:
                logger.error("error getting annotation")
                logger.exception(e)
                # move file to error; the original passed the builtin `file`
                # here instead of the path, so the move itself raised.
                # NOTE(review): a second failing page would try to move the
                # already-moved file again -- confirm whether processing
                # should stop after the first failure.
                os.rename(pdfFile, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(pdfFile))
    return docInfo, docAnnotations

开发者ID:kdflint，项目名称:kumuku-community，代码行数:51，代码来源:nbparser.py

注：本文中的pdfminer.pdfdocument.PDFDocument类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。