当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.initialize方法代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument.initialize方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.initialize方法的具体用法?Python PDFDocument.initialize怎么用?Python PDFDocument.initialize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfdocument.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.initialize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: with_pdf

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open a PDF file and apply ``fn`` to the parsed document.

    Args:
        pdf_doc: Path of the PDF file to open.
        fn: Callable invoked as ``fn(doc, *args)`` when the document
            reports that it is extractable.
        pdf_pwd: Password handed to ``doc.initialize``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` was raised (for example because the
        file does not exist).
    """
    result = None
    try:
        # The context manager closes the file even when parsing or ``fn``
        # raises, so no handle is leaked on the error paths.
        with open(pdf_doc, "rb") as fp:
            # Parser bound to the file, document bound to the parser.
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            parser.set_document(doc)
            # Supply the password for initialization.
            doc.initialize(pdf_pwd)

            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best effort: an unreadable or missing file yields None.
        pass
    return result
开发者ID:ichraibi,项目名称:pdfminer-layout-scanner,代码行数:27,代码来源:layout_scanner.py

示例2: extractembedded

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file referenced by a PDF into ``extractdir``.

    Every object listed in the document's xrefs is inspected; dictionaries
    whose ``Type`` is ``LITERAL_FILESPEC`` are extracted.

    Args:
        outfp, objids, pagenos, dumpall, codec: Unused by this function;
            presumably kept so the signature matches its sibling dump
            functions.
        fname: Path of the PDF file to read.
        password: Password handed to ``doc.initialize``.
        extractdir: Directory that receives the extracted files.

    Raises:
        PDFValueError: If a file specification does not point to an
            embedded-file stream.
        IOError: If ``fname`` cannot be opened or a target file already
            exists.
    """
    def extract1(obj):
        # Fall back to 'F' when the 'UF' entry is absent instead of failing
        # with KeyError on the first lookup.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite a file that is already on disk.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # The context manager closes the output even if the write fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # Keep the input open only while the document is being walked.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)

        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
开发者ID:coolioxlr,项目名称:ziply,代码行数:36,代码来源:dumppdf.py

示例3: extract_pdf

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def extract_pdf(file):
    """Return the text of a PDF as one string.

    Args:
        file: File object holding the PDF; it is handed to ``PDFParser``
            and again to ``PDFPage.get_pages``.

    Returns:
        The accumulated converter output, or ``-1`` when the document
        reports that it is not extractable.
    """
    pdf_parser = PDFParser(file)
    pdf_doc = PDFDocument(pdf_parser)
    pdf_doc.initialize("")
    if not pdf_doc.is_extractable:
        return -1

    resources = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(
        resources,
        text_buffer,
        codec='utf-8',
        showpageno=False,
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resources, converter)

    # An empty page-number set together with maxpages=0 is passed through
    # unchanged, exactly as the page iterator is expected to receive it.
    all_pages = PDFPage.get_pages(
        file,
        set(),
        maxpages=0,
        password="",
        caching=True,
        check_extractable=True,
    )
    for pdf_page in all_pages:
        page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()
开发者ID:stpddream,项目名称:DHSpaceCollector,代码行数:27,代码来源:fileio.py

示例4: dumppdf

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF to ``outfp`` via the XML dump helpers.

    Args:
        outfp: Writable stream that receives the dump.
        fname: Path of the PDF file to read.
        objids: Object ids to dump individually (may be empty).
        pagenos: Zero-based page indexes to dump (may be empty).
        password: Password handed to ``doc.initialize``.
        dumpall: When true, every object is dumped with ``dumpallobjs``.
        codec: Codec forwarded to the dump helpers; also decides whether
            page content streams or page attributes are dumped.
        extractdir: Unused by this function.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is set, the
    trailers are dumped instead.
    """
    # The context manager releases the file even when a dump step raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    # With a codec, dump each content stream of the page.
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    # No trailing newline for the 'raw' and 'binary' codecs.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
开发者ID:coolioxlr,项目名称:ziply,代码行数:29,代码来源:dumppdf.py

示例5: pdf_to_text

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf_to_text(page_object):
    """Collect the text of every text box and text line in a PDF.

    Args:
        page_object: Object handed to ``PDFParser`` as the PDF source.

    Returns:
        A list with one string per ``LTTextBox`` / ``LTTextLine`` found at
        the top level of each page layout, in page order.
    """
    pdf_parser = PDFParser(page_object)
    pdf_doc = PDFDocument(pdf_parser)
    # Link parser and document, then initialize with an empty password.
    pdf_parser.set_document(pdf_doc)
    pdf_doc.initialize('')

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    for pdf_page in PDFPage.create_pages(pdf_doc):
        page_interpreter.process_page(pdf_page)
        # The aggregator exposes the layout of the page just processed.
        page_layout = aggregator.get_result()
        collected.extend(
            element.get_text()
            for element in page_layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    return collected
开发者ID:ConstanzaSchibber,项目名称:PythonClasses,代码行数:28,代码来源:PDFtoTxT.py

示例6: pdf2metadata

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf2metadata(fp):
    """Return the "Info" metadata of the PDF read from ``fp``.

    When the catalog has a ``Metadata`` entry its stream data is read as
    well, but the result is not returned.
    """
    pdf_parser = PDFParser(fp)
    pdf_doc = PDFDocument(pdf_parser)
    pdf_parser.set_document(pdf_doc)
    pdf_doc.initialize()

    if 'Metadata' in pdf_doc.catalog:
        # Raw XMP metadata; read but intentionally left unused.
        xmp_stream = resolve1(pdf_doc.catalog['Metadata'])
        metadata = xmp_stream.get_data()
    # The "Info" metadata is the only value handed back.
    return pdf_doc.info
开发者ID:lidingpku,项目名称:open-conference-data,代码行数:12,代码来源:lib_pdf.py

示例7: loadPDF

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def loadPDF(library, file_name):
    """Add a paper to the library after previewing its first page.

    The first page of the PDF is laid out and up to its first four layout
    objects are printed (text in colour, anything else as "PICTURE").  The
    paper is then looked up in ``library`` under ``file_name`` and given
    empty author and citation id lists.

    Args:
        library: Object providing ``getPaper(title)``.
        file_name: Path of the PDF file; also used as the paper title.
    """
    # The context manager guarantees the PDF handle is released, including
    # when pdfminer raises while parsing or laying out the page.
    with open(file_name, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # No password is set, so an empty string is supplied.
        document.initialize("")
        if not document.is_extractable:
            # Deliberately non-fatal: processing continues after the notice.
            print("CANT")

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for ii, page in enumerate(PDFPage.create_pages(document)):
            print('---------------------------------------------------------------------------------------------------')
            print("page number {}".format(ii))
            interpreter.process_page(page)
            # Receive the LTPage object for the page.
            layout = device.get_result()
            for jj, lt_obj in enumerate(layout._objs):
                if jj > 3:
                    break
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    cur_line = lt_obj.get_text().encode('ascii', 'ignore')
                    match = pattern_ignore.match(cur_line)
                    if match is None and len(cur_line) < 200:
                        print(bcolors.OKGREEN + " " + cur_line + bcolors.ENDC)
                    else:
                        print(bcolors.FAIL + " " + cur_line[0:150] + bcolors.ENDC)
                else:
                    print("PICTURE")
            # Only the first page is inspected.
            break

    authors = []    # list of authors
    citations = []  # list of authors that have been cited

    paper_title = file_name
    paper = library.getPaper(paper_title)
    paper.addAuthorIds(authors)
    paper.addCitationIds(citations)
开发者ID:exrhizo,项目名称:citation_graph,代码行数:57,代码来源:loadPDF.py

示例8: proc

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
    def proc(self, pdfFp):
        """Populate PDF meta-data attributes from ``pdfFp``.

        Sets ``self.info`` from the document, ``self.metadata`` (only when
        the catalog has a ``Metadata`` entry) and ``self.raw_doc`` from
        ``pdfFp.getvalue()``.
        """
        pdf_parser = PDFParser(pdfFp)
        pdf_doc = PDFDocument(pdf_parser)
        pdf_parser.set_document(pdf_doc)
        pdf_doc.initialize()

        self.info = pdf_doc.info
        if 'Metadata' in pdf_doc.catalog:
            # Resolve the metadata reference and convert its stream data.
            xmp_raw = resolve1(pdf_doc.catalog['Metadata']).get_data()
            self.metadata = xmp_to_dict(xmp_raw)
        self.raw_doc = pdfFp.getvalue()
开发者ID:dunlevyt,项目名称:fda-docs-ix,代码行数:15,代码来源:fda-docs-ix.py

示例9: getDocumentInfoAndAnnotations

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def getDocumentInfoAndAnnotations(pdfFile):
    """Scan a PDF's page annotations for a document note and text comments.

    Only annotations stored as ``PDFObjRef`` that carry a ``Subj`` entry are
    considered.  The first 'Sticky Note' becomes the document info; every
    'Comment on Text' is recorded as ``"<page number>:<contents>"`` with
    one-based page numbers.  If handling a page's annotations fails, the
    error is logged and the PDF is moved to the error directory.

    Args:
        pdfFile: Path of the PDF file to parse.

    Returns:
        A ``(docInfo, docAnnotations)`` tuple: the contents of the first
        sticky note (or ``None``) and the list of recorded comments.
    """
    logger.info("Parsing pdf file " + pdfFile)
    docInfo = None
    docAnnotations = []
    # The context manager closes the PDF on every exit path.
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # No password is set, so an empty string is supplied.
        document.initialize('')
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            if not page.annots:
                continue
            try:
                if isinstance(page.annots, list):
                    annots = page.annots
                else:
                    annots = page.annots.resolve()

                for annot in annots:
                    # Annotations that are not indirect references are skipped.
                    if not isinstance(annot, PDFObjRef):
                        continue
                    annot = annot.resolve()
                    if 'Subj' not in annot:
                        continue
                    if annot['Subj'] == 'Sticky Note' and docInfo is None:
                        logger.debug('DOC INFO ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                        docInfo = annot['Contents']
                    elif annot['Subj'] == 'Comment on Text':
                        logger.debug('COMMENT ON TEXT ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                        contents = annot['Contents']
                        docAnnotations.append(str(pageNum) + ':' + contents)
                    else:
                        logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] + ' Contents=' + annot['Contents'])
            except Exception as e:
                logger.error("error getting annotation")
                logger.exception(e)
                # Move the offending PDF to the error directory; the path
                # argument must be used here, not the builtin ``file``.
                os.rename(pdfFile, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(pdfFile))
    return docInfo, docAnnotations
开发者ID:kdflint,项目名称:kumuku-community,代码行数:53,代码来源:nbparser.py

示例10: load_document

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
    def load_document(self, _file, password=""):
        """Build an initialized PDFMiner document from ``_file``.

        Raises:
            ValueError: If the document reports it is not extractable.
        """
        log.info("loading document...")
        pdf_parser = module_parser(_file)
        document = PDFDocument()
        # Wire parser and document to each other before initializing.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize(password)

        if document.is_extractable:
            return document
        raise ValueError("PDF text extraction not allowed")
开发者ID:CJStuart,项目名称:amcat,代码行数:16,代码来源:pdf.py

示例11: pdf_from_resource

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf_from_resource(resource):
    """Create an initialized PDF document for ``resource``.

    A parser is built on ``resource``, linked both ways with a fresh
    ``PDFDocument``, and the document is initialized without a password.

    Returns:
        The initialized document object.
    """
    pdf_parser = PDFParser(resource)
    pdf_document = PDFDocument()

    # Link in both directions: parser -> document, then document -> parser.
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)

    pdf_document.initialize()
    return pdf_document
开发者ID:Impactstory,项目名称:cv-parser,代码行数:16,代码来源:parsecv.py

示例12: Parse_PDF

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
	def Parse_PDF(self):
		"""Extract the text of every page of the PDF at ``self.filePath``.

		One stripped string per page is appended to ``self.text_content``,
		which must already exist on the instance.

		Returns:
			``self.text_content`` after all pages have been appended.
		"""

		def parse_lt_objs(lt_objs):
			"""Join the text of the given LT* objects with newlines."""
			pieces = []
			for lt_obj in lt_objs:
				if isinstance(lt_obj, (LTTextBox, LTTextLine)):
					pieces.append(lt_obj.get_text())
				elif isinstance(lt_obj, LTFigure):
					# LTFigure objects are containers for other LT* objects,
					# so recurse through the children.
					pieces.append(parse_lt_objs(lt_obj))
			return '\n'.join(pieces)

		# The context manager closes the PDF even when a page fails to parse.
		with open(self.filePath, 'rb') as fp:
			parser = PDFParser(fp)
			document = PDFDocument(parser)

			try:
				document.initialize('')
			except Exception:
				# Best effort: a failing initialize is ignored on purpose, but
				# KeyboardInterrupt / SystemExit are no longer swallowed.
				pass

			rsrcmgr = PDFResourceManager()
			device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
			interpreter = PDFPageInterpreter(rsrcmgr, device)

			for page in PDFPage.create_pages(document):
				interpreter.process_page(page)
				layout = device.get_result()
				self.text_content.append(parse_lt_objs(layout).strip())

		return self.text_content
开发者ID:justin-prather,项目名称:Order-Finder,代码行数:52,代码来源:Order_Parser.py

示例13: check_pdf_password

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def check_pdf_password(pdf, password):
    """Report whether ``password`` opens ``pdf`` with extraction allowed.

    Args:
        pdf: Path of the PDF file to test.
        password: Candidate password handed to ``doc.initialize``.

    Returns:
        ``True`` when initialization succeeds and the document is
        extractable; ``False`` when it is not extractable or an error is
        raised while checking.

    Raises:
        IOError: If ``pdf`` cannot be opened.
    """
    import sys

    # Close the handle on every attempt; callers that try many passwords
    # would otherwise accumulate open files.
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        try:
            doc.initialize(password)
            if doc.is_extractable:
                print('')
                print('The PDF Password Is:' + password)
                return True
            print('exception')
            return False
        except Exception:
            # Any failure counts as a wrong password; Ctrl-C and SystemExit
            # still propagate.  Emit a bare carriage return, no newline.
            sys.stdout.write('\r')
            return False
开发者ID:ttskym,项目名称:project,代码行数:18,代码来源:pdfcracker.py

示例14: convert_file

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def convert_file(pdf_file, file_name):
    """Convert a PDF into its text representation.

    Args:
        pdf_file: Object handed to ``PDFParser`` as the PDF source.
        file_name: Name used only in the error message.

    Returns:
        The text accumulated by the converter for all pages.

    Raises:
        PDFPage.PDFTextExtractionNotAllowed: If the document reports it is
            not extractable.
    """
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument(pdf_parser)
    document.initialize("")
    if not document.is_extractable:
        raise PDFPage.PDFTextExtractionNotAllowed("Document does not allow text extraction: " + file_name)

    manager = PDFResourceManager()
    layout_params = LAParams()
    text_buffer = StringIO.StringIO()
    converter = TextConverter(manager, text_buffer, codec="utf-8", laparams=layout_params)
    page_interpreter = PDFPageInterpreter(manager, converter)

    # Feed every page through the interpreter; the converter writes into
    # the buffer as it goes.
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()
开发者ID:akilism,项目名称:moving_violation_scraper,代码行数:19,代码来源:pdf_to_json.py

示例15: dumpoutline

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmarks) of a PDF to ``outfp`` as XML.

    Each outline entry is emitted as an ``<outline>`` element with its
    level and title, plus ``<dest>`` and ``<pageno>`` children when a
    destination could be resolved.  Documents without outlines produce no
    output.

    Args:
        outfp: Writable stream that receives the XML.
        fname: Path of the PDF file to read.
        password: Password handed to ``doc.initialize``.
        objids, pagenos, dumpall, codec, extractdir: Unused by this
            function.
    """
    # Both the file and the parser are released on every exit path, not
    # just after a successful run.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map page object ids to their zero-based page index.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                # Named destinations (string or literal) are looked up in
                # the document before unwrapping the 'D' entry of a dict.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: try a GoTo action instead.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
    return
开发者ID:coolioxlr,项目名称:ziply,代码行数:48,代码来源:dumppdf.py


注:本文中的pdfminer.pdfdocument.PDFDocument.initialize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。