本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.get_pages方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.get_pages方法的具体用法？Python PDFDocument.get_pages怎么用？Python PDFDocument.get_pages使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.get_pages方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
class Pdf(object):
    """Wrap a legacy pdfminer ``PDFDocument`` to expose page count and text."""

    def __init__(self, pdf_file):
        """Build the parser/document pair for ``pdf_file``.

        ``pdf_file`` is handed directly to ``PDFParser``.
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        # Connect parser and document in both directions first ...
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # ... then actually *call* initialize(). The previous code only
        # referenced the bound method (no parentheses), so it never ran.
        self._doc.initialize()

    @property
    def pages(self):
        """Return the number of pages yielded by ``get_pages()``."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Render every page through a ``TextConverter`` and return the text.

        The converter output is decoded as UTF-8, ignoring undecodable bytes.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in self._doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Release the device even if a page fails to render.
            device.close()
        return output.getvalue().decode('utf-8', 'ignore')
示例2: pdf_to_text
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def pdf_to_text(filename):
    """Convert the PDF at ``filename`` to text with per-page markers.

    Each page's text is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (``n`` is zero-based). The page count is printed to stdout.

    Returns the accumulated converter output.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # The context manager closes the file even when parsing raises.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            # Materialize the pages once so counting them does not require a
            # second walk over the document.
            pages = list(doc.get_pages())
            print("There are: " + str(len(pages)) + " pages")
            for i, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
示例3: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF at ``fname`` to ``outfp``.

    - ``objids``: object ids to dump. Streams are written raw when ``codec``
      is ``'raw'``, decoded when it is ``'binary'``, otherwise as XML.
    - ``pagenos``: zero-based page indexes whose attributes are dumped as XML.
    - ``dumpall``: dump every object in the document.
    - If none of the three is given, the trailers are dumped.

    A trailing newline is written unless ``codec`` is ``'raw'`` or
    ``'binary'``.
    """
    doc = PDFDocument()
    # ``with`` guarantees the file is closed even if dumping raises.
    with open(fname, 'rb') as fp:
        # This (older) PDFParser signature takes the document directly; the
        # binding is kept so the parser stays referenced while ``doc`` is used.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例4: extractContent
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def extractContent(file):
    """Extract the text of the PDF at path ``file`` and return it.

    Progress lines (``extractContent``, ``page=<n>``, ``EOF``) are printed to
    stdout. Whether the document permits extraction is deliberately not
    checked (that check was disabled in the original code).
    """
    print("extractContent")
    outfp = StringIO.StringIO()
    # ``with`` closes the file even if a page fails to render.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # pdfminer documents initialize() as mandatory, even without a
        # password; it was missing here.
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            device.close()
    return outfp.getvalue()
示例5: extract_text_elements_from_pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    ``j`` is a job object whose ``iter_with_progress`` wraps the page loop.

    Returns ``(pages, all_elements)``: one ``Page`` per PDF page (built from
    the layout's width and height) and a flat list of elements, each tagged
    with its zero-based ``page`` and its ``order`` within that page.
    """
    pages = []
    all_elements = []
    # The original never closed the file; ``with`` releases it on all paths.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialized as a list before being handed to the progress iterator.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements.extend(elements)
    return pages, all_elements
示例6: ParseAllPages
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def ParseAllPages(self, filepath):
    """Run every page of the PDF at ``filepath`` through a ``PDFDevice``.

    Stores ``filepath`` on ``self.filepath``.

    Raises ``PDFTextExtractionNotAllowed`` when the document forbids text
    extraction.
    """
    self.filepath = filepath
    # The original left the file open; ``with`` closes it on every path,
    # including the not-extractable raise below.
    with open(filepath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        password = ""
        doc.initialize(password)
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
示例7: pdf_to_csv
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def pdf_to_csv(filename):
    """Convert the PDF at ``filename`` through ``CsvConverter``.

    Each page's output is wrapped in ``START PAGE n`` / ``END PAGE n`` lines
    (``n`` is zero-based). Returns the accumulated converter output.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the author's test documents are utf-8
    # (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    try:
        # ``with`` closes the file even when parsing or rendering raises.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
示例8: process_pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Interpret the pages of the PDF read from ``fp`` on ``device``.

    - ``pagenums``: optional collection of zero-based page indexes; when
      given, pages whose index is not in it are skipped.
    - ``maxpages``: stop after processing a page whose one-based number
      reaches this value (a falsy value disables the limit).
    - ``password``: passed to ``doc.initialize``.

    Returns a dict mapping zero-based page index to page object for every
    page in the document.

    Raises ``PDFTextExtractionNotAllowed`` when the document forbids text
    extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Walk the pages in document order. The previous code iterated a dict via
    # iteritems(), so the order-dependent ``maxpages`` break relied on dict
    # ordering, which is not guaranteed.
    page_list = list(doc.get_pages())
    for num, page in enumerate(page_list):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= num + 1:
            break
    return dict(enumerate(page_list))
示例9: load
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def load(self, open_file):
    """Parse the PDF in ``open_file`` and collect its per-page text.

    Resets ``self.fields`` and ``self.text``. For every page, annotations are
    handed to ``self._build_annotations`` when present, and the text returned
    by ``self._get_text`` is stored in ``self.text`` under the one-based page
    number.

    Raises ``PDFTextExtractionNotAllowed`` when the document forbids text
    extraction.
    """
    self.fields = {}
    self.text = {}

    # Parser and document are linked in both directions before initializing
    # with an empty password.
    pdf_parser = PDFParser(open_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources, layout-analysis defaults, aggregating device.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    for page_number, page in enumerate(document.get_pages(), 1):
        page_interpreter.process_page(page)
        if page.annots:
            self._build_annotations(page)
        self.text[page_number] = self._get_text(aggregator)
示例10: getPageLayouts
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def getPageLayouts(f1):
    """Return the layout object of every page of a PDF.

    Returns a list with one ``device.get_result()`` value per page, or an
    empty list when the document does not allow text extraction.

    Raises ``IOError`` (with a generic message) when reading the file fails.

    NOTE(review): the file actually opened is ``fpath`` and the password is
    ``pss_wd``; neither is defined in this function, so they are presumably
    module-level names. The ``f1`` argument is only rebound by the ``with``
    statement. Confirm against the callers before relying on ``f1``.
    """
    # Bound up front so the final ``return`` cannot hit an unbound local when
    # the document is not extractable.
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        with open(fpath, 'rb') as f1:
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)
            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    # Original author's note (unverified): pages are probably
                    # parsed from the open file on demand rather than loaded
                    # up front, which is why this runs inside the ``with``.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    finally:
        # Kept from the original: closes whatever ``f1`` refers to at this
        # point (the already-closed file, or the argument if open() failed).
        f1.close()
    return page_layouts
示例11: read_invoice_pdfminer3k
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    """Read one invoice PDF and record its client and invoice number.

    The text of all ``LTTextBox`` / ``LTTextLine`` layout objects is
    concatenated. The client is extracted from that text and the invoice
    number from the file name, both via ``extract_info``; both are printed
    and passed to ``write_excel``.
    """
    # NOTE(review): the path is built with a hard-coded backslash, so this
    # only forms a valid path on Windows; left unchanged to preserve behavior.
    pdf_path = os.path.join(invoice_path + "\\" + pdfFile)
    text_parts = []
    # The original never closed the file; ``with`` releases it on all paths.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    # Join once instead of growing a string inside the loop.
    invoice_text = "".join(text_parts)
    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
示例12: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of the PDF at ``fname`` to ``outfp``.

    Each line is the ``repr`` of ``(level, title, dest, pageno)``, where
    ``pageno`` is the zero-based index of the destination page, or ``None``
    when the entry has neither a destination nor a usable ``/GoTo`` action.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    # ``with`` / ``finally`` ensure file and parser are closed on errors too.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's id to its zero-based position in the document.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
示例13: parse
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def parse(self, path):
    """Extract the text of the PDF at ``path`` into a ``Sample``.

    Returns ``Sample(path, None, <extracted text>)``.

    Raises ``NotImplementedError`` when ``path`` is a directory.
    """
    # Directories are not supported.
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    rsrc = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
    try:
        # PDFs are binary: open with 'rb' (the original used text mode) and
        # let ``with`` close the handle, which the original never did.
        with open(path, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
    finally:
        device.close()
    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
示例14: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF at ``fname`` to ``outfp`` as XML.

    - ``objids``: object ids to dump.
    - ``pagenos``: zero-based page indexes. With a ``codec`` each content
      stream of the page is dumped; without one the page attributes are.
    - ``dumpall``: dump every object in the document.
    - If none of the three is given, the trailers are dumped.

    A trailing newline is written unless ``codec`` is ``'raw'`` or
    ``'binary'``.
    """
    doc = PDFDocument()
    # ``with`` guarantees the file is closed even if dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例15: parse_pdf_pdfminer
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from the open PDF ``f`` and parse each page.

    ``fpath`` is used only for reporting: it is passed to the handler's
    header/footer/error callbacks and to ``self.parse_page`` together with
    the page text (encoded as UTF-8 bytes) and the one-based page number.

    ``KeyboardInterrupt`` and ``SystemExit`` propagate; any other exception
    is reported through ``self.handler.print_error`` instead of being raised.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        parser = PDFParser(f)
        doc = PDFDocument(caching=True)
        parser.set_document(doc)
        doc.set_parser(parser)
        # pdfminer documents initialize() as mandatory, even without a
        # password; it was missing here.
        doc.initialize('')
        for page_num, page in enumerate(doc.get_pages(), 1):
            # Fresh buffer and converter per page so each page's text is
            # handed to parse_page separately.
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            data = retstr.getvalue()
            self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
            retstr.close()
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        # Top-level boundary for one file: report and carry on.
        self.handler.print_error(fpath, e)