本文整理匯總了 Python 中 pdfminer.pdfpage.PDFPage.create_pages 方法的典型用法代碼示例。如果您正苦於以下問題:Python PDFPage.create_pages 方法的具體用法?Python PDFPage.create_pages 怎麼用?Python PDFPage.create_pages 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 pdfminer.pdfpage.PDFPage 的用法示例。
在下文中一共展示了 PDFPage.create_pages 方法的 10 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: read_fields
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file, sorted with ``fieldsorter``.

    Args:
        pdffile: Path of the PDF file; it is opened in binary mode.

    Returns:
        The list populated by ``recursively_add_fields``, sorted with
        ``fieldsorter`` as the sort key, or ``None`` when the document
        catalog has no ``AcroForm`` entry.
    """
    outfields = []
    # The context manager closes the file on every exit path (the original
    # leaked the handle).  All pdfminer work stays inside the block because
    # the document objects presumably read from the file lazily -- TODO
    # confirm against the pdfminer version in use.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map each page's ``pageid`` to its 1-based position in the document.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
示例2: pages
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def pages(self):
    """Return the ``Page`` wrappers for this document, building them once.

    The list is cached on ``self._pages``; later calls return the cached
    list without touching the document again.

    Reads:
        self.pages_to_parse: ``None`` to keep every page, otherwise a
            container of 1-based page numbers to keep.
        self.doc: The document handed to ``PDFPage.create_pages``.

    Returns:
        The list of ``Page`` objects, in document order.
    """
    if hasattr(self, "_pages"):
        return self._pages

    doctop = 0
    pp = self.pages_to_parse
    self._pages = []
    for page_number, page in enumerate(PDFPage.create_pages(self.doc), start=1):
        # Identity comparison: ``!= None`` would invoke a custom ``__ne__``.
        if pp is not None and page_number not in pp:
            continue
        p = Page(self, page, page_number=page_number, initial_doctop=doctop)
        self._pages.append(p)
        # Only kept pages contribute to the running offset that is passed to
        # the next ``Page`` as ``initial_doctop``.
        doctop += p.height
    return self._pages
示例3: analyze_pages
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def analyze_pages(file_name, char_margin=1.0):
    """Lazily yield the layout object of each page in a PDF file.

    Args:
        file_name: Path to the PDF file; resolved with ``os.path.realpath``.
        char_margin: Forwarded to ``LAParams`` as ``char_margin``.

    Yields:
        The result of ``device.get_result()`` for every page that was
        processed.  A page whose processing raises ``OverflowError`` is
        logged and skipped.
    """
    logger = logging.getLogger(__name__)
    pdf_path = os.path.realpath(file_name)
    # The file stays open while the generator is being consumed.
    with open(pdf_path, "rb") as pdf_file:
        # An empty password is supplied when the document is initialised.
        pdf_document = PDFDocument(PDFParser(pdf_file), password="")
        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        aggregator = CustomPDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for index, pdf_page in enumerate(PDFPage.create_pages(pdf_document)):
            try:
                page_interpreter.process_page(pdf_page)
            except OverflowError as error:
                # ``index`` is the 0-based enumerate counter.
                logger.exception(
                    "{}, skipping page {} of {}".format(error, index, file_name)
                )
            else:
                yield aggregator.get_result()
示例4: extract_first_jpeg_in_pdf
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extract_first_jpeg_in_pdf(fstream):
    """Return the data of the first embedded JPEG image found in a PDF.

    Every page is processed and its layout scanned for ``LTImage`` items
    nested directly inside ``LTFigure`` items.  The first image whose data
    starts with the JFIF signature (bytes FF D8 FF E0) is returned.

    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images.  Other image types (e.g. PNG) are not
    handled.

    :param fstream: Readable binary stream of the PDF.
    :return: The bytes of the JPEG image, or None if no matching image was
        found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding fails for many images; fall back to the raw
                    # stream.  ``Exception`` (not a bare ``except``) so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
示例5: parse_case
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def parse_case(case_path):
    """Parse all the PDF files in a folder and write ``parsed.json`` there.

    Args:
        case_path: Folder path ending with ``'/'``; file names are appended
            to it directly, and the second-to-last ``'/'``-separated
            component is stored as the case ``id``.

    Returns:
        Always ``None``.  The result is written to
        ``case_path + 'parsed.json'`` as
        ``{'id': ..., 'docs': {doc_id: {'pages': {pageid: {'size', 'text'}}}}}``.
        On any error ``"Error " + case_path`` is printed instead of raising.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            # Skip hidden files and anything that is not a ``.pdf``.
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = result['docs'][doc_id] = {'pages': {}}
            # Close each PDF as soon as it has been parsed (the original
            # leaked one handle per file).
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }
        # The context manager guarantees the JSON is flushed and closed.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing folder and carry on.
        # ``Exception`` keeps KeyboardInterrupt / SystemExit propagating.
        print("Error " + case_path)
    return None
示例6: parse_pdf
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def parse_pdf(self, fp):
    """Process every page of a PDF and hand annotated pages to the parser.

    Args:
        fp: Binary file object of the PDF, passed to ``PDFParser``.

    For each page with a truthy ``annots`` attribute,
    ``self.parse_annotations(pgnum, page)`` is called with the 0-based page
    index.  Nothing is returned.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The original first built a PDFPageAggregator-based interpreter and then
    # immediately overwrote it; only the plain PDFDevice pair was ever used.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
示例7: extract
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extract(self, max_page_num=None):
    """Process the document's pages and store each layout by its ``pageid``.

    Args:
        max_page_num: Optional upper bound.  Iteration stops at the first
            layout whose ``pageid`` is greater than this value, and that
            layout is not stored.  ``None`` (the default) means no limit.

    Layouts are stored in ``self._pages`` keyed by ``layout.pageid``.
    """
    for page in PDFPage.create_pages(self._document):
        self._interpreter.process_page(page)
        layout = self._device.get_result()
        # The id is read from the layout, so the page has to be processed
        # before the limit can be checked.
        if max_page_num is not None and layout.pageid > max_page_num:
            break
        self._pages[layout.pageid] = layout
示例8: p2t
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file; opened in append mode as UTF-8 and
            only touched for pages that contain at least one horizontal
            text box.

    Progress and failure messages are printed; nothing is returned.  If the
    document cannot be constructed, a message is printed and the function
    returns without writing.
    """
    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            print(sourcefile + ' :pdf未正確下載')
            return
        # Report documents that disallow text extraction.  As in the
        # original, extraction is still attempted afterwards.
        if not document.is_extractable:
            print(sourcefile + ' :不允許提取文本')
        # Create a PDF resource manager to store shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (defaults).
        laparams = LAParams()
        # Page aggregator device that collects each page's layout.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout object produced for this page.
            layout = device.get_result()
            texts = [
                x.get_text() + '\n'
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            if texts:
                # Write text through a UTF-8 text file.  The original did
                # ``.encode('utf-8') + '\n'``, which is bytes + str and
                # raises TypeError on Python 3.  One open per page replaces
                # one open per text box.
                with open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(texts)
        print(sourcefile + ' 已轉為 ' + outfile)
##############################################把doc轉為txt##############################################
# 調用之前要確保你在linux 下裝了catdoc
示例9: get_title_from_io
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def get_title_from_io(pdf_io):
    """Guess the title of a PDF from the text blocks of its first page.

    The title is taken from the block with the largest font size; among
    those, the one with the highest y coordinate (closest to the top) wins.

    Args:
        pdf_io: Binary stream of the PDF, passed to ``PDFParser``.

    Returns:
        The title string, or ``None`` when the document does not allow
        extraction.
    """
    # pylint: disable=too-many-locals
    # If the PDF is password protected, PDFDocument's 2nd param is the password.
    document = PDFDocument(PDFParser(pdf_io))
    if not document.is_extractable:
        return None

    resources = PDFResourceManager()
    dev = TextOnlyDevice(resources)
    block_interpreter = TextOnlyInterpreter(resources, dev)
    text_buffer = StringIO()
    converter = TextConverter(resources, text_buffer, laparams=LAParams())
    text_interpreter = PDFPageInterpreter(resources, converter)

    # Only the first page is fed to both interpreters.
    for page in PDFPage.create_pages(document):
        block_interpreter.process_page(page)
        text_interpreter.process_page(page)
        break

    converter.close()
    first_page_text = text_buffer.getvalue()
    text_buffer.close()
    dev.recover_last_paragraph()

    verbose('all blocks')
    for blk in dev.blocks:
        verbose(blk)

    # Index 1 of a block holds the font size: keep the largest one.
    max_tfs = max(blk[1] for blk in dev.blocks)
    verbose('max_tfs: ', max_tfs)
    largest = [blk for blk in dev.blocks if blk[1] == max_tfs]

    # Index 3 holds the y coordinate: the highest is closest to the top.
    max_y = max(blk[3] for blk in largest)
    verbose('max_y: ', max_y)
    found_blocks = [blk for blk in largest if blk[3] == max_y]
    verbose('found blocks')
    for blk in found_blocks:
        verbose(blk)

    title = ''.join(found_blocks[0][4]).strip()
    # Retrieve missing spaces if needed.
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)
    # Normalise whitespace when the title contains a space.
    if " " in title:
        title = " ".join(title.split())
    return title
示例10: extractHighlights
# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extractHighlights(filename,anno,verbose=True):
    '''Extract highlighted texts from a PDF.

    filename: path handed to ``init`` to obtain the pdfminer document,
              interpreter and device.
    anno: object providing ``hlpages`` (page numbers with highlights),
          ``highlights`` (indexed by 1-based page number) and ``meta``
          (with 'title', 'citationkey' and 'tags' entries).
    verbose: not used by this function.

    Returns a list of ``Anno`` objects, one per text box in which
    ``findStrFromBox`` found highlighted text; empty when ``hlpages`` is empty.
    '''
    hlpages = anno.hlpages
    if len(hlpages) == 0:
        return []

    # Get pdfminer instances.
    document, interpreter, device = init(filename)

    hltexts = []
    for pageno, page in enumerate(PDFPage.create_pages(document), start=1):
        # Only pages that carry highlights are processed.
        if pageno not in hlpages:
            continue

        anno_total = len(anno.highlights[pageno])
        anno_found = 0

        interpreter.process_page(page)
        layout = device.get_result()

        # Sort boxes diagonally, then refine the ordering.
        boxes = fineTuneOrder(sortDiag(layout))

        for box in boxes:
            # Exact type match on purpose: subclasses are skipped, exactly
            # as with the original ``type(...) != ...`` comparisons.
            if type(box) not in (LTTextBox, LTTextBoxHorizontal):
                continue

            found_text, found_count = findStrFromBox(anno.highlights[pageno], box)

            if found_count > 0:
                # Attach the page/document metadata to the text.
                found_text = Anno(
                    found_text,
                    ctime=getCtime(anno.highlights[pageno]),
                    title=anno.meta['title'],
                    page=pageno,
                    citationkey=anno.meta['citationkey'],
                    tags=anno.meta['tags'],
                )
                hltexts.append(found_text)

            # Stop scanning this page once every highlight is accounted for.
            anno_found += found_count
            if anno_total == anno_found:
                break

    return hltexts