本文整理汇总了Python中PyPDF2.PdfFileReader类的典型用法代码示例。如果您正苦于以下问题：Python PyPDF2.PdfFileReader的具体用法？Python PyPDF2.PdfFileReader怎么用？Python PyPDF2.PdfFileReader使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块PyPDF2的用法示例。
在下文中一共展示了PyPDF2.PdfFileReader的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: pdf_meta
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build the ``BookMeta`` record for a PDF file.

    When the module-level ``use_pdf_meta`` flag is truthy, author, title and
    subject are taken from the PDF's document information. Missing values
    (or a PDF without document information) fall back to ``u'Unknown'`` for
    the author and ``original_file_name`` for the title.

    Args:
        tmp_file_path: Path of the PDF on disk.
        original_file_name: Name used as the title when the PDF has none.
        original_file_extension: Passed through as ``extension``.

    Returns:
        BookMeta: Metadata record with a cover produced by ``pdf_preview``.
    """
    author = u'Unknown'
    title = original_file_name
    subject = ""
    if use_pdf_meta:
        # Read every field while the file is still open, then let ``with``
        # close the handle so it is not leaked.
        with open(tmp_file_path, 'rb') as pdf_file:
            doc_info = PdfFileReader(pdf_file).getDocumentInfo()
            if doc_info:
                author = doc_info.author if doc_info.author else u'Unknown'
                title = doc_info.title if doc_info.title else original_file_name
                subject = doc_info.subject
    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags="",
        series="",
        series_id="",
        languages="")
示例2: run
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def run(self, params={}):
    """Extract the text of a base64-encoded PDF.

    Args:
        params (dict): Must contain ``'contents'``, the base64-encoded PDF.
            The default dict is only read, never mutated.

    Returns:
        dict: ``{"output": text}`` where ``text`` is the concatenated text of
        all pages with newlines removed.

    Raises:
        Exception: If ``contents`` is missing/empty or cannot be decoded, or
            if text extraction fails. The error is logged, then re-raised.
    """
    import io  # local import: parse from memory instead of a temp file

    try:
        contents = params.get('contents')
        if not contents:
            raise Exception("File contents missing!")
        pdf_bytes = base64.b64decode(contents)
    except Exception as e:
        # The message needs a %s placeholder, otherwise the logging module
        # reports a formatting error instead of the message.
        self.logger.error("File contents missing: %s", e)
        raise
    try:
        # An in-memory stream avoids a fixed-name "temp.pdf" in the working
        # directory (clobbered by concurrent runs) and an unclosed handle.
        reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_bytes))
        pdftext = "".join(
            reader.getPage(page).extractText().replace('\n', '')
            for page in range(reader.numPages)
        )
    except Exception as e:
        self.logger.info("An error occurred while extracting text: %s", e)
        raise
    return {"output": pdftext}
示例3: pdf_date
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_date(path):
    """
    Extract the creation date from PDF file metadata.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date, parsed from the first run of digits in
        the ``/CreationDate`` entry (naive; no timezone is applied).

    Raises:
        KeyError: If the document info has no ``/CreationDate`` entry.
        AttributeError: If the entry contains no digits.
        ValueError: If the digits do not match ``%Y%m%d%H%M%S``.
    """
    reader = PdfFileReader(path)
    stamp = reader.documentInfo['/CreationDate']
    # Get rid of `D:` prefix and timezone: the first digit run starts after
    # the prefix and stops at the first non-digit character.
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)
    return datetime.strptime(match.group(), '%Y%m%d%H%M%S')
示例4: join_ocred_pdf
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def join_ocred_pdf(self):
    """Join the OCR'ed PDF files into ``<tmp_dir><prefix>-ocr.pdf``.

    Every file matching ``<tmp_dir><prefix>*.pdf`` is appended in sorted
    name order. If no such file exists, an error is printed,
    ``self.cleanup()`` is called and the process exits with status 1.

    Raises:
        SystemExit: With code 1 when there is nothing to join.
    """
    # Join PDF files into one file that contains all OCR "backgrounds"
    text_pdf_file_list = sorted(glob.glob(self.tmp_dir + "{0}*.{1}".format(self.prefix, "pdf")))
    self.debug("We have {0} ocr'ed files".format(len(text_pdf_file_list)))
    if not text_pdf_file_list:
        eprint("No PDF files generated after OCR. This is not expected. Aborting.")
        self.cleanup()
        # The bare ``exit`` helper is installed by the ``site`` module and is
        # not guaranteed to exist; raising SystemExit is what it does anyway.
        raise SystemExit(1)
    pdf_merger = PyPDF2.PdfFileMerger()
    try:
        for text_pdf_file in text_pdf_file_list:
            pdf_merger.append(PyPDF2.PdfFileReader(text_pdf_file, strict=False))
        pdf_merger.write(self.tmp_dir + self.prefix + "-ocr.pdf")
    finally:
        # Release the merger even when appending or writing fails.
        pdf_merger.close()
    self.debug("Joined ocr'ed PDF files")
示例5: encryptPDFs
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def encryptPDFs(root, password):
    """Encrypts all not-yet-encrypted pdfs found by a folder walk

    For each ``<name>.pdf`` an encrypted copy is written to
    ``untitled folder/<name>_encrypted.pdf`` next to the source file; the
    source file itself is left untouched.

    Args:
        root (str): folder path to walk
        password (str): password to encrypt pdfs with

    Returns:
        None
    """
    for folder, _subfolders, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.pdf'):
                continue
            filepath = os.path.join(os.path.abspath(folder), filename)
            # The source must stay open until the writer has written its
            # pages; ``with`` closes it afterwards (also on ``continue``).
            with open(filepath, 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                if pdfReader.isEncrypted:
                    continue
                pdfWriter = PyPDF2.PdfFileWriter()
                for pageNum in range(pdfReader.numPages):
                    pdfWriter.addPage(pdfReader.getPage(pageNum))
                pdfWriter.encrypt(password)

                # splitext touches only the final extension, so a name like
                # "a.b.pdf" becomes "a.b_encrypted.pdf" rather than having
                # the suffix inserted at every dot.
                stem, extension = os.path.splitext(filename)
                outDir = os.path.join(os.path.dirname(filepath), 'untitled folder')
                if not os.path.isdir(outDir):
                    os.makedirs(outDir)
                newPath = os.path.join(outDir, stem + '_encrypted' + extension)
                with open(newPath, 'wb') as resultPdf:
                    pdfWriter.write(resultPdf)
示例6: breakPassword
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def breakPassword(filename):
    """Breaks a single word password of a PDF

    Every word of ``dictionary.txt`` (one word per line, read from the
    current working directory) is tried as written, capitalized and
    lower-cased, in that order.

    Args:
        filename (str): Filename for encrypted pdf

    Returns:
        str or None: the first candidate that decrypts the PDF, or None when
        no candidate works
    """
    with open(filename, 'rb') as encryptedFile:
        pdfReader = PyPDF2.PdfFileReader(encryptedFile)
        with open('dictionary.txt') as words:
            # splitlines() also strips "\r", so CRLF dictionaries still work.
            wordList = words.read().splitlines()
        for word in wordList:
            for candidate in (word, word.capitalize(), word.lower()):
                if pdfReader.decrypt(candidate):
                    return candidate
    return None
示例7: encrypt
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def encrypt(out_pdf, password):
    """Encrypt the PDF at ``out_pdf`` in place.

    The encrypted document is first written to ``INTERMEDIATE_ENCRYPT_FILE``
    and then moved over ``out_pdf``.

    Args:
        out_pdf: Path of the PDF to encrypt; overwritten with the result.
        password: Password passed to ``PdfFileWriter.encrypt``.
    """
    print('Encrypting the document')
    # The input must stay open until the writer has written its pages;
    # ``with`` guarantees it is closed afterwards, also on failure.
    with open(out_pdf, "rb") as in_file:
        input_pdf = PyPDF2.PdfFileReader(in_file)
        output_pdf = PyPDF2.PdfFileWriter()
        output_pdf.appendPagesFromReader(input_pdf)
        output_pdf.encrypt(password)
        # Intermediate file
        with open(INTERMEDIATE_ENCRYPT_FILE, "wb") as out_file:
            output_pdf.write(out_file)
    # Move the intermediate file over the original. os.replace overwrites an
    # existing destination on every platform, whereas os.rename fails on
    # Windows when the destination exists (and here it always does).
    os.replace(INTERMEDIATE_ENCRYPT_FILE, out_pdf)
示例8: Analyze_Metadata_pdf
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and collect key values.

    Previously unseen ``/Author``, ``/Producer`` and ``/Creator`` values are
    appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Those three lists are
    then appended to the module-level ``metadata_files`` list.

    Args:
        filename: Path of the PDF file to analyze.
    """
    # Tracked metadata keys and the module-level list that collects each.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # open() replaces the Python 2-only file() builtin; ``with`` closes the
    # handle once the metadata has been read.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() returns None when the PDF has no Info dictionary.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # format() also copes with values that are not plain strings.
            print(u' - {0}:{1}'.format(meta, value))
            collected = tracked.get(meta)
            if collected is not None and value not in collected:
                collected.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call;
    # confirm callers expect metadata_files to grow this way.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
示例9: Analyze_Metadata_pdf
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-information entries and collect key values.

    Previously unseen ``/Author``, ``/Producer`` and ``/Creator`` values are
    appended to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Those three lists are
    then appended to the module-level ``metadata_files`` list.

    Args:
        filename: Path of the PDF file to analyze.
    """
    # Tracked metadata keys and the module-level list that collects each.
    tracked = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # open() replaces the Python 2-only file() builtin; ``with`` closes the
    # handle once the metadata has been read.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() returns None when the PDF has no Info dictionary.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # format() also copes with values that are not plain strings.
            print(u' - {0}:{1}'.format(meta, value))
            collected = tracked.get(meta)
            if collected is not None and value not in collected:
                collected.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call;
    # confirm callers expect metadata_files to grow this way.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
#print metadata_files
####### FUNCTION AnalyzeMetadata doc ######
示例10: GrepPDF
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def GrepPDF(self, path):
    """Search the text content of a PDF file for the configured keywords.

    Args:
        path (str): PDF file path.

    Returns:
        set[str]: lower-cased, unique occurrences of every match of
        ``self._keywords`` (matched case-insensitively).
    """
    with open(path, 'rb') as stream:
        reader = PyPDF2.PdfFileReader(stream)
        page_texts = [
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        ]
    # Each page's text is preceded by a newline in the searched string.
    text = ''.join('\n' + page_text for page_text in page_texts)
    hits = re.findall(self._keywords, text, re.IGNORECASE)
    return {hit.lower() for hit in hits}
示例11: merge_pdf
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def merge_pdf(input_folder, output_file):
    """Merge every ``.pdf`` file of a folder into one PDF.

    Files are merged in sorted file-name order and the result is written to
    ``output_file + '.pdf'``.

    Args:
        input_folder: Folder whose ``.pdf`` files are merged.
        output_file: Output path without the ``.pdf`` extension.
    """
    pdf2merge = sorted(
        name for name in os.listdir(input_folder) if name.endswith('.pdf'))
    pdfWriter = PyPDF2.PdfFileWriter()
    # The inputs must stay open until the writer has produced its output, so
    # they are collected here and closed together at the end.
    openInputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            openInputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))
        # Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in openInputs:
            pdfFileObj.close()
示例12: _destinations_in_two_columns
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def _destinations_in_two_columns(pdf, destinations, cutoff=3):
    """
    Check if the named destinations are organized along two columns (heuristic)

    @param pdf: a PdfFileReader object
    @param destinations: iterable of 2-item pairs; the second item of each
        pair is the destination handed to _destination_position, the first
        item is ignored
    @param cutoff: tunes the heuristic: if 'cutoff' destinations in the
        would-be second column start at the same position, return True
    @return: True as soon as one x position has been seen 'cutoff' times
        among the would-be second-column destinations, otherwise False
    """
    xpos_count = {}
    for _, dest in destinations:
        # Look the position up once per destination instead of once for the
        # column test and again for the x coordinate.
        position = _destination_position(pdf, dest)
        # Only destinations in the would-be second column are counted.
        if position[1] != 1:
            continue
        xpos = position[3]  # x coordinate of the destination
        xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
        if xpos_count[xpos] >= cutoff:
            return True
    return False
示例13: parse_pdf_pypdf2
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def parse_pdf_pypdf2(self, f, fpath):
    """Parse a PDF with PyPDF2 and hand each page's text to ``parse_page``.

    ``self.dedup_store`` is reset to an empty set when ``self.dedup`` is
    truthy. The handler's header is printed before the first page and its
    footer after the last one; pages are numbered from 1.

    Args:
        f: PDF source passed to ``PdfFileReader`` (non-strict mode).
        fpath: Path reported to the handler and to ``parse_page``.
    """
    try:
        reader = PdfFileReader(f, strict=False)
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        for number, page in enumerate(reader.pages, 1):
            self.parse_page(fpath, page.extractText(), number)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        # Interrupts and exit requests always propagate to the caller.
        raise
示例14: pdf_page_to_png
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: A new wand.image.Image whose format is png.
    """
    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    # Write the requested page into an in-memory single-page PDF.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))
    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() returns a converted copy and leaves the receiver as it
    # was, so the copy is what must be returned. The intermediate PDF image
    # is closed once the copy exists.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")
示例15: check_nb_pages
# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def check_nb_pages(self, data):
    """
    Does this PDF contain enough pages?

    :param data: raw PDF content (bytes).
    :return: True when the PDF has more than two pages; False when it has
        fewer or when PyPDF2 raises PyPdfError while reading it.
    """
    import io  # local import: PDF content is binary, so wrap it in BytesIO

    try:
        # A text StringIO cannot hold bytes on Python 3; BytesIO works for
        # byte strings on both Python 2 and 3.
        reader = PyPDF2.PdfFileReader(io.BytesIO(data))
        num_pages = reader.getNumPages()
    except PyPdfError:
        return False
    print("num pages: %d" % num_pages)
    return num_pages > 2