当前位置: 首页>>代码示例>>Python>>正文


Python PyPDF2.PdfFileReader方法代码示例

本文整理汇总了Python中PyPDF2.PdfFileReader方法的典型用法代码示例。如果您正苦于以下问题：Python PyPDF2.PdfFileReader方法的具体用法是什么？Python PyPDF2.PdfFileReader怎么用？有哪些PyPDF2.PdfFileReader的使用例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块PyPDF2的用法示例。


在下文中一共展示了PyPDF2.PdfFileReader方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: pdf_meta

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build a ``BookMeta`` record for a PDF file.

    When the module-level ``use_pdf_meta`` flag is truthy, author, title and
    subject are taken from the PDF's document information. A missing author
    falls back to ``u'Unknown'`` and a missing title to
    ``original_file_name``. When the flag is falsy, or the PDF carries no
    document information, the fallbacks are used and the description is
    ``""``.

    Args:
        tmp_file_path: Path of the PDF file to inspect.
        original_file_name: Title fallback; also passed to ``pdf_preview``.
        original_file_extension: Stored as ``extension`` on the result.

    Returns:
        A ``BookMeta`` whose cover is the value returned by
        ``pdf_preview(tmp_file_path, original_file_name)``.
    """
    author = u'Unknown'
    title = original_file_name
    subject = ""

    if use_pdf_meta:
        # Read every field while the file is still open: PyPDF2 may resolve
        # metadata values lazily from the underlying stream.
        with open(tmp_file_path, 'rb') as pdf_file:
            doc_info = PdfFileReader(pdf_file).getDocumentInfo()
            if doc_info:
                author = doc_info.author if doc_info.author else u'Unknown'
                title = doc_info.title if doc_info.title else original_file_name
                subject = doc_info.subject

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags="",
        series="",
        series_id="",
        languages="")
开发者ID:janeczku,项目名称:calibre-web,代码行数:27,代码来源:uploader.py

示例2: run

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def run(self, params={}):
        """Decode a base64-encoded PDF and return its extracted text.

        Args:
            params: Mapping whose ``'contents'`` entry holds the
                base64-encoded PDF. It is only read, never mutated.

        Returns:
            dict: ``{"output": text}`` where ``text`` is the text of all
            pages concatenated, with newlines removed from each page.

        Raises:
            Exception: If ``'contents'`` is missing or empty, or if decoding
                or text extraction fails. The error is logged and re-raised.
        """
        try:
            if params.get('contents'):
                pdfFile = base64.b64decode(params.get('contents'))
            else:
                raise Exception("File contents missing!")
        except Exception as e:
            # Use a %s placeholder: passing ``e`` as an extra argument without
            # one makes the logging call itself fail while formatting.
            self.logger.error("File contents missing: %s", e)
            raise
        try:
            # Close (and thereby flush) the write handle before reading the
            # file back, otherwise the reader may see a truncated file.
            with open("temp.pdf", 'wb') as temp_pdf:
                temp_pdf.write(pdfFile)
            with open('temp.pdf', 'rb') as pdf_stream:
                pdfReader = PyPDF2.PdfFileReader(pdf_stream)
                pdftext = "".join(
                    pdfReader.getPage(page).extractText().replace('\n', '')
                    for page in range(pdfReader.numPages)
                )
        except Exception as e:
            self.logger.info("An error occurred while extracting text: %s", e)
            raise
        return {"output": pdftext}
开发者ID:rapid7,项目名称:insightconnect-plugins,代码行数:23,代码来源:action.py

示例3: pdf_date

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_date(path):

    """
    Extract a date from PDF file metadata.

    The ``/CreationDate`` entry of the document information is read, its
    first run of digits is taken, and that run is parsed as
    ``YYYYMMDDHHMMSS``.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date (naive; any timezone suffix is ignored).

    Raises:
        KeyError: If the metadata has no ``/CreationDate`` entry.
        AttributeError: If the stamp contains no digits at all.
        ValueError: If the digits do not form a full ``YYYYMMDDHHMMSS``
            timestamp.
    """

    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )
开发者ID:davidmcclure,项目名称:open-syllabus-project,代码行数:24,代码来源:utils.py

示例4: join_ocred_pdf

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def join_ocred_pdf(self):
        """Merge the OCR output PDFs into a single ``<prefix>-ocr.pdf`` file.

        The inputs are the files in ``self.tmp_dir`` matching
        ``<prefix>*.pdf``, merged in sorted path order. When no such file
        exists, an error is printed, ``self.cleanup()`` is called and the
        process exits with status 1.
        """
        # Collect the PDF files that hold the OCR "backgrounds"
        pattern = self.tmp_dir + "{0}*.{1}".format(self.prefix, "pdf")
        ocr_pages = sorted(glob.glob(pattern))
        self.debug("We have {0} ocr'ed files".format(len(ocr_pages)))
        if not ocr_pages:
            eprint("No PDF files generated after OCR. This is not expected. Aborting.")
            self.cleanup()
            exit(1)
        else:
            merger = PyPDF2.PdfFileMerger()
            for page_path in ocr_pages:
                merger.append(PyPDF2.PdfFileReader(page_path, strict=False))
            merger.write(self.tmp_dir + self.prefix + "-ocr.pdf")
            merger.close()
        #
        self.debug("Joined ocr'ed PDF files")
开发者ID:LeoFCardoso,项目名称:pdf2pdfocr,代码行数:18,代码来源:pdf2pdfocr.py

示例5: encryptPDFs

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def encryptPDFs(root, password):
    """Encrypt every not-yet-encrypted PDF found while walking a folder tree.

    For each file whose name ends in ``.pdf``, an encrypted copy named
    ``<stem>_encrypted.pdf`` is written into the ``untitled folder``
    sub-directory next to the source file. Already encrypted PDFs are
    skipped.

    NOTE: the ``untitled folder`` directory is not created here; writing
    fails if it does not exist.

    Args:
        root (str): folder path to walk
        password (str): password to encrypt pdfs with

    Returns:
        None
    """
    for folder, _subfolders, file_names in os.walk(root):
        for file_name in file_names:
            if not file_name.endswith('.pdf'):
                continue

            filepath = os.path.join(os.path.abspath(folder), file_name)
            # The source must stay open until the writer has produced its
            # output, because pages are read from it lazily.
            with open(filepath, 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                if pdfReader.isEncrypted:
                    continue

                pdfWriter = PyPDF2.PdfFileWriter()
                for pageNum in range(pdfReader.numPages):
                    pdfWriter.addPage(pdfReader.getPage(pageNum))
                pdfWriter.encrypt(password)

                # splitext touches only the final extension, so a name such
                # as "a.b.pdf" becomes "a.b_encrypted.pdf".
                stem, extension = os.path.splitext(file_name)
                newPath = os.path.join(
                    os.path.dirname(filepath),
                    'untitled folder',
                    stem + '_encrypted' + extension,
                )
                with open(newPath, 'wb') as resultPdf:
                    pdfWriter.write(resultPdf)
开发者ID:kudeh,项目名称:automate-the-boring-stuff-projects,代码行数:27,代码来源:pdfParanoia.py

示例6: breakPassword

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def breakPassword(filename):
    """Try to find the single-word password of an encrypted PDF.

    Every line of ``dictionary.txt`` (read from the current working
    directory) is tried as-is, then capitalized, then lower-cased.

    Args:
        filename (str): Filename for encrypted pdf

    Returns:
        str or None: The first candidate accepted by the PDF, or None when
        no candidate works.
    """
    # Keep the PDF open while decrypting (the reader works on the open
    # stream) and make sure it is closed on every exit path.
    with open(filename, 'rb') as encryptedFile:
        pdfReader = PyPDF2.PdfFileReader(encryptedFile)

        with open('dictionary.txt') as words:
            wordList = words.read().split('\n')

        for word in wordList:
            for candidate in (word, word.capitalize(), word.lower()):
                if pdfReader.decrypt(candidate):
                    return candidate

    return None

示例7: encrypt

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def encrypt(out_pdf, password):
    """Encrypt a PDF file in place with the given password.

    The encrypted copy is first written to the path held by the
    module-level ``INTERMEDIATE_ENCRYPT_FILE`` and then moved over
    ``out_pdf``.

    Args:
        out_pdf: Path of the PDF to encrypt; it is overwritten.
        password: Password passed to the PDF writer's ``encrypt``.
    """
    print('Encrypting the document')

    output_pdf = PyPDF2.PdfFileWriter()

    # The input must stay open until the writer has finished, and must be
    # closed (even on error) before the file is replaced.
    with open(out_pdf, "rb") as in_file:
        input_pdf = PyPDF2.PdfFileReader(in_file)
        output_pdf.appendPagesFromReader(input_pdf)
        output_pdf.encrypt(password)

        # Intermediate file
        with open(INTERMEDIATE_ENCRYPT_FILE, "wb") as out_file:
            output_pdf.write(out_file)

    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the target already exists.
    os.replace(INTERMEDIATE_ENCRYPT_FILE, out_pdf)
开发者ID:PacktPublishing,项目名称:Python-Automation-Cookbook,代码行数:20,代码来源:watermarking_pdf.py

示例8: Analyze_Metadata_pdf

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def Analyze_Metadata_pdf(filename):
    """Print a PDF's metadata and record its author/producer/creator.

    Every metadata entry is printed. Values of ``/Author``, ``/Producer``
    and ``/Creator`` not seen before are appended to the module-level
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. On every call those three lists are then
    appended to the module-level ``metadata_files``.

    Args:
        filename: Path of the PDF file to analyse.
    """
    ####### FUNCTION AnalyzeMetadata ######
    # ``open`` and single-argument ``print(...)`` behave the same on
    # Python 2 and 3; the ``with`` block closes the file on every path.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        print(' - Document: ' + str(filename))
        # getDocumentInfo() yields None for a PDF without metadata.
        for meta in (metadata or {}):
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    #Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ###### 
开发者ID:n4xh4ck5,项目名称:RastLeak,代码行数:25,代码来源:downloadfiles.py

示例9: Analyze_Metadata_pdf

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def Analyze_Metadata_pdf(filename):
    """Print a PDF's metadata and record its author/producer/creator.

    Every metadata entry is printed. Values of ``/Author``, ``/Producer``
    and ``/Creator`` not seen before are appended to the module-level
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. On every call those three lists are then
    appended to the module-level ``metadata_files``.

    Args:
        filename: Path of the PDF file to analyse.
    """
    ####### FUNCTION AnalyzeMetadata ######
    # ``open`` and single-argument ``print(...)`` behave the same on
    # Python 2 and 3; the ``with`` block closes the file on every path.
    with open(filename, 'rb') as pdf_stream:
        metadata = PdfFileReader(pdf_stream).getDocumentInfo()
        print(' - Document: ' + str(filename))
        # getDocumentInfo() yields None for a PDF without metadata.
        for meta in (metadata or {}):
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    #Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ###### 
开发者ID:n4xh4ck5,项目名称:RastLeak,代码行数:25,代码来源:RastLeak_1_2.py

示例10: GrepPDF

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def GrepPDF(self, path):
    """Searches the text of a PDF file for the configured keywords.

    The text of every page is extracted, each page prefixed with a newline,
    and the combined text is searched case-insensitively with the
    ``self._keywords`` pattern.

    Args:
      path (str): PDF file path.

    Returns:
      set[str]: the distinct matches, lower-cased.
    """
    with open(path, 'rb') as stream:
      reader = PyPDF2.PdfFileReader(stream)
      page_count = reader.numPages
      page_texts = [
          '\n' + reader.getPage(index).extractText()
          for index in range(page_count)]
      text = ''.join(page_texts)
      hits = re.findall(self._keywords, text, re.IGNORECASE)
      matches = {hit.lower() for hit in hits}
    return matches
开发者ID:log2timeline,项目名称:dftimewolf,代码行数:22,代码来源:grepper.py

示例11: merge_pdf

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def merge_pdf(input_folder, output_file):
    """Merge all PDFs of a folder into a single PDF file.

    Files in ``input_folder`` whose names end in ``.pdf`` are merged in
    sorted name order and written to ``output_file + '.pdf'``.

    Args:
        input_folder: Directory containing the PDFs to merge.
        output_file: Output path without the ``.pdf`` extension.
    """
    pdf2merge = sorted(
        filename for filename in os.listdir(input_folder)
        if filename.endswith('.pdf')
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    # The inputs must stay open until the merged file has been written
    # (pages are read from them lazily), so track them and close them all
    # afterwards, including when an error interrupts the merge.
    open_inputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            open_inputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        #Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in open_inputs:
            pdfFileObj.close()
开发者ID:AlexandrovLab,项目名称:SigProfilerExtractor,代码行数:24,代码来源:subroutines.py

示例12: _destinations_in_two_columns

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def _destinations_in_two_columns(pdf, destinations, cutoff=3):
    """
    Check if the named destinations are organized along two columns (heuristic)

    @param pdf: a PdfFileReader object
    @param destinations:

    'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the
    would-be second column start at the same position, return True
    """
    # iterator for the x coordinates of refs in the would-be second column
    xpositions = (_destination_position(pdf, dest)[3] for (_, dest)
                  in destinations
                  if _destination_position(pdf, dest)[1] == 1)
    xpos_count = {}
    for xpos in xpositions:
        xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
        if xpos_count[xpos] >= cutoff:
            return True
    return False 
开发者ID:inspirehep,项目名称:refextract,代码行数:22,代码来源:pdf.py

示例13: parse_pdf_pypdf2

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def parse_pdf_pypdf2(self, f, fpath):
        """Extract the text of each PDF page with PyPDF2 and parse it.

        The handler's header is printed first, then each page's text is
        passed to ``self.parse_page`` with a 1-based page number, then the
        footer is printed. When ``self.dedup`` is truthy,
        ``self.dedup_store`` is reset to an empty set beforehand.

        Args:
            f: File object (or path) passed to ``PdfFileReader``.
            fpath: Path passed to the handler and to ``parse_page``.
        """
        try:
            reader = PdfFileReader(f, strict = False)

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            for page_num, page in enumerate(reader.pages, 1):
                text = page.extractText()
                self.parse_page(fpath, text, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
开发者ID:armbues,项目名称:ioc_parser,代码行数:20,代码来源:Parser.py

示例14: pdf_page_to_png

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    The page is copied into a new single-page PDF held in memory, which is
    then loaded by Wand at the requested resolution and converted to png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: The page as a png-format ``wand.image.Image``.
    """

    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() returns a NEW image and leaves the original untouched,
    # so the converted image must be the one returned. The intermediate
    # PDF-format image is released by the context manager.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")
开发者ID:airbnb,项目名称:knowledge-repo,代码行数:26,代码来源:image.py

示例15: check_nb_pages

# 需要导入模块: import PyPDF2 [as 别名]
# 或者: from PyPDF2 import PdfFileReader [as 别名]
def check_nb_pages(self, data):
        """
        Tell whether the PDF held in ``data`` has more than two pages.

        The page count is printed. When reading the PDF raises
        ``PyPdfError``, False is returned.
        """
        try:
            page_count = PyPDF2.PdfFileReader(StringIO(data)).getNumPages()
            print("num pages: %d" % page_count)
            return page_count > 2
        except PyPdfError:
            return False
开发者ID:dissemin,项目名称:oabot,代码行数:14,代码来源:classifier.py


注:本文中的PyPDF2.PdfFileReader方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。