当前位置: 首页>>代码示例>>Python>>正文


Python pdfparser.PDFDocument类代码示例

本文整理汇总了Python中pdfminer.pdfparser.PDFDocument的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument类的具体用法?Python PDFDocument怎么用?Python PDFDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了PDFDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: harvest_file

 def harvest_file(self, path):
     """Extract the title and the text content of the PDF at ``path``.

     Returns a dict with the keys ``'title'``, ``'content'`` and
     ``'kind'`` (always ``'PDF'``). The title falls back to an empty
     string when the document carries no Info dictionary or no
     ``Title`` entry.
     """
     with open(path, 'rb') as fp:
         # FIXME: how do we know which encoding to use? Should we
         # use 'chardet' to detect it?
         encoding = 'utf-8'
         parser = PDFParser(fp)
         if HAS_PDFMINER_3K:
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
         else:
             doc = PDFDocument(parser)
         # ``doc.info`` is a list and may be empty; indexing it blindly
         # would raise IndexError for PDFs without an Info dictionary.
         info = doc.info[0] if doc.info else {}
         title = info.get('Title', '')
         if isinstance(title, PDFObjRef):
             title = title.resolve()
         if isinstance(title, bytes):
             # This may not be necessary with pdfminer3k.
             try:
                 title = title.decode(encoding)
             except UnicodeDecodeError:
                 logger.warning('Could not correctly decode title of "%s".', path)
                 title = title.decode(encoding, 'ignore')
         # The parser has moved the file position; rewind before the
         # content extraction reads the same file object again.
         fp.seek(0)
         content = extract_content(fp, encoding).strip()
         try:
             content = content.decode(encoding)
         except UnicodeDecodeError:
             logger.warning('Could not correctly decode content of "%s".', path)
             content = content.decode(encoding, 'ignore')
     return {
         'title': title,
         'content': content,
         'kind': 'PDF',
     }
开发者ID:Polyconseil,项目名称:dokang_pdf,代码行数:34,代码来源:__init__.py

示例2: create_pages

    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` row per parsed page.

        Pages are numbered starting at 1 and attached to ``self.document``.
        Nothing is saved when the document does not allow text extraction.
        The method returns ``None``.
        """
        from public_project.models import Page

        # Create a parser bound to the file object and a document object
        # that stores the document structure, then connect the two.
        parser = PDFParser(self.pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the (empty) password for initialization.
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        if not doc.is_extractable:
            # Previously this fell through to the loop below with
            # ``doc_pages`` unbound and crashed with UnboundLocalError.
            return

        doc_pages = self._parse_pages(doc)
        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
开发者ID:atassumer,项目名称:django-public-project,代码行数:28,代码来源:doc_scanner.py

示例3: extractContent

def extractContent(file):
    """Return the text of every page of the PDF at path ``file``.

    Progress is printed to stdout (one line per page). The text is
    rendered through a ``TextConverter`` configured with the UTF-8 codec
    and returned as the accumulated buffer value.
    """
    print("extractContent")

    fp = open(file, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()

        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # NOTE: the document's ``is_extractable`` flag is deliberately
            # not checked here; every page is processed regardless.
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            # Release the device even when a page fails to process.
            device.close()
    finally:
        # Always release the file handle, including on parse errors.
        fp.close()

    return outfp.getvalue()
开发者ID:CaliopeProject,项目名称:CaliopeServer,代码行数:29,代码来源:PDFProcessor.py

示例4: get_metadata

    def get_metadata(self):
        """Return metadata from both the info field (older PDFs) and XMP
        (newer PDFs).

        The return value is a .modules.metadata.Metadata object; it is also
        stored on ``self.metadata``.
        """
        metadata = Metadata()
        # ``with`` guarantees the file is closed even if parsing raises.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Only the most useful XMP namespaces are added.
                if "xap" in xmp_dict:
                    metadata.add(xmp_dict["xap"])
                if "pdf" in xmp_dict:
                    metadata.add(xmp_dict["pdf"])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
开发者ID:staffanm,项目名称:protokollen,代码行数:28,代码来源:pdf.py

示例5: WithPdf

 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, and apply the function, returning the results.

     ``fn`` is called as ``fn(doc, *args)`` only when the document allows
     text extraction. A truthy ``password`` replaces ``self.password``
     before the document is initialized. Returns ``None`` when the file
     cannot be opened/read (IOError) or extraction is not allowed.
     """
     result = None
     try:
         # ``with`` closes the file even when parsing or ``fn`` raises;
         # previously the handle leaked on any exception.
         with open(pdfdoc, 'rb') as fp:
             # create a parser object associated with the file object
             parser = PDFParser(fp)
             # create a PDFDocument object that stores the document structure
             doc = PDFDocument()
             # connect the parser and document objects
             parser.set_document(doc)
             doc.set_parser(parser)
             # supply the password for initialization
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 # apply the function and return the result
                 result = fn(doc, *args)
     except IOError:
         # the file doesn't exist or similar problem
         pass
     return result
开发者ID:i11uminator,项目名称:bookservice,代码行数:28,代码来源:MyPdfMiner.py

示例6: Pdf

class Pdf(object):
    """Thin wrapper around a pdfminer document built from an open file."""

    def __init__(self, pdf_file):
        """Parse ``pdf_file`` (an open binary file object) into a document."""
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # The original referenced ``initialize`` without calling it (and
        # before the parser was attached), so it never ran. Call it once
        # the parser is connected, with the default (empty) password.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages in the document."""
        return len(tuple(self._doc.get_pages()))

    def to_text(self):
        """Render every page to text and return it decoded from UTF-8.

        Undecodable bytes are dropped (``'ignore'``).
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:25,代码来源:pdf.py

示例7: getPDFMetadata

def getPDFMetadata(path):
    """Return the metadata dictionary of the PDF at ``path``.

    The result is the first entry of the document's info list, or an empty
    dict when the list is empty. When the catalog has a ``Metadata`` stream,
    the dictionary produced by ``xmp_to_dict`` is merged into the result on
    a best-effort basis.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # ``doc.info`` is a list. The original called ``.update()`` on the
        # list itself, which always raised and was silently swallowed, and
        # then indexed ``[0]`` even when the list was empty.
        result = doc.info[0] if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # The raw XMP payload is a string, not a mapping, so it cannot
            # be merged with ``dict.update``; only the parsed form is used.
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best effort: malformed XMP must not hide the info fields.
                pass

    return result
开发者ID:tanmoydeb07,项目名称:ebookreader,代码行数:29,代码来源:readerUtilities.py

示例8: getData

def getData(fileName):
    """Print selected Info fields of the PDF at ``fileName``.

    For each cross-reference section the Author, Company, Producer and
    Creator entries are printed when present.

    Returns ``"error"`` when the document cannot be parsed,
    ``"Empty metadata"`` when no Info dictionary has been found, the caught
    exception object when reading the metadata fails, and ``None`` otherwise.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Close the file on every path, including the "error" return.
        fp.close()

    try:
        # Bound up front so that a trailer without an Info entry yields
        # "Empty metadata" instead of an unbound-name failure.
        info = None
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
            metadata = info
            if metadata is None:
                return "Empty metadata"
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in metadata:
                    print(key + " " + metadata[key])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
开发者ID:jmortega,项目名称:europython_ethical_hacking,代码行数:32,代码来源:metadataPDF.py

示例9: getData

	def getData(self):
		"""Load the Info dictionary of ``self.fname`` into the instance.

		The document is initialized with ``self.password``. On success the
		Info dictionary is stored on ``self.metadata`` and ``self.raw``.

		Returns ``"error"`` when the document cannot be parsed,
		``"Empty metadata"`` when ``self.raw`` ends up ``None``, ``"ok"`` on
		success, or the caught exception object when reading the trailer
		fails.
		"""
		doc = PDFDocument()
		fp = open(self.fname, 'rb')
		try:
			parser = PDFParser(fp)
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				return "error"
			parser.close()
		finally:
			# Close the file on every path, including the "error" return.
			fp.close()
		try:
			# Bound up front so a trailer without an Info entry does not
			# trigger an unbound-name failure.
			info = None
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
				self.metadata = info
				self.raw = info
			if self.raw is None:
				return "Empty metadata"
			return "ok"
		except Exception as e:
			# The message used to sit after the ``return`` and never ran.
			print("\t [x] Error in PDF extractor, Trailer Info")
			return e
开发者ID:TechByTom,项目名称:metagoofil,代码行数:32,代码来源:metadataPDF.py

示例10: PdfSerializer

class PdfSerializer(object):
    """Extract the text of a PDF file and optionally write it to a .txt file."""

    def __init__(self, filename):
        """Open ``filename`` and build the pdfminer document for it."""
        self.__filename = filename

        # The handle stays open after construction because getString()
        # reads pages from the document later; call close() when done.
        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the document cannot be parsed.
            self.__fp.close()
            raise

    def close(self):
        """Release the underlying PDF file handle."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a ``.txt`` extension.

        Non-ASCII characters are replaced by ``?``.
        """
        import os

        text = self.getString()
        # Swap only the extension. A blind ``replace(".pdf", ".txt")`` left
        # the path unchanged for names such as "X.PDF" (so the source PDF
        # was opened for writing and truncated) and also rewrote ".pdf"
        # inside directory names.
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii','replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as produced by ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
开发者ID:vsharonlynn,项目名称:cs3219-project-CViA,代码行数:26,代码来源:pdf_serializer.py

示例11: initialize_pdf_miner

def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the PDF in ``fh``.

    Returns a ``(doc, interpreter, device)`` tuple where ``device`` is a
    ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
    ``interpreter`` renders pages into that device.

    Raises ValueError when the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter that feeds it.
    # (A plain PDFDevice/interpreter pair used to be built first and was
    # immediately discarded; only the aggregator pair is ever returned.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:AymanYac,项目名称:OCP-Graduation-Project,代码行数:31,代码来源:pdftables.py

示例12: pdf_function

    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open ``pdf_doc`` and apply ``function`` to the parsed document.

        ``function`` is called as ``function(doc, *args, **kwargs)`` only
        when the document allows text extraction. Returns its result, or
        ``None`` when the file cannot be opened/read (IOError) or extraction
        is not allowed.
        """
        result = None
        try:
            # ``with`` closes the file even when parsing or ``function``
            # raises; previously the handle leaked on any exception.
            with open(pdf_doc, 'rb') as fp:
                # create a parser object associated with the file object
                parser = PDFParser(fp)
                # create a PDFDocument object that stores the document structure
                doc = PDFDocument()
                # connect the parser and document objects
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(password)

                if doc.is_extractable:
                    # apply the function and return the result
                    result = function(doc, *args, **kwargs)
        except IOError:
            # the file doesn't exist or similar problem
            pass
        return result
开发者ID:ArcainOne,项目名称:anathema,代码行数:25,代码来源:book_parser.py

示例13: get_pdf_metadata

def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the Info metadata of a PDF given by local path or http(s) URL.

    With ``textmode`` every entry is printed as ``<source>:<name>=<value>``;
    otherwise the info value is printed once, preceded by ``prefix``.
    ``basicauth`` is sent as the credentials part of a ``Basic``
    Authorization header when fetching a URL.
    """
    # NOTE(review): ``args`` is not defined in this function; presumably it
    # is a module-level list of command-line arguments -- confirm.
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            # Close the response even when the download fails midway.
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Release the handle on parse errors too.
        fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(
                    fileOrUrl, name, val
                ))
    else:
        val = doc.info
        # Unwrap a single-entry info list so only the dictionary is shown.
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
开发者ID:kristerhedfors,项目名称:bin,代码行数:31,代码来源:pdfmeta.py

示例14: parse_pdf_pdfminer

    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text page by page from ``f`` and feed it to ``parse_page``.

        ``f`` is the open PDF file object and ``fpath`` the path reported
        to the handler. Each page's text is passed to ``self.parse_page`` as
        UTF-8 bytes together with its 1-based page number. Any exception
        other than KeyboardInterrupt/SystemExit is reported through
        ``self.handler.print_error`` instead of being raised.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            for page_num, page in enumerate(doc.get_pages(), 1):
                # A fresh buffer and converter per page keeps each page's
                # text separate.
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
                retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
开发者ID:sebdraven,项目名称:ioc_parser,代码行数:31,代码来源:iocp.py

示例15: pdf_isvalid

def pdf_isvalid(filelike):
    ''' Tell whether ``filelike`` holds a valid, extractable pdf.
    @param filelike: filelike object, seekable
    @return: True if the header matches PDF_MAGIC and the parsed document
             is extractable, else False
    '''
    log = logging.getLogger()
    filelike.seek(0)

    # Guard clause: a wrong magic header means there is nothing to parse.
    header = filelike.read(len(PDF_MAGIC))
    if header != PDF_MAGIC:
        return False
    filelike.seek(0)

    try:
        pdf_parser = PDFParser(filelike)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        extractable = document.is_extractable
    except PDFException as excobj:
        log.warning("pdf has valid header but, still not valid pdf, exception was %r" %(excobj))
        extractable = False

    # Leave the stream rewound for the caller.
    filelike.seek(0)
    return True if extractable else False
开发者ID:emulbreh,项目名称:ecs,代码行数:26,代码来源:pdfutils.py


注:本文中的pdfminer.pdfparser.PDFDocument类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。