当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.getobj方法代码示例

本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.getobj方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.getobj方法的具体用法?Python PDFDocument.getobj怎么用?Python PDFDocument.getobj使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfparser.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.getobj方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: dumppdf

# Required import: from pdfminer.pdfparser import PDFDocument [as alias]
# Note: getobj is a method of PDFDocument and cannot be imported on its own; call it on a PDFDocument instance.
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Object ids to look up with ``doc.getobj`` and dump; may be
            empty.
        pagenos: Container of zero-based page indexes to dump; may be empty.
        password: Password handed to ``doc.initialize``.
        dumpall: When true, every object is dumped through ``dumpallobjs``.
        codec: Forwarded to the dump helpers. When set, the content streams
            of the selected pages are dumped; otherwise the page attributes
            are dumped.

    When no object ids, no pages and no ``dumpall`` are requested, the
    trailers are dumped instead.
    """
    doc = PDFDocument()
    # open() replaces the Python-2-only file() builtin; the try/finally makes
    # sure the handle is released even when parsing or dumping raises.
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        fp.close()
    # Raw/binary output must not get a trailing newline appended.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
开发者ID:Adniel,项目名称:ComparePdf,代码行数:31,代码来源:dumppdf.py

示例2: dumppdf

# Required import: from pdfminer.pdfparser import PDFDocument [as alias]
# Note: getobj is a method of PDFDocument and cannot be imported on its own; call it on a PDFDocument instance.
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump selected parts of the PDF file ``fname`` to ``outfp``.

  Args:
    outfp: Writable file-like object that receives the dump.
    fname: Path of the PDF file to read.
    objids: Object ids to look up with ``doc.getobj`` and dump; may be empty.
    pagenos: Container of zero-based page indexes whose attributes are
      dumped; may be empty.
    password: Password handed to ``doc.initialize``.
    dumpall: When true, every object is dumped through ``dumpallobjs``.
    codec: ``'raw'`` writes a stream object's raw data, ``'binary'`` writes
      its decoded data; any other value is forwarded to ``dumpxml``.

  When no object ids, no pages and no ``dumpall`` are requested, the
  trailers are dumped instead.
  """
  doc = PDFDocument()
  # open() replaces the Python-2-only file() builtin; the try/finally makes
  # sure the handle is released even when parsing or dumping raises.
  fp = open(fname, 'rb')
  try:
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        is_stream = isinstance(obj, PDFStream)
        if is_stream and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif is_stream and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno, page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  finally:
    fp.close()
  # Raw/binary output must not get a trailing newline appended.
  if codec not in ('raw', 'binary'):
    outfp.write('\n')
开发者ID:frid,项目名称:PythonPool,代码行数:29,代码来源:dumppdf.py

示例3: MapFactory

# Required import: from pdfminer.pdfparser import PDFDocument [as alias]
# Note: getobj is a method of PDFDocument and cannot be imported on its own; call it on a PDFDocument instance.
def MapFactory(map_path):
    """Create the map object that matches the image stored in a PDF.

    The PDF at ``map_path`` is parsed, object id 6 is fetched and its
    ``Width``/``Height`` are compared against the ``WIDTH``/``HEIGHT`` of the
    known map classes, in a fixed order.

    Args:
        map_path: Path of the PDF file to inspect.

    Returns:
        An instance of the first matching map class, constructed from the
        PPM image built by ``_MakePPMImage`` and ``map_path``; ``None`` when
        the file cannot be opened or parsed, when object 6 is not a
        ``PDFStream`` carrying ``Width``, ``Height`` and ``ColorSpace``, or
        when the dimensions match no known class.
    """
    # `except Exception` keeps the best-effort "return None" contract without
    # also swallowing KeyboardInterrupt/SystemExit like a bare `except:`.
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    try:
        document = PDFDocument()

        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        # The image object on all IBGE PDFs is indexed
        # at ID 6. We also probe for a few properties.
        obj = document.getobj(6)
        if not obj or not isinstance(obj, PDFStream):
            return None

        for required in ("Width", "Height", "ColorSpace"):
            if required not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        # Order matters: the first class whose dimensions match wins.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in candidates:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                # The stream data is read here, before the file is closed.
                image = _MakePPMImage(width, height, obj.get_data())
                return map_class(image, map_path)

        return None
    finally:
        # The original never closed the file; release it on every path.
        map_file.close()
开发者ID:drott,项目名称:IBGETools,代码行数:55,代码来源:Map.py

示例4: extractembedded

# Required import: from pdfminer.pdfparser import PDFDocument [as alias]
# Note: getobj is a method of PDFDocument and cannot be imported on its own; call it on a PDFDocument instance.
def extractembedded(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Extract the embedded files of a PDF below the current directory.

    Every object listed in the document's xrefs is inspected; for each
    dictionary whose ``Type`` is the literal ``Filespec`` the referenced
    ``EmbeddedFile`` stream is written to the file name stored in the
    dictionary (``UF`` preferred, ``F`` as fallback).

    Args:
        outfp: Unused; kept for signature compatibility.
        fname: Path of the PDF file to read.
        objids: Unused; kept for signature compatibility.
        pagenos: Unused; kept for signature compatibility.
        password: Password handed to ``doc.initialize``.
        dumpall: Unused; kept for signature compatibility.
        codec: Unused; kept for signature compatibility.

    Raises:
        Exception: If a Filespec has no file name, if its reference is not an
            ``EmbeddedFile`` ``PDFStream``, or if the file name resolves to a
            location outside the current working directory.
        OSError: If the target file already exists (files are never
            overwritten) or cannot be created.
    """
    doc = PDFDocument()
    # open() replaces the Python-2-only file() builtin; the try/finally makes
    # sure the input handle is released on every path.
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)

        # Joining with '' appends exactly one platform-specific separator, so
        # the prefix check below does not depend on '/' being the separator.
        cwd = os.path.join(os.path.normpath(os.getcwd()), '')
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, dict):
                    continue
                objtype = obj.get('Type', '')
                if not (isinstance(objtype, PSLiteral) and objtype.name == 'Filespec'):
                    continue

                # Either key may be absent; .get() avoids a bare KeyError
                # when only 'F' is present.
                filename = obj.get('UF') or obj.get('F')
                if not filename:
                    raise Exception("unable to process PDF: Filespec object %r has no file name" % (objid,))
                fileref = obj['EF']['F']
                fileobj = doc.getobj(fileref.objid)
                if not isinstance(fileobj, PDFStream):
                    raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
                if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
                    raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))

                print("extracting %s" % (filename,))
                # The name comes from the document, so refuse anything that
                # resolves outside the current working directory.
                absfilename = os.path.normpath(os.path.abspath(filename))
                if not absfilename.startswith(cwd):
                    raise Exception("filename %s is trying to escape to parent directories.." % (filename))

                dirname = os.path.dirname(absfilename)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)

                # don't overwrite anything: O_EXCL makes creation fail if the
                # file exists. O_BINARY only exists on Windows.
                flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, 'O_BINARY', 0)
                fd = os.open(absfilename, flags)
                with os.fdopen(fd, 'wb') as f:
                    f.write(fileobj.get_data())
    finally:
        fp.close()
开发者ID:eug48,项目名称:pdfminer,代码行数:40,代码来源:dumppdf.py

示例5: __init__

# Required import: from pdfminer.pdfparser import PDFDocument [as alias]
# Note: getobj is a method of PDFDocument and cannot be imported on its own; call it on a PDFDocument instance.
class PDFMine:
	def __init__(self, filename):
		"""Open ``filename`` and prepare the document for extraction.

		Sets ``result``, ``filename``, ``fp``, ``parser``, ``doc`` and
		``pagecount`` on the instance and prints progress messages.

		Args:
			filename: Path of the PDF file to open.

		Raises:
			ValueError: If the document reports that it is not extractable.
				(The original ``raise()`` raised an empty tuple, which only
				surfaced as an unrelated TypeError.)
		"""
		self.result = {}
		self.filename = filename
		self.fp = open(filename, "rb")
		try:
			self.parser = PDFParser(self.fp)
			self.doc = PDFDocument()
			self.parser.set_document(self.doc)
			self.doc.set_parser(self.parser)
			self.doc.initialize()
			self.pagecount = self.pgcount()
			print("Page count %i" % self.pagecount)
			if not self.doc.is_extractable:
				print("Oops, error extracting %s" % self.filename)
				raise ValueError("PDF is not extractable: %s" % self.filename)
			print("Starting extraction of %s" % self.filename)
		except Exception:
			# Construction failed, so nobody will get the chance to call
			# close(); release the file handle here before propagating.
			self.fp.close()
			raise
		
	def close(self):
		self.fp.close()
		
	def pgcount(self):
		count=0;
		for page in self.doc.get_pages():
			count=count+1
		return count
		
	def save_video(self, targetdir):
		"""Saves all your videos to targetdir """
		for page in self.doc.get_pages():
			if (page.annots):
				obj=self.doc.getobj(page.annots.objid)
				for i in obj:
					annotobj=i.resolve()
					try:
						if (annotobj["Subtype"].name=='RichMedia'):
							linktype="media"
							data=annotobj["RichMediaContent"].resolve()
							dataobj=data["Assets"].resolve()
							fstream=dataobj["Names"][1].resolve()
							filename=fstream["F"]
							fdata=fstream['EF']['F'].resolve().get_data()
							f=open(os.path.join(targetdir,filename),"w")
							f.write(fdata)
							f.close()
					except:
						pass
		
	def _rect(self, bbox):
		""" Changes a bounding box into something we can use 
		with HTML (x,y,width,height measured from top left) """
		pgbox=self.pgbox
		pgwidth=round(abs(pgbox[0]-pgbox[2]))
		pgheight=round(abs(pgbox[1]-pgbox[3]))
		x=round(min(bbox[0], bbox[2]))
		y=pgheight-(round(max(bbox[1],bbox[3])))
		width=round(max(bbox[0], bbox[2])-min(bbox[0], bbox[2]))
		height=round(max(bbox[1], bbox[3])-min(bbox[1], bbox[3]))
		result={"x":x, "y":y, "width":width, "height":height}
		return result
		
	def _find_objid_pgnum(self, obj):
		"""Given a page, return the page number """
		i=0
		for page in self.doc.get_pages():
			i=i+1
			if self.doc.getobj(page.pageid)==obj:
				return i
		return False
	
	def parse_pages(self):
		result=[]
		i=0
		for page in self.doc.get_pages():
			self.pgbox=page.mediabox
			i=i+1
			print "==== Page %d ====" % i
			result.append(self._parse_page(page))
		return result
	
	def _parse_page(self, page):
		result=[]
		vids=self._parse_video(page)
		if len(vids)>0:
			result.extend(self._parse_video(page))
		links=self._parse_links(page)
		if len(links)>0:
			result.extend(links)
		comments=self._parse_comments(page)
		if len(comments)>0:
			result.extend(comments)
		return result
	
	def _parse_comments(self, page):
		result=[]
		rsrcmgr = PDFResourceManager()
		laparams = LAParams()
		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		interpreter.process_page(page)
#.........这里部分代码省略.........
开发者ID:10layer,项目名称:PDF-Mine,代码行数:103,代码来源:pdfmine.py


注:本文中的pdfminer.pdfparser.PDFDocument.getobj方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。