本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.getobj方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.getobj方法的具体用法？Python PDFDocument.getobj怎么用？Python PDFDocument.getobj使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.getobj方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import getobj [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids; each one is fetched with
            ``doc.getobj`` and written with ``dumpxml``.
        pagenos: Container of zero-based page indexes to dump.
        password: Password handed to ``doc.initialize``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Forwarded to ``dumpxml``/``dumpallobjs``.  When set, the
            content streams of the selected pages are dumped; otherwise
            only the page attributes are.  ``'raw'`` and ``'binary'``
            suppress the trailing newline.

    When ``objids``, ``pagenos`` and ``dumpall`` are all empty/false the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``with`` closes the input file even when parsing or dumping raises;
    # previously the handle leaked on every error path.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例2: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import getobj [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump.  A ``PDFStream`` object is
            written as its raw data when ``codec == 'raw'`` and as its
            decoded data when ``codec == 'binary'``; everything else goes
            through ``dumpxml``.
        pagenos: Container of zero-based page indexes whose attributes
            are dumped.
        password: Password handed to ``doc.initialize``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Output codec; ``'raw'`` and ``'binary'`` also suppress the
            trailing newline.

    When ``objids``, ``pagenos`` and ``dumpall`` are all empty/false the
    document trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``with`` closes the input file even when parsing or dumping raises;
    # previously the handle leaked on every error path.
    with open(fname, 'rb') as fp:
        # This PDFParser variant takes the document in its constructor.
        # The parser object is not used afterwards; the name is kept only
        # to hold a reference while the document is read.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                is_stream = isinstance(obj, PDFStream)
                if is_stream and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif is_stream and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例3: MapFactory
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import getobj [as 别名]
def MapFactory(map_path):
    """Build the map object matching the image embedded in a PDF.

    Opens ``map_path``, parses it as a PDF and inspects object 6.  The
    object's ``Width`` and ``Height`` are compared with the
    ``WIDTH``/``HEIGHT`` constants of the known map classes, and the first
    class that matches is instantiated with the decoded image.

    Args:
        map_path: Path of the PDF file to load.

    Returns:
        An instance of the matching ``Map*`` class, or ``None`` when the
        file cannot be opened or parsed, object 6 is not a stream carrying
        ``Width``, ``Height`` and ``ColorSpace``, or its dimensions match
        no known class.
    """
    # ``except Exception`` keeps the best-effort "return None" contract
    # without also swallowing KeyboardInterrupt / SystemExit the way the
    # previous bare ``except`` did.
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # The file is now closed on every exit path; it used to be leaked.
    try:
        document = PDFDocument()
        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        # The image object on all IBGE PDFs is indexed
        # at ID 6. We also probe for a few properties.
        obj = document.getobj(6)
        if not obj or not isinstance(obj, PDFStream):
            return None
        for key in ("Width", "Height", "ColorSpace"):
            if key not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        # Checked in the same order as the original if/elif chain.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in candidates:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                # The stream data is decoded here, before ``finally``
                # closes the underlying file.
                image = _MakePPMImage(width, height, obj.get_data())
                return map_class(image, map_path)
        return None
    finally:
        map_file.close()
示例4: extractembedded
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import getobj [as 别名]
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None):
    """Extract the files embedded in a PDF below the current directory.

    Walks every object id listed in the document's xref tables.  For each
    dictionary whose ``Type`` is the literal ``Filespec``, the stream
    referenced by ``EF``/``F`` is written to the file name stored in the
    dictionary (``UF`` preferred, ``F`` as fallback).  Missing directories
    are created; existing files are never overwritten.

    Args:
        outfp: Unused.
        fname: Path of the PDF file to read.
        objids: Unused.
        pagenos: Unused.
        password: Password handed to ``doc.initialize``.
        dumpall: Unused.
        codec: Unused.
        (The unused parameters presumably mirror ``dumppdf``'s signature;
        verify against the caller.)

    Raises:
        Exception: If a file specification does not reference a
            ``PDFStream`` of type ``EmbeddedFile``, or if the target name
            resolves outside the current working directory.
        OSError: If the target file already exists (``O_EXCL``).
    """
    # Prefix every extracted path must start with.  os.path.join appends
    # the platform's separator (the old hard-coded '/' never matched on
    # Windows) and leaves the root directory as "/" rather than "//".
    cwd = os.path.join(os.path.normpath(os.getcwd()), '')
    doc = PDFDocument()
    # ``with`` closes the PDF on every path; it used to be left open.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, dict):
                    continue
                objtype = obj.get('Type', '')
                if not (isinstance(objtype, PSLiteral)
                        and objtype.name == 'Filespec'):
                    continue
                # ``obj['UF']`` raised KeyError when the key was absent,
                # before 'F' was ever tried; ``get`` makes the fallback work.
                filename = obj.get('UF') or obj['F']
                fileref = obj['EF']['F']
                fileobj = doc.getobj(fileref.objid)
                if not isinstance(fileobj, PDFStream):
                    raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
                if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
                    raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))
                # Same output as the statement ``print "extracting", filename``
                # but valid with both the print statement and the function.
                print("extracting %s" % (filename,))
                absfilename = os.path.normpath(os.path.abspath(filename))
                if not absfilename.startswith(cwd):
                    raise Exception("filename %s is trying to escape to parent directories.." % (filename))
                dirname = os.path.dirname(absfilename)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)
                # Decode first so a decoding error leaves no empty file behind.
                data = fileobj.get_data()
                # don't overwrite anything
                fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
                with os.fdopen(fd, 'wb') as f:
                    f.write(data)
示例5: __init__
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import getobj [as 别名]
class PDFMine:
def __init__(self, filename):
    """Open ``filename`` and prepare its PDF document for extraction.

    Sets ``result``, ``filename``, ``fp``, ``parser``, ``doc`` and
    ``pagecount`` on the instance and prints the page count.

    Args:
        filename: Path of the PDF file to open.

    Raises:
        RuntimeError: If ``self.doc.is_extractable`` is false.  The
            original ``raise()`` raised an empty tuple, which only
            surfaced as an unrelated TypeError.
    """
    self.result = {}
    self.filename = filename
    self.fp = open(filename, "rb")
    try:
        self.parser = PDFParser(self.fp)
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize()
        self.pagecount = self.pgcount()
        print("Page count %i" % self.pagecount)
        if not self.doc.is_extractable:
            print("Oops, error extracting %s" % self.filename)
            raise RuntimeError(
                "document is not extractable: %s" % self.filename)
        print("Starting extraction of %s" % self.filename)
    except Exception:
        # The caller never gets an instance it could close() when
        # construction fails, so release the file handle here.
        self.fp.close()
        raise
def close(self):
self.fp.close()
def pgcount(self):
count=0;
for page in self.doc.get_pages():
count=count+1
return count
def save_video(self, targetdir):
"""Saves all your videos to targetdir """
for page in self.doc.get_pages():
if (page.annots):
obj=self.doc.getobj(page.annots.objid)
for i in obj:
annotobj=i.resolve()
try:
if (annotobj["Subtype"].name=='RichMedia'):
linktype="media"
data=annotobj["RichMediaContent"].resolve()
dataobj=data["Assets"].resolve()
fstream=dataobj["Names"][1].resolve()
filename=fstream["F"]
fdata=fstream['EF']['F'].resolve().get_data()
f=open(os.path.join(targetdir,filename),"w")
f.write(fdata)
f.close()
except:
pass
def _rect(self, bbox):
""" Changes a bounding box into something we can use
with HTML (x,y,width,height measured from top left) """
pgbox=self.pgbox
pgwidth=round(abs(pgbox[0]-pgbox[2]))
pgheight=round(abs(pgbox[1]-pgbox[3]))
x=round(min(bbox[0], bbox[2]))
y=pgheight-(round(max(bbox[1],bbox[3])))
width=round(max(bbox[0], bbox[2])-min(bbox[0], bbox[2]))
height=round(max(bbox[1], bbox[3])-min(bbox[1], bbox[3]))
result={"x":x, "y":y, "width":width, "height":height}
return result
def _find_objid_pgnum(self, obj):
"""Given a page, return the page number """
i=0
for page in self.doc.get_pages():
i=i+1
if self.doc.getobj(page.pageid)==obj:
return i
return False
def parse_pages(self):
result=[]
i=0
for page in self.doc.get_pages():
self.pgbox=page.mediabox
i=i+1
print "==== Page %d ====" % i
result.append(self._parse_page(page))
return result
def _parse_page(self, page):
result=[]
vids=self._parse_video(page)
if len(vids)>0:
result.extend(self._parse_video(page))
links=self._parse_links(page)
if len(links)>0:
result.extend(links)
comments=self._parse_comments(page)
if len(comments)>0:
result.extend(comments)
return result
def _parse_comments(self, page):
result=[]
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
interpreter.process_page(page)
#.........这里部分代码省略.........