本文整理汇总了 Python 中 pdfminer.pdfdocument.PDFDocument.getobj 方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.getobj 方法的具体用法？Python PDFDocument.getobj 怎么用？Python PDFDocument.getobj 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pdfminer.pdfdocument.PDFDocument 的用法示例。
在下文中一共展示了 PDFDocument.getobj 方法的 6 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: dumppdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return a textual dump of selected parts of a PDF file.

    Args:
        fname: Path of the PDF file to read.
        objids: Iterable of object IDs to dump; may be empty or None.
        pagenos: Container of zero-based page indices to dump (indices come
            from enumerate() over the document's pages); may be empty or
            None.
        password: Password handed to PDFDocument.
        dumpall: When true, every object is dumped via dumpallobjs().
        codec: Passed through to the dump helpers. When truthy, the content
            streams of the selected pages are dumped; otherwise the page
            attributes are dumped.
        extractdir: Not used by this function; kept so the signature stays
            compatible with its callers.

    Returns:
        The concatenated output of the dump helpers. A trailing newline is
        appended unless codec is 'raw' or 'binary'. When no objids, pagenos
        or dumpall are given, the document trailers are dumped instead.
    """
    # Collect pieces and join once instead of growing a string with `+=`.
    chunks = []
    # The context manager closes the file even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                chunks.append(dumpxml(doc.getobj(objid), codec=codec))
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        chunks.append(dumpxml(stream_value(obj), codec=codec))
                else:
                    chunks.append(dumpxml(page.attrs))
        if dumpall:
            chunks.append(dumpallobjs(doc, codec=codec))
        # Nothing specific was requested: fall back to the trailers.
        if not (objids or pagenos or dumpall):
            chunks.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        chunks.append('\n')
    return ''.join(chunks)
示例2: extractembedded
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file found in a PDF into extractdir.

    Each object whose 'Type' is LITERAL_FILESPEC is treated as a file
    specification; the stream referenced by its ['EF']['F'] entry is written
    to extractdir under the base name of the spec's 'UF' (or 'F') entry.

    Args:
        fname: Path of the PDF file to read.
        password: Password handed to PDFDocument.
        extractdir: Directory that receives the extracted files.
        outfp, objids, pagenos, dumpall, codec: Not used by this function;
            kept so the signature stays compatible with its callers.

    Raises:
        PDFValueError: If the referenced object is not a PDFStream or is not
            typed as an EmbeddedFile.
        IOError: If the target path already exists.
    """
    def extract1(obj):
        # Write the single embedded file described by file-spec dict `obj`.
        # Prefer 'UF' and fall back to 'F'. Using .get() keeps the fallback
        # reachable when 'UF' is absent (indexing raised KeyError first).
        # basename() drops any directory part of the name stored in the PDF.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # Keep the source file open only while the document is being read, and
    # close it even if extraction fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        # The same object ID can be listed by more than one xref table;
        # handle each ID once so a file is not extracted twice (the second
        # attempt would fail with "file exists").
        seen = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in seen:
                    continue
                seen.add(objid)
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
示例3: dumppdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Write a textual dump of selected parts of a PDF file to outfp.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object IDs to dump; may be empty or None.
        pagenos: Container of zero-based page indices to dump (indices come
            from enumerate() over the document's pages); may be empty or
            None.
        password: Password handed to doc.initialize().
        dumpall: When true, every object is dumped via dumpallobjs().
        codec: Passed through to the dump helpers. When truthy, the content
            streams of the selected pages are dumped; otherwise the page
            attributes are dumped.
        extractdir: Not used by this function; kept so the signature stays
            compatible with its callers.

    When no objids, pagenos or dumpall are given, the document trailers are
    dumped instead. A trailing newline is written unless codec is 'raw' or
    'binary'.
    """
    # The context manager closes the file even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        # Nothing specific was requested: fall back to the trailers.
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
示例4: parse
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def parse(self):
    """Parse self.pdf and store a '<pdf>...</pdf>' dump of its objects.

    Every object ID reachable through the document's xref tables is visited
    once and rendered with self.dump(). Objects that raise
    PDFObjectNotFound are recorded as type="malformed" together with raw
    data read from the file; any other failure is recorded as
    type="exception" with the error text.

    Side effects:
        self.bin_blob: set when the document reports data after EOF
            (doc.found_eof and doc.eof_distance > 3).
        self.xml: the assembled dump, including the trailers.
        self.errors: copied from doc.errors.
        self.bytes_read: copied from parser.BYTES.
    """
    # Collect pieces and join once instead of growing a string in the loop.
    parts = ['<pdf>']
    # The context manager closes the file even if parsing raises.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)
        # Extract the blob of data after EOF (if it exists).
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)
        visited = set()  # object IDs already handled
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                try:
                    # Render first and emit afterwards, so a failing dump
                    # cannot leave a dangling opening <object> tag behind.
                    body = self.dump(doc.getobj(objid))
                except PDFObjectNotFound:
                    # NOTE(review): presumably reads 4096 raw bytes starting
                    # at the object's recorded position; confirm against
                    # read_n_from() and get_pos().
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    # Neutralise '<' so raw data cannot open a tag.
                    mal_obj = mal_obj.replace('<', '0x3C')
                    parts.append(
                        '<object id="%d" type="malformed">\n%s\n</object>\n\n'
                        % (objid, mal_obj))
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    # `.message` is deprecated and missing on Python 3; fall
                    # back to str(e) when it is absent or empty.
                    message = getattr(e, 'message', '') or str(e)
                    parts.append(
                        '<object id="%d" type="exception">\n%s\n</object>\n\n'
                        % (objid, message))
                else:
                    parts.append('<object id="' + str(objid) + '">\n')
                    parts.append(body)
                    parts.append('\n</object>\n\n')
    parts.append(self.dumptrailers(doc))
    parts.append('</pdf>')
    self.xml = ''.join(parts)
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
示例5: print_all_obj
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def print_all_obj(filename):
    """Print each object ID of a PDF file followed by its object type.

    One line of the form "<objid> <type>" is printed per object, where the
    type is whatever get_obj_type() returns for the object. An object ID
    listed by several xref tables is printed only once.

    Args:
        filename: Path of the PDF file to read.
    """
    # open() replaces the Python-2-only file() builtin; same behaviour.
    with open(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                # A single pre-formatted argument prints identically under
                # the Python 2 print statement and the Python 3 function.
                print('%s %s' % (objid, get_obj_type(doc.getobj(objid))))
示例6: extractComments
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: getobj 是 PDFDocument 的实例方法, 无法单独导入; 请通过实例调用, 例如 doc.getobj(objid)
def extractComments(fp):
    """Collect comment records from the PDF opened as fp.

    Every object reachable through the document's xref tables is visited
    once. Dictionaries whose 'Type' is named 'Page' are remembered in the
    order they are encountered; every other dictionary that has a 'C' key
    yields one record.

    Args:
        fp: Binary file object of the PDF; its .name is stored in each
            record.

    Returns:
        A list of tuples
        (fp.name, objid, page_index, subtype_name, subject, title, contents)
        taken from the 'Subtype', 'Subj', 'T' and 'Contents' entries.
        page_index is the 1-based position of the object's 'P' reference
        among the pages seen so far, or -1 when it cannot be determined.
        Objects missing any of the listed entries are skipped.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        # Return a record tuple for `obj`, or None when it yields nothing.
        if not isinstance(obj, dict):
            return None
        # NOTE(review): the original comment called 'Type' a PDFObjRef, but
        # `.name` is read from it, so it is presumably a literal; confirm.
        if 'Type' in obj and obj['Type'].name == 'Page':
            pages.append(objid)
            return None
        if 'C' not in obj:
            return None
        # The page index reflects the order pages were encountered while
        # walking the xrefs; a page seen after this object gives -1.
        try:
            pi = pages.index(obj['P'].objid) + 1
        except (KeyError, AttributeError, ValueError):
            pi = -1
        try:
            return (fp.name, objid, pi, obj['Subtype'].name,
                    obj['Subj'], obj['T'], obj['Contents'])
        except (KeyError, AttributeError):
            # If any of the listed entries does not exist, ignore the object.
            return None

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound as e:
                sys.stderr.write('not found: %r\n' % e)
                continue
            if obj is None:
                continue
            r = extract(objid, obj)
            if r:
                resultList.append(r)
    # The collected records were previously built but never handed back.
    return resultList