当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.getobj方法代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument.getobj方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.getobj方法的具体用法?Python PDFDocument.getobj怎么用?Python PDFDocument.getobj使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfdocument.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.getobj方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: dumppdf

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF file and return the dump as one string.

    Args:
        fname: Path of the PDF file; opened in binary mode.
        objids: Iterable of object ids to fetch with ``doc.getobj`` and dump
            (may be empty or None).
        pagenos: Container of page indexes to dump. Indexes are compared
            against the zero-based position produced by ``enumerate`` over
            ``PDFPage.create_pages(doc)``.
        password: Password handed to ``PDFDocument``.
        dumpall: When true, append the output of ``dumpallobjs``.
        codec: Passed through to the dump helpers. When truthy, page dumps
            emit each content stream instead of the page attributes.
        extractdir: Accepted for signature compatibility; not used here.

    Returns:
        The concatenated output of the dump helpers. A trailing newline is
        appended unless ``codec`` is ``'raw'`` or ``'binary'``.
    """
    chunks = []
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                chunks.append(dumpxml(doc.getobj(objid), codec=codec))
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        chunks.append(dumpxml(stream_value(obj), codec=codec))
                else:
                    chunks.append(dumpxml(page.attrs))
        if dumpall:
            chunks.append(dumpallobjs(doc, codec=codec))
        # Nothing specific was requested: fall back to dumping the trailers.
        if (not objids) and (not pagenos) and (not dumpall):
            chunks.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        chunks.append('\n')
    return ''.join(chunks)
开发者ID:toejamhoney,项目名称:peepdf-js_analyse,代码行数:32,代码来源:dumppdf.py

示例2: extractembedded

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file found in a PDF into ``extractdir``.

    Every object id listed by the document's xrefs is fetched; each dict
    object whose ``Type`` is ``LITERAL_FILESPEC`` is extracted.

    Args:
        fname: Path of the PDF file; opened in binary mode.
        password: Password handed to ``PDFDocument``.
        extractdir: Directory the embedded files are written to.
        outfp, objids, pagenos, dumpall, codec: Accepted for signature
            compatibility; not used by this function.

    Raises:
        PDFValueError: The file specification's ``EF``/``F`` reference is not
            a ``PDFStream`` or its ``Type`` is not ``LITERAL_EMBEDDEDFILE``.
        IOError: A file with the target name already exists.
        KeyError: A file specification has neither a usable ``UF`` nor an
            ``F`` entry.
    """
    def extract1(obj):
        # Write the stream referenced by one file-specification dict to disk.
        # .get() lets a specification without 'UF' fall back to 'F' instead
        # of failing with KeyError before the fallback is even tried.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Refuse to overwrite anything already present in the target directory.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The with-block closes the input PDF on every exit path, including
    # exceptions raised from extract1.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
开发者ID:toejamhoney,项目名称:peepdf-js_analyse,代码行数:34,代码来源:dumppdf.py

示例3: dumppdf

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF file to ``outfp``.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file; opened in binary mode.
        objids: Iterable of object ids to fetch with ``doc.getobj`` and dump
            (may be empty or None).
        pagenos: Container of page indexes to dump. Indexes are compared
            against the zero-based position produced by ``enumerate`` over
            ``PDFPage.create_pages(doc)``.
        password: Password passed to ``doc.initialize``.
        dumpall: When true, dump every object via ``dumpallobjs``.
        codec: Passed through to the dump helpers. When truthy, page dumps
            emit each content stream instead of the page attributes.
        extractdir: Accepted for signature compatibility; not used here.

    A trailing newline is written to ``outfp`` unless ``codec`` is ``'raw'``
    or ``'binary'``.
    """
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        # Nothing specific was requested: fall back to dumping the trailers.
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
开发者ID:coolioxlr,项目名称:ziply,代码行数:29,代码来源:dumppdf.py

示例4: parse

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
 def parse(self):
     """Parse ``self.pdf`` and store an XML-like dump of it on the instance.

     Every object id listed by the document's xrefs is visited once and
     rendered with ``self.dump``. Results are stored on the instance:

     * ``self.xml``        -- the assembled ``<pdf>...</pdf>`` text
     * ``self.bin_blob``   -- data read from the end of the file, set only
       when ``doc.found_eof`` is true and ``doc.eof_distance`` exceeds 3
     * ``self.errors``     -- copied from ``doc.errors``
     * ``self.bytes_read`` -- copied from ``parser.BYTES``

     Objects raising ``PDFObjectNotFound`` are recorded as
     ``type="malformed"`` and noted via ``self.takenote``; any other
     exception is recorded as ``type="exception"``.
     """
     pieces = ['<pdf>']
     # open() replaces the Python-2-only file() builtin, and the with-block
     # closes the handle even if parsing raises.
     with open(self.pdf, 'rb') as fp:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         visited = set()  # keep track of the objects already visited
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 visited.add(objid)
                 try:
                     # Render fully before emitting anything, so a failing
                     # dump cannot leave an unclosed <object> tag behind.
                     dumped = self.dump(doc.getobj(objid))
                 except PDFObjectNotFound:
                     # Keep an excerpt read from the object's xref position
                     # (4096 is presumably a byte count -- confirm against
                     # read_n_from). '<' is replaced, presumably so the raw
                     # data cannot open a tag in the output.
                     mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                     mal_obj = mal_obj.replace('<', '0x3C')
                     pieces.append(
                         '<object id="%d" type="malformed">\n%s\n</object>\n\n'
                         % (objid, mal_obj))
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # Exception.message only exists on Python 2; fall back
                     # to str(e) where the attribute is absent.
                     message = getattr(e, 'message', None)
                     if message is None:
                         message = str(e)
                     pieces.append(
                         '<object id="%d" type="exception">\n%s\n</object>\n\n'
                         % (objid, message))
                 else:
                     pieces.append('<object id="' + str(objid) + '">\n')
                     pieces.append(dumped)
                     pieces.append('\n</object>\n\n')
     pieces.append(self.dumptrailers(doc))
     pieces.append('</pdf>')
     self.xml = ''.join(pieces)
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
     return
开发者ID:toejamhoney,项目名称:thisneedsacoolname,代码行数:37,代码来源:xml_creator.py

示例5: print_all_obj

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
def print_all_obj(filename):
    """Print the id and type of every object in the PDF at ``filename``.

    Each object id listed by the document's xrefs is printed once, as
    ``"<objid> <type>"`` on its own line. The type comes from
    ``get_obj_type`` applied to ``doc.getobj(objid)``.
    """
    # open() replaces the Python-2-only file() builtin.
    with open(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # The same id can be listed by more than one xref; report it once.
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                # A single pre-formatted argument prints identically under
                # the Python 2 print statement and the Python 3 function.
                print('%s %s' % (objid, get_obj_type(doc.getobj(objid))))
开发者ID:kexplo,项目名称:extract_images_from_pdf,代码行数:13,代码来源:extimgpdf.py

示例6: extractComments

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import getobj [as 别名]
def extractComments(fp):
    """Collect comment records from the PDF readable through ``fp``.

    Every object id listed by the document's xrefs is visited once. Dict
    objects whose ``Type`` name is ``'Page'`` are remembered as pages. Other
    dict objects that have a ``'C'`` key are turned into a record.

    Args:
        fp: Open binary file object for the PDF; its ``name`` attribute is
            stored in each record.

    Returns:
        A list of tuples ``(fp.name, objid, page_index, subtype_name,
        Subj, T, Contents)``. ``page_index`` is the 1-based position of the
        object's ``'P'`` reference among the pages seen so far, or ``-1``
        when it cannot be resolved. Objects missing any of the listed entries
        are skipped.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")

    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        # Return a record tuple for a comment object, or None otherwise.
        if not isinstance(obj, dict):
            return None
        # getattr: a 'Type' value without a .name attribute is treated as
        # "not a page" instead of aborting the whole scan.
        if 'Type' in obj and getattr(obj['Type'], 'name', None) == 'Page':
            pages.append(objid)
            return None
        if 'C' not in obj:
            return None
        try:
            page_index = pages.index(obj['P'].objid) + 1
        except (KeyError, AttributeError, ValueError):
            # No 'P' entry, 'P' has no objid, or the page was not seen yet.
            page_index = -1
        try:
            return (fp.name, objid, page_index, obj['Subtype'].name,
                    obj['Subj'], obj['T'], obj['Contents'])
        except (KeyError, AttributeError):
            # if any of the listed entries do not exist, ignore
            return None

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
            except PDFObjectNotFound as e:
                sys.stderr.write('not found: %r\n' % e)
                continue
            if obj is None:
                continue
            record = extract(objid, obj)
            if record:
                resultList.append(record)
    # Hand the collected records back; without this the work above is lost.
    return resultList
开发者ID:ckolumbus,项目名称:CkPyPdftools,代码行数:43,代码来源:ckpdftools.py


注:本文中的pdfminer.pdfdocument.PDFDocument.getobj方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。