本文整理汇总了Python中pdfminer.pdfparser.PDFDocument._parse_everything方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument._parse_everything方法的具体用法？Python PDFDocument._parse_everything怎么用？Python PDFDocument._parse_everything使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument._parse_everything方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: PDFExploreCmd
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import _parse_everything [as 别名]
#.........这里部分代码省略.........
@intarg(1)
def do_rtok(self, arg):
    "Read the next X tokens, X being the supplied argument."
    # `arg` is used directly as a range() bound, so it must already be an
    # int here (presumably converted by @intarg -- confirm against its
    # definition, which is outside this block).
    max_len = 20  # longer tokens are shortened for display
    tokens = []
    hit_eof = False
    try:
        for _ in range(arg):
            # nexttoken() is unpacked as (position, token); only the token
            # is displayed.
            _pos, token = self.parser.nexttoken()
            token = str(token)
            if len(token) > max_len:
                # Keep the head of the token and note how many characters
                # were dropped.
                token = token[:max_len] + "[...(%d)]" % (len(token) - max_len)
            tokens.append(token)
    except PSEOF:
        # Running out of input is expected: print what was read, then
        # report the end of file below.
        hit_eof = True
    print(' '.join(tokens))
    # Driven by the actual EOF event rather than a length comparison, so a
    # negative count does not produce a spurious message.
    if hit_eof:
        print("End of file reached")
@intarg(1)
def do_ptok(self, arg):
    "Peek the next X tokens, X being the supplied argument. Your current position will not change."
    # Remember where the lexer currently is so the read below can be undone.
    saved_pos = self.parser.lex.lexpos
    try:
        self.do_rtok(arg)
    finally:
        # Rewind even when reading fails part-way, so that a "peek" never
        # leaves the position changed.
        self.do_setpos(saved_pos)
def do_robj(self, arg):
    "Read the next object and sets it as the 'current' object."
    # `arg` is ignored; it is only part of the command signature.
    # readobj() must yield exactly three items; unpacking enforces that and
    # lets the result be stored as a plain tuple.
    objid, genno, obj = self.doc.readobj()
    self.current_obj = (objid, genno, obj)
    # Hand over to the `st` command with an empty argument (presumably it
    # displays the newly selected object -- defined outside this block).
    self.do_st('')
@intarg()
def do_sobj(self, arg):
    "Select object with ID X. The object has to have been read already."
    # `arg` is the object id to select; it is used as a dictionary key and
    # formatted nowhere, so its type is whatever @intarg passes in
    # (presumably an int -- confirm against the decorator).
    obj = None
    if arg in self.doc._cached_objs:
        obj = self.doc._cached_objs[arg]
    elif arg in self.doc._parsed_objs:
        obj = self.doc._parsed_objs[arg]
    else:
        print("Object hasn't been read yet.")
        # find_obj_ref() is unpacked as (stream id, index); an index of
        # None means the id is not referenced at all.
        strmid, index = self.doc.find_obj_ref(arg)
        if index is not None:
            print("However, our object id is in a xref")
            # The stream id is only reported when it is truthy.
            if strmid:
                print("Stream ID: %d" % strmid)
            print("Position: %d" % index)
    # A stored value of None is treated the same as "not found": nothing
    # is selected.
    if obj is not None:
        # The middle slot (generation number in the tuple built by
        # do_robj) is not known here, so 0 is stored.
        self.current_obj = (arg, 0, obj)
        self.do_st('')
def do_dbgobj(self, arg):
    "Enter in debug mode with current obj as 'obj' in the local scope."
    # `arg` is ignored; it is only part of the command signature.
    if not self.current_obj:
        print("No current obj.")
        return
    # These names look unused but are bound on purpose: they are the
    # locals available at the debugger prompt.
    objid, genno, obj = self.current_obj
    # Imported here because the debugger is only needed by this command.
    import pdb
    pdb.set_trace()
def do_readall(self, arg):
    "Read all objects in the document."
    # `arg` is ignored; it is only part of the command signature.
    self.doc._parse_everything()
    # The count is taken from the document's _cached_objs mapping after
    # parsing.
    print("Read %d objects:" % len(self.doc._cached_objs))
    # Follow up with the id listing produced by the `whatisread` command.
    self.do_whatisread('')
def do_dumpdata(self, arg):
    "For each read stream, print out the decoded data it contains."
    # `arg` is ignored; it is only part of the command signature.
    # _cached_objects() is iterated as (object id, object) pairs.
    for objid, obj in self._cached_objects():
        print("Dumping obj id: %d" % objid)
        print(repr(obj))
        # Every object gets its repr printed; only those exposing
        # get_data() (presumably streams) get a second line with the data.
        if hasattr(obj, 'get_data'):
            print(repr(obj.get_data()))
def do_whatisread(self, arg):
    "Prints a list of all read object ids."
    # `arg` is ignored; it is only part of the command signature.
    # _cached_objects() is iterated as (object id, object) pairs; only the
    # ids are printed, as the repr of a list.
    read_ids = [objid for objid, _ in self._cached_objects()]
    print(repr(read_ids))
def do_refs(self, arg):
    "Look in all read objects and find all objects that reference to our current object."
    # `arg` is ignored; it is only part of the command signature.
    if not self.current_obj:
        print("No current obj.")
        return
    # Only the id (first slot of the current-object tuple) is needed.
    target_id, _, _ = self.current_obj
    # _get_refs() is iterated as (parent id, reference) pairs; a parent is
    # listed once per reference it holds to the target.
    referrers = [
        parent_id
        for parent_id, ref in self._get_refs()
        if ref.objid == target_id
    ]
    print(repr(referrers))
def do_deadrefs(self, arg):
    "Print (dead_id, host_id) for all dead references in the document."
    # `arg` is ignored; it is only part of the command signature.
    # Collect the known object ids into a set so each reference can be
    # checked with a constant-time membership test.
    known_ids = {objid for objid, _ in self._cached_objects()}
    # A reference is "dead" when its target id is not among the known ids.
    dead = [
        (ref.objid, parent_id)
        for parent_id, ref in self._get_refs()
        if ref.objid not in known_ids
    ]
    print(repr(dead))
def do_quit(self, arg):
    "Quit PDFExplore"
    # `arg` is ignored; it is only part of the command signature.
    # Release the open file held in self.fp before terminating.
    self.fp.close()
    # Exit status 0: quitting is a normal, successful termination.
    sys.exit(0)

# Short alias so `q` behaves exactly like `quit`.
do_q = do_quit