本文整理汇总了Python中pdfminer.pdfparser.PDFDocument类的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument类的具体用法?Python PDFDocument怎么用?Python PDFDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了PDFDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: harvest_file
def harvest_file(self, path):
    """Extract the title and the text content of the PDF stored at ``path``.

    Returns a dict with the keys ``'title'``, ``'content'`` and ``'kind'``
    (always ``'PDF'``). The title is the empty string when the document
    carries no usable ``Title`` entry.
    """
    with open(path, 'rb') as fp:
        # FIXME: how do we know which encoding to use? Should we
        # use 'chardet' to detect it?
        encoding = 'utf-8'
        parser = PDFParser(fp)
        if HAS_PDFMINER_3K:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            doc = PDFDocument(parser)

        # ``doc.info`` is indexed as a sequence; guard against it being
        # empty so a PDF without document info does not raise IndexError.
        info = doc.info[0] if doc.info else {}
        title = info.get('Title', '')
        if isinstance(title, PDFObjRef):
            title = title.resolve()
        if isinstance(title, bytes):
            # This may not be necessary with pdfminer3k.
            try:
                title = title.decode(encoding)
            except UnicodeDecodeError:
                logger.warning('Could not correctly decode title of "%s".', path)
                title = title.decode(encoding, 'ignore')

        # The parser has consumed the stream; rewind before extracting text.
        fp.seek(0)
        content = extract_content(fp, encoding).strip()
        try:
            content = content.decode(encoding)
        except UnicodeDecodeError:
            logger.warning('Could not correctly decode content of "%s".', path)
            content = content.decode(encoding, 'ignore')

    return {
        'title': title,
        'content': content,
        'kind': 'PDF',
    }
示例2: create_pages
def create_pages(self):
    """Parse ``self.pdf_file`` and save one ``Page`` per parsed page.

    Pages are numbered from 1 in the order ``self._parse_pages`` yields
    them. Nothing is created when the document does not allow text
    extraction. The method returns ``None``.
    """
    from public_project.models import Page

    # Wire the parser and the document object to each other.
    parser = PDFParser(self.pdf_file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize with an empty password.
    doc.initialize('')

    # Guard clause: the parsed pages only exist for extractable documents.
    if not doc.is_extractable:
        return

    for number, doc_page in enumerate(self._parse_pages(doc), 1):
        page = Page(
            document=self.document,
            number=number,
            content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
        )
        page.save()
示例3: extractContent
def extractContent(file):
    """Return the text of every page of the PDF at path ``file``.

    Progress is printed to stdout: a start marker, one line per page index
    and a final "EOF". The parameter name shadows the ``file`` builtin but
    is kept so keyword callers keep working.

    NOTE: ``doc.is_extractable`` is deliberately not checked here.
    """
    print("extractContent")
    # ``with`` guarantees the PDF is closed even if parsing raises.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            # Release the converter before the file itself is closed.
            device.close()
    return outfp.getvalue()
示例4: get_metadata
def get_metadata(self):
    """Collect metadata from the info field (older PDFs) and XMP (newer PDFs).

    Every entry of ``doc.info`` is added. If the catalog has a
    ``Metadata`` entry, its XMP data is parsed and only the ``xap``,
    ``pdf`` and ``dc`` namespaces are added.

    The result is stored on ``self.metadata`` and returned as a
    ``.modules.metadata.Metadata`` object.
    """
    # ``with`` closes the file even when parsing or XMP decoding raises.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        metadata = Metadata()
        for info in doc.info:
            metadata.add(info)

        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Only the most useful namespaces are kept.
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")

    self.metadata = metadata
    return metadata
示例5: WithPdf
def WithPdf(self, pdfdoc, password, fn, *args):
    """Open the PDF at ``pdfdoc``, apply ``fn`` to it and return the result.

    ``fn`` is called as ``fn(doc, *args)`` only when the document allows
    text extraction. A truthy ``password`` replaces ``self.password``;
    otherwise the stored ``self.password`` is used.

    Returns ``None`` when the document is not extractable or when an
    ``IOError`` occurs (for example, the file does not exist).
    """
    result = None
    try:
        # ``with`` closes the file even if initialization or ``fn`` raises.
        with open(pdfdoc, 'rb') as fp:
            # Wire the parser and the document object to each other.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the password for initialization.
            if password:
                self.password = password
            doc.initialize(self.password)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best effort: a missing or unreadable file yields None.
        pass
    return result
示例6: Pdf
class Pdf(object):
    """Wrapper around a pdfminer document: page count and text extraction."""

    def __init__(self, pdf_file):
        """Parse ``pdf_file``, an open binary file-like object."""
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # The original only referenced ``initialize`` without calling it
        # (and did so before the parser was attached), which is a no-op.
        # Call it once parser and document are connected.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages in the document."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Return the text of all pages, decoded as UTF-8 (errors ignored)."""
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
示例7: getPDFMetadata
def getPDFMetadata(path):
    """Return the metadata of the PDF at ``path`` as a dict.

    The result starts as a copy of the first entry of ``doc.info`` (an
    empty dict when there is none). If the catalog has a ``Metadata``
    entry, the XMP data parsed by ``xmp_to_dict`` is merged in.

    The original kept ``doc.info`` (a list) in ``result`` and called
    ``update`` on it, so both merges always failed inside a bare
    ``except``. Merging the raw XMP payload is dropped because
    ``dict.update`` cannot accept a raw string.
    """
    # ``with`` closes the file; the original never closed it.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # Copy so the document's own info dict is not mutated by the merge.
        result = dict(doc.info[0]) if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best effort, as in the original: unparsable XMP must not
                # prevent returning the info-dictionary metadata.
                pass
    return result
示例8: getData
def getData(fileName):
    """Print the Author, Company, Producer and Creator of a PDF's Info data.

    Returns:
        "error" if the parser and document cannot be connected,
        "Empty metadata" if no trailer provides an ``Info`` entry,
        the caught exception if reading the trailers fails,
        ``None`` after the available fields have been printed.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Close the file on every path, including the "error" return.
        fp.close()

    try:
        # Start from None so a document without any Info entry is reported
        # as "Empty metadata" instead of hitting an unbound local.
        metadata = None
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                metadata = resolve1(info_ref)
        if metadata is None:
            return "Empty metadata"
        for field in ('Author', 'Company', 'Producer', 'Creator'):
            if field in metadata:
                print(field + " " + metadata[field])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
示例9: getData
def getData(self):
    """Load the Info data of ``self.fname`` into ``self.metadata`` and ``self.raw``.

    Returns:
        "error" if the document cannot be parsed or initialized with
        ``self.password``,
        "Empty metadata" if ``self.raw`` is ``None`` after the scan,
        "ok" otherwise,
        the caught exception if reading the trailers fails.
    """
    doc = PDFDocument()
    fp = open(self.fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(self.password)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Close the file on every path, including the "error" return.
        fp.close()

    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
                self.metadata = info
                self.raw = info
        # NOTE(review): assumes ``self.raw`` is given a default elsewhere
        # (e.g. in ``__init__``) -- confirm. If not, the AttributeError is
        # reported through the handler below.
        if self.raw is None:
            return "Empty metadata"
        return "ok"
    except Exception as e:
        # The original printed after ``return``, so the message was never
        # shown; print first, then return.
        print("\t [x] Error in PDF extractor, Trailer Info")
        return e
示例10: PdfSerializer
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally write it to a .txt file."""

    def __init__(self, filename):
        """Open ``filename`` and parse it with an empty password.

        The file object is left open on purpose: ``getString`` reads
        pages from the document after construction.
        """
        self.__filename = filename
        fp = open(self.__filename, 'rb')
        parser = PDFParser(fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def writeToTxt(self):
        """Write the extracted text next to the PDF.

        The target name is the PDF filename with ".pdf" replaced by
        ".txt". Non-ASCII characters are replaced during encoding.
        """
        text = self.getString()
        target = self.__filename.replace(".pdf", ".txt")
        # ``with`` closes the output file even if the write fails.
        with open(target, "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as a single string."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
示例11: initialize_pdf_miner
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the PDF in ``fh``.

    ``fh`` is an open binary file object. The document is initialized
    with an empty password.

    Returns:
        A ``(doc, interpreter, device)`` tuple, where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0``.

    Raises:
        ValueError: if the document does not allow text extraction.
    """
    # Wire the parser and the document object to each other.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization (empty string: no password).
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    # Shared resources for the interpreter and the device.
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # The original first built a plain PDFDevice and interpreter and then
    # overwrote both; only the aggregator pair was ever returned.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
示例12: pdf_function
def pdf_function(pdf_doc, password='', *args, **kwargs):
    """Open the PDF at ``pdf_doc`` and apply ``function`` to the document.

    ``function`` is called as ``function(doc, *args, **kwargs)`` only
    when the document allows text extraction.

    Returns ``None`` when the document is not extractable or when an
    ``IOError`` occurs (for example, the file does not exist).

    NOTE(review): ``function`` is not defined in this block; presumably
    it comes from an enclosing scope such as a decorator -- confirm.
    """
    result = None
    try:
        # ``with`` closes the file even if initialization or the call raises.
        with open(pdf_doc, 'rb') as fp:
            # Wire the parser and the document object to each other.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the password for initialization.
            doc.initialize(password)
            if doc.is_extractable:
                result = function(doc, *args, **kwargs)
    except IOError:
        # Deliberate best effort: a missing or unreadable file yields None.
        pass
    return result
示例13: get_pdf_metadata
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the Info metadata of a PDF given as a local path or HTTP(S) URL.

    Args:
        fileOrUrl: path of a local file, or a URL starting with
            ``http://`` or ``https://``.
        textmode: when true, print one ``source:name=value`` line per
            metadata entry; otherwise print the info data in one line.
        prefix: text printed in front of the one-line output.
        basicauth: value appended to ``'Basic '`` in the Authorization
            header of the HTTP request, if given.
    """
    # NOTE(review): ``args`` is not defined in this function; presumably it
    # is a module-level list of command-line arguments -- confirm.
    if len(args) > 1:
        prefix = fileOrUrl + ':'

    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            # Close the HTTP response even if the read fails.
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')

    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Close the source even if parsing raises.
        fp.close()

    if textmode:
        for obj in doc.info:
            for (name, val) in obj.iteritems():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        # Unwrap a single-entry list so only the entry itself is printed.
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
示例14: parse_pdf_pdfminer
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from the PDF in ``f`` and parse each page.

    For every page the text is passed as UTF-8 bytes to
    ``self.parse_page`` with ``fpath`` and the 1-based page number.
    ``self.handler`` gets ``print_header`` before the first page and
    ``print_footer`` after the last one. Any exception other than
    ``KeyboardInterrupt`` or ``SystemExit`` is passed to
    ``self.handler.print_error`` instead of propagating.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        parser = PDFParser(f)
        doc = PDFDocument(caching=True)
        parser.set_document(doc)
        doc.set_parser(parser)

        for page_num, page in enumerate(doc.get_pages(), 1):
            retstr = StringIO()
            try:
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
            finally:
                # Release the per-page buffer even if processing fails.
                retstr.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例15: pdf_isvalid
def pdf_isvalid(filelike):
    ''' Return True if ``filelike`` holds a valid, extractable PDF, else False.

    The stream must start with ``PDF_MAGIC`` and must parse and
    initialize with an empty password. A ``PDFException`` raised while
    parsing is logged as a warning and reported as invalid.

    The stream is rewound to offset 0 on every exit path, including the
    early bad-magic return and propagated exceptions.

    @param filelike: filelike object, seekable
    '''
    logger = logging.getLogger()
    isvalid = False
    filelike.seek(0)
    try:
        if filelike.read(len(PDF_MAGIC)) != PDF_MAGIC:
            return False
        filelike.seek(0)
        try:
            parser = PDFParser(filelike)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            if doc.is_extractable:
                isvalid = True
        except PDFException as excobj:
            logger.warning("pdf has valid header but, still not valid pdf, exception was %r", excobj)
            isvalid = False
    finally:
        # Leave the stream where the caller handed it over.
        filelike.seek(0)
    return isvalid