本文整理汇总了Python中pdfminer.layout.LTFigure类的典型用法代码示例。如果您正苦于以下问题:Python layout.LTFigure的具体用法?Python layout.LTFigure怎么用?Python layout.LTFigure使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pdfminer.layout的用法示例。
在下文中一共展示了layout.LTFigure方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_layout
# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTFigure [as 别名]
def parse_layout(self, layout):
    """Walk a layout tree and sort its leaf objects into per-kind lists.

    Container objects (figures, text boxes, text lines) are expanded
    depth-first, preserving the order in which their children are iterated.
    Leaf objects are handled as follows:

    * ``LTTextLineHorizontal`` -> appended to ``self.texts``.
    * ``LTRect`` with ``width < 1.0`` -> passed to ``self._adjust_to_close``
      with ``self.verticals`` and ``'x0'``, then appended to
      ``self.verticals``.
    * ``LTRect`` with ``height < 1.0`` (and width >= 1.0) -> same treatment
      with ``self.horizontals`` and ``'y0'``.
    * Any other ``LTRect`` is dropped.
    * ``LTImage`` -> appended to ``self.images``.
    * ``LTCurve``, ``LTChar`` and ``LTLine`` are ignored.

    Args:
        layout: An iterable of layout objects (e.g. a page layout).

    Raises:
        AssertionError: If an object of any other type is encountered.
    """
    # Explicit stack instead of recursion; the input is reversed so that
    # pop() hands objects back in their original iteration order.
    obj_stack = list(reversed(list(layout)))
    while obj_stack:
        obj = obj_stack.pop()
        # Dispatch is on the *exact* type, not isinstance(): a subclass of
        # one of these types is deliberately not treated as its parent
        # (in pdfminer, LTRect and LTLine both derive from LTCurve, so an
        # isinstance() chain would route them differently).
        obj_type = type(obj)
        if obj_type in [LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal]:
            # Container: queue its children, again keeping their order.
            obj_stack.extend(reversed(list(obj)))
        elif obj_type == LTTextLineHorizontal:
            self.texts.append(obj)
        elif obj_type == LTRect:
            # Very thin rectangles are collected as vertical/horizontal rules.
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif obj_type == LTImage:
            self.images.append(obj)
        elif obj_type in [LTCurve, LTChar, LTLine]:
            # Known leaf types that carry nothing we collect.
            pass
        else:
            # Raised explicitly rather than via ``assert False`` so the check
            # still fires when Python runs with -O (which strips asserts).
            raise AssertionError("Unrecognized type: %s" % type(obj))
示例2: elem_type
# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTFigure [as 别名]
def elem_type(elem):
    """Return a short string label describing what kind of element ``elem`` is.

    The element is tested with ``isinstance`` against ``LTLine``, ``LTCurve``,
    ``LTTextLine`` and ``LTFigure``, in that order, and the label of the first
    match is returned: ``"line"``, ``"curve"``, ``"text"`` or ``"figure"``.
    If nothing matches, the fallback label ``"unkown"`` is returned.
    """
    # Order matters: the first matching entry wins, so LTLine must be tried
    # before LTCurve for line objects to be labelled "line".
    labelled_types = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for candidate, label in labelled_types:
        if isinstance(elem, candidate):
            return label
    # NOTE: the spelling "unkown" is the value callers currently receive;
    # it is kept as-is so existing comparisons keep working.
    return "unkown"
示例3: extract_first_jpeg_in_pdf
# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTFigure [as 别名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data looks like a JPEG.

    Pages are processed in order. On each page, every ``LTImage`` that is a
    direct child of a top-level ``LTFigure`` is inspected; the first one whose
    data starts with the bytes ``FF D8 FF E0`` is returned.

    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images. More testing might be needed. In principle
    other image types (e.g. PNG) could be extracted the same way, but that is
    not implemented here.

    NOTE(review): the signature check only accepts data beginning with
    ``FF D8 FF E0``; JPEG data that begins with a different marker after
    ``FF D8`` is skipped. Confirm whether that restriction is intended.

    :param fstream: Readable binary stream of the PDF.
    :return: The image data (``bytes``) of the first matching image, or
        ``None`` if no matching image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # The aggregator device exposes the layout of the page just processed.
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding the stream failed (reported by the original
                    # author to happen often); fall back to the undecoded
                    # bytes. Only Exception is caught so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
示例4: _process_layout
# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTFigure [as 别名]
def _process_layout(self, layout):
"""Process an LTPage layout and return a list of elements."""
# Here we just group text into paragraphs
elements = []
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
elements.append(Paragraph(lt_obj.get_text().strip()))
elif isinstance(lt_obj, LTFigure):
# Recursive...
elements.extend(self._process_layout(lt_obj))
return elements
示例5: load
# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTFigure [as 别名]
def load(
    pdf_file: IO,
    pdf_file_path: Optional[str] = None,
    la_params: Optional[Dict] = None,
    **kwargs,
) -> PDFDocument:
    """
    Build a PDFDocument from the text elements PDFMiner extracts from a PDF.

    Each page yielded by ``extract_pages`` contributes its top-level
    ``LTTextContainer`` elements. When the effective layout parameters have a
    truthy ``all_texts``, text containers found directly inside the page's
    top-level figures are appended after them. Pages with no elements are
    skipped with a warning.

    Args:
        pdf_file (io): The PDF file.
        pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
            for `PDFDocument`.
        la_params (dict): Layout parameters passed to PDF Miner for analysis; they
            are merged over `DEFAULT_LA_PARAMS`. See the PDFMiner documentation here:
            https://pdfminersix.readthedocs.io/en/latest/api/composable.html#laparams.
            Note that py_pdf_parser will re-order the elements it receives from
            PDFMiner so options relating to element ordering will have no effect.
        kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.

    Returns:
        PDFDocument: A PDFDocument with the file loaded.
    """
    # Caller-supplied values win over the defaults.
    overrides = {} if la_params is None else la_params
    la_params = {**DEFAULT_LA_PARAMS, **overrides}
    include_figure_text = la_params.get("all_texts")

    pages: Dict[int, Page] = {}
    for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
        elements = [item for item in page if isinstance(item, LTTextContainer)]

        if include_figure_text:
            # With all_texts enabled, figures may hold text containers too;
            # pick up the ones sitting directly inside each top-level figure.
            elements.extend(
                item
                for figure in page
                if isinstance(figure, LTFigure)
                for item in figure
                if isinstance(item, LTTextContainer)
            )

        if elements:
            pages[page.pageid] = Page(
                width=page.width, height=page.height, elements=elements
            )
            continue

        logger.warning(
            f"No elements detected on page {page.pageid}, skipping this page."
        )

    return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)