本文整理匯總了 Python 中 pdfminer.layout.LTTextLine 類的典型用法代碼示例。如果您正苦於以下問題:Python layout.LTTextLine 類的具體用法?Python layout.LTTextLine 怎麼用?Python layout.LTTextLine 使用的例子?那麼,這裡精選的代碼示例或許可以為您提供幫助。您也可以進一步了解該類所在模塊 pdfminer.layout 的用法示例。
在下文中一共展示了 layout.LTTextLine 的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: convert_pdf_to_txt
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def convert_pdf_to_txt(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Each page is run through a ``PDFPageAggregator``; the text of every
    top-level ``LTTextBox`` / ``LTTextLine`` in the resulting layout is
    collected in iteration order.  Other layout objects are skipped and are
    not descended into.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text as one string ('' if no text object was found).
    """
    pieces = []
    # The parser is handed ``fp``, so the file stays open until every page
    # has been processed; the ``with`` block closes it afterwards, also when
    # an exception is raised part-way through.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        # NOTE(review): this parser/document wiring (set_document,
        # set_parser, initialize, get_pages) looks like the legacy pdfminer
        # API -- confirm it matches the installed pdfminer version.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)
示例2: split_text
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def split_text(cls, line, text1, text2):
    """Split one text object into two objects carrying ``text1`` / ``text2``.

    ``line`` is either an ``LTTextLine`` or something else that is treated as
    a container whose first child (``_objs[0]``) is the line to split.  The
    line is shallow-cloned, then ``cls.strip_text_line`` is applied to the
    original with ``text1`` and to the clone with ``text2``.

    Returns:
        ``(first, second)``.  For a plain line these are the two values
        returned by ``strip_text_line``.  For a container, the original
        container (its ``_objs`` replaced in place by ``[first]``) and a
        shallow clone of it holding ``[second]``.
    """
    is_container = not isinstance(line, layout.LTTextLine)
    container = None
    if is_container:
        container = line
        line = container._objs[0]

    # Clone without running __init__; the attribute dict is copied *before*
    # strip_text_line touches the original line.
    clone = object.__new__(line.__class__)
    clone.__dict__ = dict(line.__dict__)

    first = cls.strip_text_line(line, text1)
    second = cls.strip_text_line(clone, text2)

    if not is_container:
        return (first, second)

    container_clone = object.__new__(container.__class__)
    container_clone.__dict__ = dict(container.__dict__)
    container._objs = [first]
    container_clone._objs = [second]
    return (container, container_clone)
示例3: parse_layout
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def parse_layout(self, layout):
    """Walk ``layout`` depth-first and sort its objects into buckets on self.

    Dispatch is on the *exact* type of each object (not ``isinstance``), so
    a subclass is never handled by its parent's branch:

    * ``LTFigure``, ``LTTextBox``, ``LTTextLine``, ``LTTextBoxHorizontal``:
      children are pushed for traversal, in their original order.
    * ``LTTextLineHorizontal``: appended to ``self.texts``.
    * ``LTRect``: if narrower than 1.0 it is passed to
      ``self._adjust_to_close`` and appended to ``self.verticals``; otherwise
      if shorter than 1.0, likewise for ``self.horizontals``; any other
      rectangle is dropped.
    * ``LTImage``: appended to ``self.images``.
    * ``LTCurve``, ``LTChar``, ``LTLine``: ignored.

    Raises:
        AssertionError: if an object of any other exact type is encountered.
    """
    obj_stack = list(reversed(list(layout)))
    while obj_stack:
        obj = obj_stack.pop()
        kind = type(obj)
        if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
            # Reversed so that pop() yields the children first-to-last.
            obj_stack.extend(reversed(list(obj)))
        elif kind is LTTextLineHorizontal:
            self.texts.append(obj)
        elif kind is LTRect:
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif kind is LTImage:
            self.images.append(obj)
        elif kind in (LTCurve, LTChar, LTLine):
            pass
        else:
            # Raised explicitly (same exception type as before) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized type: %s" % kind)
示例4: elem_type
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def elem_type(elem):
    """Return a short label for the layout class of ``elem``.

    The candidates are tried in the order line, curve, text, figure and the
    first ``isinstance`` match wins.  The order matters if one of these
    classes derives from another (presumably ``LTLine`` derives from
    ``LTCurve`` in pdfminer -- verify).

    Returns:
        "line", "curve", "text" or "figure"; the literal "unkown" (spelling
        kept as-is, callers may compare against it) when nothing matches.
    """
    labelled_types = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for layout_cls, label in labelled_types:
        if isinstance(elem, layout_cls):
            return label
    return "unkown"
示例5: __str__
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def __str__(self, *args, **kwargs):
    """Return the text of every ``LTTextLine`` in ``self.elems``, tab-joined.

    Elements of ``self.elems`` that are not ``LTTextLine`` instances are
    skipped.  The extra positional/keyword arguments are accepted and
    ignored.

    The text is joined as ``str``: encoding each piece to bytes first would
    make ``"\\t".join`` raise ``TypeError`` on Python 3, and ``__str__`` has
    to return ``str`` there anyway.
    """
    return "\t".join(
        r.get_text() for r in self.elems if isinstance(r, LTTextLine)
    )
#############################################
# Static utilities
#############################################
示例6: _split_text_n_lines
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def _split_text_n_lines(elems):
    """Partition ``elems`` into text lines and drawn lines.

    An element that is an ``LTTextLine`` goes to the first list; otherwise,
    if it is an ``LTLine``, it goes to the second; anything else is dropped.
    Relative order within each list follows ``elems``.

    Returns:
        A ``(texts, lines)`` tuple of lists.
    """
    text_elems, line_elems = [], []
    for elem in elems:
        if isinstance(elem, LTTextLine):
            bucket = text_elems
        elif isinstance(elem, LTLine):
            bucket = line_elems
        else:
            continue
        bucket.append(elem)
    return text_elems, line_elems
示例7: _left_bar
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def _left_bar(content, default_val):
    """Yield, for each item of ``content``, the latest non-text value so far.

    ``content`` is iterated as ``(coord, value)`` pairs; the coordinate is
    ignored.  Whenever ``value`` is not an ``LTTextLine`` it becomes the
    current "bar".  One value is yielded per pair: the current bar, or
    ``default_val`` until the first non-text value has been seen.
    """
    current = default_val
    for pair in content:
        _coord, val = pair
        current = current if isinstance(val, LTTextLine) else val
        yield current
示例8: _row_str
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def _row_str(row_content):
    """Render one row as a tab-separated string.

    Each cell is rendered as follows:

    * ``None`` -> ``"None"``
    * a tuple is unpacked as ``(coord, value)`` and ``value`` is rendered
      by the rules below
    * an ``LTTextLine`` -> its text
    * any number -> ``"|"``
    * anything else -> ``str(cell)``

    Every cell is rendered as ``str``: encoding the text-line case to bytes
    would make the final ``"\\t".join`` raise ``TypeError`` on Python 3.
    """
    def strfy(r):
        if r is None:
            return "None"
        if isinstance(r, tuple):
            _c, r = r
        if isinstance(r, LTTextLine):
            return r.get_text()
        if isinstance(r, numbers.Number):
            return "|"
        return str(r)

    return "\t".join(strfy(r) for r in row_content)
示例9: parse_layout
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def parse_layout(elems, font_stat, combine=False):
    """Annotate the page's boxes with grid features and cluster them.

    Every element of ``elems.mentions + elems.figures`` receives an ``id``,
    a fresh ``feats`` mapping, centre coordinates ``xc`` / ``yc`` and
    grid-snapped ``x0_grid`` / ``x1_grid`` / ``xc_grid`` / ``yc_grid``
    (coordinate floor-divided by half the most common font size).  The same
    grid values are stored in ``feats`` under the coordinate name, prefixed
    with "<font_name>-<font_size>-" for text lines that have a font name.

    Returns:
        The ``(tbls, tbl_features)`` pair produced by
        ``cluster_vertically_aligned_boxes``.
    """
    segments = elems.segments
    curves = elems.curves
    figures = elems.figures
    page_width = elems.layout.width
    mentions = elems.mentions

    avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
    width = get_page_width(mentions + segments + figures + curves)
    char_width = get_char_width(mentions)
    grid_size = avg_font_pts / 2.0

    for idx, box in enumerate(mentions + elems.figures):
        box.id = idx
        box.feats = defaultdict(bool)
        if isinstance(box, LTTextLine) and box.font_name:
            prefix = box.font_name + "-" + str(box.font_size) + "-"
        else:
            prefix = ""
        box.xc = (box.x0 + box.x1) / 2.0
        box.yc = (box.y0 + box.y1) / 2.0
        # Snap each coordinate to the grid; record it both as a feature and
        # as a "<coord>_grid" attribute.
        for coord in ("x0", "x1", "xc", "yc"):
            cell = getattr(box, coord) // grid_size
            box.feats[prefix + coord] = cell
            setattr(box, coord + "_grid", cell)

    tbls, tbl_features = cluster_vertically_aligned_boxes(
        mentions,
        elems.layout.bbox,
        avg_font_pts,
        width,
        char_width,
        segments,
        curves,
        figures,
        page_width,
        combine,
    )
    return tbls, tbl_features
示例10: parse_text
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def parse_text(layout):
    """Recursively collect ``(bbox, text)`` for every text line in a layout.

    Objects without ``__iter__`` contribute nothing.  Each ``LTTextLine``
    found contributes its ``bbox`` and its whitespace-stripped text, unless
    that text is empty; every other child is searched recursively.

    Returns:
        A list of ``(bbox, text)`` tuples in traversal order.
    """
    if not hasattr(layout, '__iter__'):
        return []
    found = []
    for node in layout:
        if not isinstance(node, LTTextLine):
            found.extend(parse_text(node))
            continue
        box = node.bbox
        stripped = node.get_text().strip()
        if stripped:
            found.append((box, stripped))
    return found
示例11: _process_layout
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def _process_layout(self, layout):
    """Turn a layout into a flat list of ``Paragraph`` elements.

    Each ``LTTextBox`` / ``LTTextLine`` becomes one ``Paragraph`` built from
    its stripped text.  ``LTFigure`` objects are processed recursively and
    their paragraphs are spliced in at that position.  Everything else is
    ignored.
    """
    paragraphs = []
    for item in layout:
        if isinstance(item, (LTTextBox, LTTextLine)):
            paragraphs.append(Paragraph(item.get_text().strip()))
            continue
        if isinstance(item, LTFigure):
            # Figures can nest further layout objects.
            paragraphs += self._process_layout(item)
    return paragraphs
示例12: try_add_field
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
    """Offer ``obj`` to ``cls._try_add``, descending into text boxes on failure.

    A text line is offered once.  A text box is offered as a whole first;
    only if ``_try_add`` reports failure is each of its children offered
    recursively.  Objects of any other type are ignored.  Nothing is
    returned; the outcome is whatever ``_try_add`` does with ``results``.
    """
    if isinstance(obj, layout.LTTextLine):
        cls._try_add(t, obj, results, nrows, nameoffset)
        return
    if not isinstance(obj, layout.LTTextBox):
        return
    if cls._try_add(t, obj, results, nrows, nameoffset):
        return
    for child in obj:
        cls.try_add_field(t, child, results, nrows, nameoffset)
示例13: count_rows
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def count_rows(cls, t, o, offset=0):
    """Count the text lines under ``o`` that look like a row of table ``t``.

    A text line counts as a row when its left edge (``bbox[0] + offset``) is
    within 0.2 of the NAME column's ``l`` value and its entry text matches
    that column's regex from the start.  A text box contributes the sum over
    its children.  Any other object counts as 0.
    """
    info = t.col_info[TITable.NAME]
    if isinstance(o, layout.LTTextBox):
        return sum(cls.count_rows(t, child, offset) for child in o)
    if not isinstance(o, layout.LTTextLine):
        return 0
    text = cls.get_entry_text(o)
    left_aligned = abs(info.l - (o.bbox[0] + offset)) < 0.2
    if left_aligned and info.regex.match(text):
        return 1
    return 0
示例14: get_alignment_features
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def get_alignment_features(line_bboxes, elems, font_stat):
    """Compute one alignment-feature entry per bounding box in ``line_bboxes``.

    For each box, the region ``(b[4], b[3], b[6], b[5])`` is used to select
    the mentions, segments, figures and curves that ``intersect`` it.  The
    selected mentions together with *all* of ``elems.figures`` are annotated
    with grid features, and ``cluster_vertically_aligned_boxes`` is run with
    ``combine`` set to True.

    Returns:
        A list with one entry per input box: the node features from the
        clustering, or a list of 17 zeros when no mention intersects the
        region or the clustering returns no features.
    """
    def overlapping(candidates, region):
        return [c for c in candidates if intersect(region, c.bbox)]

    features = []
    for line_bbox in line_bboxes:
        region = (line_bbox[4], line_bbox[3], line_bbox[6], line_bbox[5])
        boxes = overlapping(elems.mentions, region)
        segments = overlapping(elems.segments, region)
        figures = overlapping(elems.figures, region)
        curves = overlapping(elems.curves, region)

        page_width = elems.layout.width
        avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        width = get_page_width(boxes + segments + figures + curves)
        if len(boxes) == 0:
            features.append([0] * 17)
            continue

        char_width = get_char_width(boxes)
        grid_size = avg_font_pts / 2.0
        for idx, box in enumerate(boxes + elems.figures):
            box.id = idx
            box.feats = defaultdict(bool)
            if isinstance(box, LTTextLine) and box.font_name:
                prefix = box.font_name + "-" + str(box.font_size) + "-"
            else:
                prefix = ""
            box.xc = (box.x0 + box.x1) / 2.0
            box.yc = (box.y0 + box.y1) / 2.0
            # Snap each coordinate to the grid; record it both as a feature
            # and as a "<coord>_grid" attribute.
            for coord in ("x0", "x1", "xc", "yc"):
                cell = getattr(box, coord) // grid_size
                box.feats[prefix + coord] = cell
                setattr(box, coord + "_grid", cell)

        nodes, nodes_features = cluster_vertically_aligned_boxes(
            boxes,
            elems.layout.bbox,
            avg_font_pts,
            width,
            char_width,
            segments,
            curves,
            figures,
            page_width,
            True,
        )
        if len(nodes_features) == 0:
            features.append([0] * 17)
        else:
            features.append(nodes_features)
    return features
示例15: _try_add
# 需要導入模塊: from pdfminer import layout [as 別名]
# 或者: from pdfminer.layout import LTTextLine [as 別名]
def _try_add(cls, t, obj, results, nrows, nameoffset):
    """Try to file ``obj`` under the table column whose centre is nearest.

    The column of ``t.col_info`` whose ``c`` value is closest to the centre
    of ``obj`` is selected.  If ``obj`` is an ``LTText`` and its entry text
    matches that column's regex, it is appended to ``results[column]``
    unless that list already holds ``nrows`` items.  Otherwise, if ``obj``
    is a text line whose nearest column is the OFFSET column or an ADDRESS
    column, the text is split at ")" into an offset part and an address
    part; when both parts match their columns' regexes the object is split
    with ``cls.split_text`` and the halves are appended to the two columns.

    Works on Python 2 and 3: uses ``dict.items()`` / ``dict.values()`` and
    ``float("inf")`` rather than ``iteritems`` / ``itervalues`` /
    ``sys.maxint``, which do not exist on Python 3.

    Args:
        t: Table description; ``t.col_info`` maps field -> column info.
        obj: Layout object to place.
        results: Mapping of field -> list of placed objects (mutated).
        nrows: Maximum number of objects accepted per column.
        nameoffset: Horizontal offset applied for the NAME column.

    Returns:
        True if something was appended to ``results``, otherwise False.
    """
    # don't consider items that are past the left of the table
    if obj.bbox[0] < ((t.col_info[TITable.NAME].l - nameoffset) - 0.5):
        return False

    text = cls.get_entry_text(obj)
    added = False
    center = cls.calculate_center(obj)

    closest_field = None
    min_diff = float("inf")
    field_info = None
    for (field, info) in t.col_info.items():
        if field == TITable.NAME:
            # NOTE(review): this shifts ``center`` for the NAME column *and*
            # for every column iterated after it -- confirm that is intended.
            center -= nameoffset
        diff = abs(center - info.c)
        if diff < min_diff:
            min_diff = diff
            closest_field = field
            field_info = info

    addrfield = [j for j in t.col_info.values() if j.typ == TITable.ADDRESS]

    if isinstance(obj, layout.LTText):
        if field_info.regex.search(text):
            # Only accept the object while the column still has room.
            if len(results[closest_field]) < nrows:
                results[closest_field] += [obj]
                added = True
        elif isinstance(obj, layout.LTTextLine) and \
                ((closest_field == TITable.OFFSET) or
                 (closest_field in [a.name for a in addrfield])):
            # The line may hold an offset and an address run together, each
            # ending in ")"; try to separate them.
            fields = [j for j in text.rsplit(")", 2) if len(j) > 0]
            if len(fields) == 2:
                fields = [f + ")" for f in fields]
                off = fields[0].strip()
                adr = fields[1].strip()
                if adr[0] == '+':
                    # A leading "+" is dropped from the address part.
                    # NOTE(review): the original comment said the "+" moves
                    # to the end of ``off``, but it is only removed here.
                    adr = adr[1:].strip()
                if TITable.OFFSET in t.col_info:
                    col1 = t.col_info[TITable.OFFSET]
                    col2 = addrfield[0]
                elif len(addrfield) == 2:
                    col1 = addrfield[0]
                    col2 = addrfield[1]
                else:
                    return False
                if col1.regex.match(off) and col2.regex.match(adr):
                    (oobj, aobj) = cls.split_text(obj, off, adr)
                    results[col1.name].append(oobj)
                    results[col2.name].append(aobj)
                    added = True
    return added