当前位置: 首页>>代码示例>>Python>>正文


Python layout.LTTextLine方法代码示例

本文整理汇总了 Python 中 pdfminer.layout.LTTextLine 类的典型用法代码示例。如果您正苦于以下问题：Python layout.LTTextLine 类的具体用法？Python layout.LTTextLine 怎么用？Python layout.LTTextLine 使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 pdfminer.layout 的用法示例。


在下文中一共展示了 layout.LTTextLine 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: convert_pdf_to_txt

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The concatenation of ``get_text()`` for every top-level
        ``LTTextBox`` / ``LTTextLine`` object, in page order.
    """
    chunks = []
    # The parser is handed the file object, so the file stays open until all
    # pages have been processed; ``with`` closes it even if parsing raises.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            # Only top-level text containers are read; nested objects
            # (e.g. inside figures) are not descended into.
            chunks.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    return ''.join(chunks)
开发者ID:opensourcesec,项目名称:Forager,代码行数:22,代码来源:pdfConverter.py

示例2: split_text

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def split_text(cls, line, text1, text2):
        """Split one text line (or a box wrapping one) into two objects.

        If *line* is not an ``LTTextLine`` it is treated as a container and
        its first child (``_objs[0]``) is the line that gets split.  The line
        is shallow-cloned, then ``cls.strip_text_line`` is applied to the
        original with *text1* and to the clone with *text2*.

        Returns:
            ``(first, second)``: the two stripped lines, or, when a container
            was passed, the original container and a shallow clone of it,
            each holding exactly one of the stripped lines.
        """
        container = None
        if not isinstance(line, layout.LTTextLine):
            container = line
            line = container._objs[0]

        # Clone without running __init__; attributes are copied shallowly.
        twin = object.__new__(line.__class__)
        twin.__dict__ = dict(line.__dict__)

        first_part = cls.strip_text_line(line, text1)
        second_part = cls.strip_text_line(twin, text2)

        if container is None:
            return (first_part, second_part)

        container_twin = object.__new__(container.__class__)
        container_twin.__dict__ = dict(container.__dict__)
        container._objs = [first_part]
        container_twin._objs = [second_part]
        return (container, container_twin)
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:19,代码来源:parse_am37x_register_tables.py

示例3: parse_layout

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def parse_layout(self, layout):
        """Walk a layout tree depth-first and sort its objects into buckets.

        Containers (figures, text boxes, generic text lines) are expanded in
        document order.  Horizontal text lines go to ``self.texts``, images to
        ``self.images``, and thin rectangles to ``self.verticals`` (width
        below 1.0) or ``self.horizontals`` (height below 1.0).  Other
        rectangles, curves, characters and lines are ignored.

        Args:
            layout: Iterable of layout objects (e.g. a page layout).

        Raises:
            AssertionError: If an object of an unrecognized type is found.
        """
        # Reversed so that pop() yields the objects in their original order.
        pending = list(reversed(list(layout)))
        while pending:
            obj = pending.pop()
            # Exact type comparison is kept on purpose: isinstance() would
            # also match subclasses and change which branch an object takes.
            kind = type(obj)
            if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
                pending.extend(reversed(list(obj)))
            elif kind is LTTextLineHorizontal:
                self.texts.append(obj)
            elif kind is LTRect:
                if obj.width < 1.0:
                    self._adjust_to_close(obj, self.verticals, 'x0')
                    self.verticals.append(obj)
                elif obj.height < 1.0:
                    self._adjust_to_close(obj, self.horizontals, 'y0')
                    self.horizontals.append(obj)
            elif kind is LTImage:
                self.images.append(obj)
            elif kind in (LTCurve, LTChar, LTLine):
                pass
            else:
                # Raised explicitly (instead of ``assert False``) so the check
                # is not stripped when Python runs with -O.
                raise AssertionError("Unrecognized type: %s" % kind)
开发者ID:johnlinp,项目名称:pdf-to-markdown,代码行数:27,代码来源:pile.py

示例4: elem_type

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def elem_type(elem):
    """Return a short label for the kind of layout element *elem*.

    The label is "line", "curve", "text" or "figure" for the first matching
    class below, and "unkown" (sic) when none matches.
    """
    # Checked in order; the first class that matches decides the label.
    labelled_classes = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for klass, label in labelled_classes:
        if isinstance(elem, klass):
            return label
    return "unkown"
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:12,代码来源:node.py

示例5: __str__

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def __str__(self, *args, **kwargs):
        """Join the UTF-8 encoded text of every text line in ``self.elems``.

        Elements that are not ``LTTextLine`` instances are skipped; the
        encoded pieces are joined with tab characters.
        """
        encoded_texts = []
        for elem in self.elems:
            if isinstance(elem, LTTextLine):
                encoded_texts.append(elem.get_text().encode("utf8", "replace"))
        return "\t".join(encoded_texts)


#############################################
#    Static utilities
############################################# 
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:13,代码来源:node.py

示例6: _split_text_n_lines

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def _split_text_n_lines(elems):
    """Partition *elems* into text lines and drawn lines.

    Returns:
        ``(texts, lines)``: the ``LTTextLine`` elements and the ``LTLine``
        elements, each in input order.  Anything else is dropped.
    """
    text_elems, line_elems = [], []
    for elem in elems:
        if isinstance(elem, LTTextLine):
            bucket = text_elems
        elif isinstance(elem, LTLine):
            bucket = line_elems
        else:
            continue
        bucket.append(elem)
    return text_elems, line_elems
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:11,代码来源:node.py

示例7: _left_bar

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def _left_bar(content, default_val):
    """Yield, per item of *content*, the latest value that is not a text line.

    *content* is an iterable of ``(coord, value)`` pairs.  Until a value that
    is not an ``LTTextLine`` has been seen, *default_val* is yielded.
    """
    current_bar = default_val
    for _, entry in content:
        current_bar = current_bar if isinstance(entry, LTTextLine) else entry
        yield current_bar
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:8,代码来源:node.py

示例8: _row_str

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def _row_str(row_content):
    """Render one row as a tab-separated string.

    Each cell is rendered as: "None" for ``None``; the UTF-8 encoded text for
    an ``LTTextLine``; "|" for any number; ``str(cell)`` otherwise.  A tuple
    cell is first unpacked as a 2-tuple and its second item is rendered.
    """

    def render(cell):
        if cell is None:
            return "None"
        if isinstance(cell, tuple):
            cell = cell[1] if len(cell) == 2 else _unpack_pair(cell)
        if isinstance(cell, LTTextLine):
            return cell.get_text().encode("utf8", "replace")
        return "|" if isinstance(cell, numbers.Number) else str(cell)

    def _unpack_pair(pair):
        # Raises the same ValueError as a failed two-name unpacking would.
        _first, second = pair
        return second

    return "\t".join(map(render, row_content))
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:15,代码来源:node.py

示例9: parse_layout

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def parse_layout(elems, font_stat, combine=False):
    """
    Tag every mention and figure with grid-cell features, then cluster the
    mentions into vertically aligned groups.

    Each mention/figure gets an ``id``, a ``feats`` defaultdict, its centre
    (``xc``, ``yc``) and the grid cells of ``x0``, ``x1``, ``xc`` and ``yc``
    (cell size is half the most common font size).  For text lines with a
    font name, the feature keys are prefixed with "<font_name>-<font_size>-".

    Returns the ``(tbls, tbl_features)`` pair produced by
    ``cluster_vertically_aligned_boxes``.
    """
    segments = elems.segments
    curves = elems.curves
    figures = elems.figures
    page_width = elems.layout.width
    mentions = elems.mentions
    avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
    width = get_page_width(mentions + segments + figures + curves)
    char_width = get_char_width(mentions)
    grid_size = avg_font_pts / 2.0

    for index, item in enumerate(mentions + elems.figures):
        item.id = index
        item.feats = defaultdict(bool)
        if isinstance(item, LTTextLine) and item.font_name:
            tag = item.font_name + "-" + str(item.font_size) + "-"
        else:
            tag = ""
        item.xc = (item.x0 + item.x1) / 2.0
        item.yc = (item.y0 + item.y1) / 2.0
        # Same value is stored both as a feature and as "<axis>_grid".
        for axis in ("x0", "x1", "xc", "yc"):
            cell = getattr(item, axis) // grid_size
            item.feats[tag + axis] = cell
            setattr(item, axis + "_grid", cell)

    tbls, tbl_features = cluster_vertically_aligned_boxes(
        mentions,
        elems.layout.bbox,
        avg_font_pts,
        width,
        char_width,
        segments,
        curves,
        figures,
        page_width,
        combine,
    )
    return tbls, tbl_features
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:43,代码来源:pdf_parsers.py

示例10: parse_text

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def parse_text(layout):
    """Recursively collect ``(bbox, text)`` for each non-blank text line.

    Objects that are not iterable contribute nothing.  Text is stripped, and
    lines whose stripped text is empty are skipped.
    """
    if not hasattr(layout, '__iter__'):
        return []
    collected = []
    for node in layout:
        if not isinstance(node, LTTextLine):
            collected.extend(parse_text(node))
            continue
        box = node.bbox
        stripped = node.get_text().strip()
        if stripped != '':
            collected.append((box, stripped))
    return collected
开发者ID:thomas0809,项目名称:GraphIE,代码行数:16,代码来源:parse_pdf.py

示例11: _process_layout

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def _process_layout(self, layout):
        """Turn a layout into a flat list of ``Paragraph`` elements.

        Every text box or text line becomes one ``Paragraph`` built from its
        stripped text; figures are processed recursively and their paragraphs
        are added in place.  Other objects are ignored.
        """
        paragraphs = []
        for item in layout:
            if isinstance(item, (LTTextBox, LTTextLine)):
                paragraphs.append(Paragraph(item.get_text().strip()))
                continue
            if isinstance(item, LTFigure):
                # Figures may contain further layout objects.
                paragraphs.extend(self._process_layout(item))
        return paragraphs
开发者ID:mcs07,项目名称:ChemDataExtractor,代码行数:13,代码来源:pdf.py

示例12: try_add_field

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
        """Try to add *obj* to *results*, descending into text boxes.

        A text line is passed straight to ``cls._try_add``.  A text box is
        tried as a whole first; only if that fails is each of its children
        tried recursively.  Any other object is ignored.  Returns ``None``.
        """
        if isinstance(obj, layout.LTTextLine):
            cls._try_add(t, obj, results, nrows, nameoffset)
            return
        if not isinstance(obj, layout.LTTextBox):
            return
        if cls._try_add(t, obj, results, nrows, nameoffset):
            return
        # The box as a whole was rejected, so fall back to its children.
        for child in obj:
            cls.try_add_field(t, child, results, nrows, nameoffset)
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:9,代码来源:parse_am37x_register_tables.py

示例13: count_rows

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def count_rows(cls, t, o, offset=0):
        """Count the text lines under *o* that look like name-column entries.

        A text line counts as 1 when its left edge (``bbox[0] + offset``) is
        within 0.2 of the name column's ``l`` and its entry text matches the
        name column's regex.  A text box yields the sum over its children;
        any other object yields 0.
        """
        name_col = t.col_info[TITable.NAME]
        if isinstance(o, layout.LTTextBox):
            return sum(cls.count_rows(t, child, offset) for child in o)
        if not isinstance(o, layout.LTTextLine):
            return 0
        entry = cls.get_entry_text(o)
        left_aligned = abs(name_col.l - (o.bbox[0] + offset)) < 0.2
        if left_aligned and name_col.regex.match(entry):
            return 1
        return 0
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:15,代码来源:parse_am37x_register_tables.py

示例14: get_alignment_features

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def get_alignment_features(line_bboxes, elems, font_stat):
    """Compute alignment features for each entry of *line_bboxes*.

    For every entry, a region is built from its items 4, 3, 6 and 5 (in that
    order).  The mentions, segments, figures and curves intersecting that
    region are collected, grid features are attached to the mentions and to
    all figures of *elems*, and the mentions are clustered with
    ``cluster_vertically_aligned_boxes``.

    Returns:
        A list with one item per entry of *line_bboxes*: the cluster
        features, or a list of 17 zeros when no mention intersects the
        region or the clustering returns no features.
    """
    features_per_line = []
    for raw_bbox in line_bboxes:
        region = (raw_bbox[4], raw_bbox[3], raw_bbox[6], raw_bbox[5])

        def overlapping(candidates):
            return [cand for cand in candidates if intersect(region, cand.bbox)]

        mentions = overlapping(elems.mentions)
        segments = overlapping(elems.segments)
        figures = overlapping(elems.figures)
        curves = overlapping(elems.curves)
        page_width = elems.layout.width
        avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        width = get_page_width(mentions + segments + figures + curves)
        if not mentions:
            features_per_line.append([0] * 17)
            continue

        char_width = get_char_width(mentions)
        grid_size = avg_font_pts / 2.0
        for index, item in enumerate(mentions + elems.figures):
            item.id = index
            item.feats = defaultdict(bool)
            if isinstance(item, LTTextLine) and item.font_name:
                tag = item.font_name + "-" + str(item.font_size) + "-"
            else:
                tag = ""
            item.xc = (item.x0 + item.x1) / 2.0
            item.yc = (item.y0 + item.y1) / 2.0
            # Same value is stored both as a feature and as "<axis>_grid".
            for axis in ("x0", "x1", "xc", "yc"):
                cell = getattr(item, axis) // grid_size
                item.feats[tag + axis] = cell
                setattr(item, axis + "_grid", cell)

        _nodes, nodes_features = cluster_vertically_aligned_boxes(
            mentions,
            elems.layout.bbox,
            avg_font_pts,
            width,
            char_width,
            segments,
            curves,
            figures,
            page_width,
            True,
        )
        if len(nodes_features) == 0:
            features_per_line.append([0] * 17)
        else:
            features_per_line.append(nodes_features)
    return features_per_line
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:57,代码来源:features.py

示例15: _try_add

# 需要导入模块: from pdfminer import layout [as 别名]
# 或者: from pdfminer.layout import LTTextLine [as 别名]
def _try_add(cls, t, obj, results, nrows, nameoffset):
        """Try to file *obj* under the table column whose centre is closest.

        Args:
            t: Table description; ``t.col_info`` maps field names to column
                info objects (their ``l``, ``c``, ``regex``, ``typ`` and
                ``name`` attributes are read here).
            obj: Layout object to place; its ``bbox`` is read.
            results: Mapping from field name to the list of objects added so
                far; updated in place.
            nrows: Maximum number of entries allowed per field.
            nameoffset: Horizontal offset applied for the NAME column.

        Returns:
            True if *obj* (or the two halves it was split into) was added to
            *results*, False otherwise.

        NOTE(review): uses ``sys.maxint``, ``dict.iteritems`` and
        ``dict.itervalues``, which exist only on Python 2.
        """
        if obj.bbox[0] < ((t.col_info[TITable.NAME].l - nameoffset)- 0.5): # don't consider items that are past the left of the table
            return False

        text = cls.get_entry_text(obj)
        added = False
        center = cls.calculate_center(obj)
        closest_field = None
        min_diff = sys.maxint
        field_info = None
        # Pick the column whose centre (info.c) is nearest to the object's.
        for (field, info) in t.col_info.iteritems():
            if field == TITable.NAME:
                # NOTE(review): this shifts `center` in place, so the shift
                # also applies to every field iterated after NAME; confirm
                # that this is intended.
                center -= nameoffset
            diff = abs(center - info.c)
            if diff < min_diff:
                min_diff = diff
                closest_field = field
                field_info = info

        #print "%s closest to %s (%s)" % (obj, closest_field, field_info.regex.pattern)

        # Columns whose type is ADDRESS.
        addrfield = [j for j in t.col_info.itervalues() if j.typ == TITable.ADDRESS]
        if isinstance(obj, layout.LTText):
            text = cls.get_entry_text(obj)
            if field_info.regex.search(text):
                # The text fits the closest column; add it unless that column
                # already holds nrows entries.
                if len(results[closest_field]) >= nrows:
                    added = False
                else:
                    results[closest_field] += [obj]
                    added = True
            # Otherwise, a text line closest to the OFFSET column or to an
            # address column may hold two values run together; try to split it.
            elif isinstance(obj, layout.LTTextLine) and \
                            ((closest_field == TITable.OFFSET) or \
                             (closest_field in [a.name for a in addrfield])):

                # Split at the last (up to two) ")" and drop empty pieces.
                fields = [j for j in text.rsplit(")", 2) if len(j) > 0]
                if len(fields) == 2:
                    # Restore the ")" that rsplit removed from each piece.
                    fields = [f+")" for f in fields]
                    off = fields[0].strip()
                    adr = fields[1].strip()
                    # NOTE(review): the leading '+' is removed from adr but is
                    # not appended to off, despite what the comment below says.
                    if adr[0] == '+':  # move + to end of off if @ start of adr
                        adr = adr[1:].strip()
                    # Choose the two target columns: OFFSET plus the first
                    # address column, or else the two address columns.
                    if TITable.OFFSET in t.col_info:
                        col1 = t.col_info[TITable.OFFSET]
                        col2 = addrfield[0]
                    elif len(addrfield) == 2:
                        col1 = addrfield[0]
                        col2 = addrfield[1]
                    else:
                        return False

                    if col1.regex.match(off) \
                       and col2.regex.match(adr):
                        #print "splitting objects"
                        (oobj, aobj) = cls.split_text(obj, off, adr)
                        # TODO: split text into two objects
                        results[col1.name].append(oobj)
                        results[col2.name].append(aobj)
                        added = True
        return added 
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:61,代码来源:parse_am37x_register_tables.py


注：本文中的 pdfminer.layout.LTTextLine 类示例由纯净天空整理自 GitHub/MSDocs 等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的 License；未经允许，请勿转载。