当前位置: 首页>>代码示例>>Python>>正文


Python tree_utils.get_node_text函数代码示例

本文整理汇总了Python中regparser.tree.xml_parser.tree_utils.get_node_text函数的典型用法代码示例。如果您正苦于以下问题:Python get_node_text函数的具体用法?Python get_node_text怎么用?Python get_node_text使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了get_node_text函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: table_xml_to_data

def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above.

    :param xml_node: GPOTABLE XML element
    :return: dict with 'header' (list of header rows, each a list of cell
        dicts with text/colspan/rowspan), 'rows' (list of lists of cell
        text), and optionally 'caption'
    """
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    header = [[] for _ in range(header_root.height())]

    def per_node(node):
        # Group header cells by their depth in the header tree
        header[node.level].append({'text': node.text,
                                   'colspan': node.colspan,
                                   'rowspan': node.rowspan})
    struct.walk(header_root, per_node)
    header = header[1:]     # skip the root

    rows = []
    for row in xml_node.xpath('./ROW'):
        rows.append([tree_utils.get_node_text(td, add_spaces=True).strip()
                     for td in row.xpath('./ENT')])

    table_data = {'header': header, 'rows': rows}

    caption_nodes = xml_node.xpath('./TTITLE')
    if caption_nodes:   # idiomatic truthiness rather than len()
        text = tree_utils.get_node_text(caption_nodes[0]).strip()
        table_data["caption"] = text

    return table_data
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:28,代码来源:formatting.py

示例2: test_get_node_text

    def test_get_node_text(self):
        """get_node_text should flatten mixed XML content; with
        add_spaces=True it should also pad emphasis tags and render
        T="52" (subscript) elements as _{...}."""
        # Without add_spaces: tags are dropped, no padding added
        text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc)

        # assertEquals is a deprecated alias; use assertEqual
        self.assertEqual('(a)Fruit.Apps, and pins', result)

        # With add_spaces: a space is inserted around emphasis content
        text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)

        self.assertEqual('(a) Fruit. Apps, and pins', result)

        # Pre-existing whitespace must not be doubled up
        text = '<P>(a) <E T="03">Fruit.</E> Apps, and pins</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)

        self.assertEqual('(a) Fruit. Apps, and pins', result)

        # T="52" emphasis renders as a subscript
        text = '<P>(a) ABC<E T="52">123</E>= 5</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)
        self.assertEqual('(a) ABC_{123} = 5', result)

        # Mixed keyterm emphasis and subscript in one paragraph
        text = '<P>(a) <E>Keyterm.</E> ABC<E T="52">123</E>= 5</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text(doc, add_spaces=True)
        self.assertEqual('(a) Keyterm. ABC_{123} = 5', result)

示例3: table_xml_to_plaintext

def table_xml_to_plaintext(xml_node):
    """Render a GPOTABLE XML node as a markdown-style table string.

    Many table layout options are ignored, so only simple tables come out
    accurately; the result is embedded in the regulation plain text."""
    headers = [tree_utils.get_node_text(cell, add_spaces=True).strip()
               for cell in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    body = [[tree_utils.get_node_text(cell, add_spaces=True).strip()
             for cell in tr.xpath('./ENT')]
            for tr in xml_node.xpath('./ROW')]
    all_rows = [headers, ['---'] * len(headers)] + body
    return '\n'.join('|' + '|'.join(cells) + '|' for cells in all_rows)
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:15,代码来源:formatting.py

示例4: build_header

def build_header(xml_nodes):
    """Construct a TableHeaderNode tree (rooted at an empty sentinel node),
    annotating every node with its colspan and rowspan."""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # sentinel root
    for xml_node in xml_nodes:
        depth = int(xml_node.attrib['H'])
        label = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(depth, TableHeaderNode(label, depth))

    # Collapse the stack down to just the root level
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    tree_height = root.height()

    def assign_rowspan(node):
        # Leaves stretch to the bottom of the header area
        node.rowspan = tree_height - node.height() - node.level + 1
    struct.walk(root, assign_rowspan)

    def assign_colspan(node):
        node.colspan = node.width()
    struct.walk(root, assign_colspan)

    return root
开发者ID:adderall,项目名称:regulations-parser,代码行数:25,代码来源:formatting.py

示例5: parse_amdpar

def parse_amdpar(par, initial_context):
    """ Parse the <AMDPAR> tags into a list of paragraphs that have changed.

    :param par: an <AMDPAR> lxml element
    :param initial_context: label context carried over from a previous
        AMDPAR; used to resolve otherwise-ambiguous references
    :return: (amendments, final_context) pair
    """

    #   Remove " and " from emphasized (E) titles; leaving it in would
    #   throw off and_token_resolution below
    for e in filter(lambda e: e.text, par.xpath('./E')):
        e.text = e.text.replace(' and ', ' ')
    text = get_node_text(par, add_spaces=True)
    # Scan the paragraph text for amendment tokens
    tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)]

    # Successive normalization passes over the token stream; the order of
    # these passes matters
    tokenized = compress_context_in_tokenlists(tokenized)
    tokenized = resolve_confused_context(tokenized, initial_context)
    tokenized = paragraph_in_context_moved(tokenized, initial_context)
    tokenized = remove_false_deletes(tokenized, text)
    tokenized = multiple_moves(tokenized)
    tokenized = switch_passive(tokenized)
    tokenized = and_token_resolution(tokenized)
    tokenized, subpart = deal_with_subpart_adds(tokenized)
    tokenized = context_to_paragraph(tokenized)
    tokenized = move_then_modify(tokenized)
    if not subpart:
        tokenized = separate_tokenlist(tokenized)
    initial_context = switch_context(tokenized, initial_context)
    tokenized, final_context = compress_context(tokenized, initial_context)
    amends = make_amendments(tokenized, subpart)
    return amends, final_context
开发者ID:EricSchles,项目名称:regulations-parser,代码行数:26,代码来源:diff.py

示例6: nodes_from_interp_p

def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars.

    Yields a Node for the paragraph's initial marker, then one Node per
    collapsed (inline) marker; any segment ending in '* * *' is followed
    by an INLINE_STARS placeholder node.
    """
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    #   -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    #   Node for this paragraph: text up to the first collapsed marker
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP, tagged_text=text_with_tags)
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    #   Collapsed-marker children: one node per inline marker
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            # NOTE(review): re-adds emphasis for a bare '1' marker --
            # presumably a keyterm-style marker; confirm against
            # collapsed_markers_matches
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])

示例7: process_inner_children

def process_inner_children(inner_stack, xml_node):
    """Treat the siblings following ``xml_node`` (up to, but excluding, the
    next title) as children of this interpretation. Mirrors the logic of
    reg_text.py:build_from_section()."""
    siblings = itertools.takewhile(
        lambda sib: not is_title(sib), xml_node.itersiblings())
    nodes = []
    for child in siblings:
        # Only paragraph and star elements contribute nodes
        if child.tag not in ('P', 'STARS'):
            continue
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        tagged = tree_utils.get_node_text_tags_preserved(child)
        marker = get_first_interp_marker(tagged)
        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not marker and nodes:
            # No marker found: fold this text into the preceding paragraph
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            prev = nodes[-1]
            prev.text += "\n\n" + node_text
            if prev.tagged_text:
                prev.tagged_text += "\n\n" + tagged
            else:
                prev.tagged_text = tagged
        else:
            nodes.extend(nodes_from_interp_p(child))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    add_nodes_to_stack(nodes, inner_stack)
开发者ID:eregs,项目名称:regulations-parser,代码行数:29,代码来源:gpo_cfr.py

示例8: derive_nodes

    def derive_nodes(self, xml, processor=None):
        """Wrap the element's children in a fenced code block, returning a
        single markerless Node."""
        lines = ["```" + self.fence_type(xml)]
        lines.extend(tree_utils.get_node_text(child).strip()
                     for child in xml)
        lines.append("```")
        return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
开发者ID:theresaanna,项目名称:regulations-parser,代码行数:7,代码来源:paragraph_processor.py

示例9: make_authority_instructions

def make_authority_instructions(auth_xml, cfr_part):
    """Build an `EREGS_INSTRUCTIONS` element wrapping a single AUTHORITY
    child (labeled with ``cfr_part``) whose text joins the paragraphs of
    ``auth_xml``."""
    instructions = etree.Element('EREGS_INSTRUCTIONS')
    authority = etree.SubElement(instructions, 'AUTHORITY', label=cfr_part)
    paragraphs = [get_node_text(p, add_spaces=True)
                  for p in auth_xml.xpath('./P')]
    authority.text = '\n'.join(paragraphs)
    return instructions
开发者ID:tadhg-ohiggins,项目名称:regulations-parser,代码行数:8,代码来源:amdparser.py

示例10: process

    def process(self, appendix, part):
        """Build a Node tree for ``appendix`` (an appendix XML element of
        CFR part ``part``) by dispatching on each child element's tag,
        then unwinding the accumulated node stack.

        Returns the root Node, or None if nothing was parsed.
        """
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None
        # holds collections of nodes until their depth is determined
        self.nodes = []

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            # A subheader is an HD without an initial paragraph marker (or
            # with a dotted one), or a P/FP carrying a title-label pair
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1]))
                    or (tag in ('P', 'FP')
                        and title_label_pair(text, self.appendix_letter,
                                             self.part)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            # NOTE(review): assumes every HD child has a SOURCE attribute;
            # a missing one would raise KeyError here
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                    or child.tag == 'RESERVED'):
                self.end_group()
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.end_group()
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                text = self.insert_dashes(child, text)
                self.paragraph_with_marker(
                    text,
                    tree_utils.get_node_text_tags_preserved(child))
            elif child.tag == 'SEQUENCE':
                # Sequences close the current group but keep its depth
                old_depth = self.depth
                self.end_group()
                self.depth = old_depth
                self.process_sequence(child)
            elif child.tag in ('P', 'FP'):
                text = self.insert_dashes(child, text)
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        self.end_group()
        # Collapse the stack so only the root level remains
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        # Return the root node, if any content was accumulated
        if self.m_stack.m_stack[0]:
            return self.m_stack.m_stack[0][0][1]
开发者ID:cfpb,项目名称:regulations-parser,代码行数:58,代码来源:appendices.py

示例11: process_appendix

def process_appendix(m_stack, current_section, child):
    """Parse an appendix XML element into Nodes pushed onto ``m_stack``.

    HD children become appendix-section Nodes; P children are split on
    their paragraph markers and placed at a depth derived from the marker.
    """
    html_parser = HTMLParser.HTMLParser()  # NOTE(review): appears unused here

    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)

            # Fall back to the next sequential section number
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)

            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)

            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            # Paragraph text plus the tails of inline children (e.g. <E>)
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)

            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) > 0:
                if len(markers_list) > 1:
                    # Multiple markers in one paragraph: split the text so
                    # each marker gets its own node
                    actual_markers = ['(%s)' % m for m in markers_list]
                    node_text = tree_utils.split_text(
                        node_text, actual_markers)
                else:
                    node_text = [node_text]

                for m, node_text in zip(markers_list, node_text):
                    n = Node(
                        node_text, label=[str(m)], node_type=Node.APPENDIX)

                    last = m_stack.peek()
                    node_level = determine_level(m, last[0][0])

                    if m == 'i':
                        # This is a bit of a hack, since we can't easily
                        # distinguish between the Roman numeral (i) and the
                        # letter (i) to determine the level. We look ahead
                        # at the next paragraph's markers to help. This is
                        # not a complete solution and we should circle back
                        # at some point.

                        next_text = ' '.join(
                            [ch.getnext().text] +
                            [c.tail for c in ch.getnext() if c.tail])

                        next_markers = tree_utils.get_paragraph_markers(
                            next_text)

                        if next_markers[0] == 'ii':
                            node_level = 5
                    tree_utils.add_to_stack(m_stack, node_level, n)
            else:
                # No marker at all: append to the previous node's text
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
开发者ID:dclegalhackers,项目名称:regulations-parser,代码行数:58,代码来源:appendices.py

示例12: set_letter

 def set_letter(self, appendix):
     """Determine the appendix letter from the appendix's header(s), store
     it on ``self``, and return it."""
     for header in appendix_headers(appendix):
         header_text = tree_utils.get_node_text(header)
         if self.appendix_letter:
             # More than one header matched -- the last one wins, but warn
             logger.warning("Found two appendix headers: %s and %s",
                            self.appendix_letter, header_text)
         self.appendix_letter = grammar.headers.parseString(
             header_text).appendix
     return self.appendix_letter
开发者ID:eregs,项目名称:regulations-parser,代码行数:9,代码来源:appendices.py

示例13: derive_nodes

    def derive_nodes(self, xml, processor=None):
        """Emit a single markerless Node containing the element's children
        rendered as a fenced code block; blank children are skipped."""
        language = xml.get('LANGUAGE', 'code')
        lines = ["```" + language]
        for child in xml:
            child_text = tree_utils.get_node_text(child).strip()
            if child_text:
                lines.append(child_text)
        lines.append("```")
        return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:9,代码来源:paragraph_processor.py

示例14: process

    def process(self, appendix, part):
        """Parse ``appendix`` (an appendix XML element of CFR part ``part``)
        into a Node tree by dispatching on each child element's tag, then
        unwinding the node stack.

        Returns the root Node with transient ``p_level`` attributes
        stripped, or None if nothing was parsed.
        """
        self.m_stack = tree_utils.NodeStack()

        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        def is_subhead(tag, text):
            # A subheader is an HD without an initial paragraph marker (or
            # with a dotted one), or a P/FP carrying a title-label pair
            initial = initial_marker(text)
            return ((tag == 'HD' and (not initial or '.' in initial[1]))
                    or (tag in ('P', 'FP')
                        and title_label_pair(text, self.appendix_letter)))

        for child in appendix.getchildren():
            text = tree_utils.get_node_text(child, add_spaces=True).strip()
            if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                    or child.tag == 'RESERVED'):
                self.hed(part, text)
            elif is_subhead(child.tag, text):
                self.subheader(child, text)
            elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
                # Peek ahead to the next marked paragraph so the current
                # text can be split on its embedded markers
                if child.getnext() is None:
                    next_text = ''
                else:
                    next_text = self.find_next_text_with_marker(
                        child.getnext()) or ''
                texts = self.split_paragraph_text(text, next_text)
                for text, next_text in zip(texts, texts[1:]):
                    self.paragraph_with_marker(text, next_text)
            elif child.tag in ('P', 'FP'):
                self.paragraph_no_marker(text)
            elif child.tag == 'GPH':
                self.graphic(child)
            elif child.tag == 'GPOTABLE':
                self.table(child)
            elif child.tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif child.tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        # Collapse the stack so only the root level remains
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        if self.m_stack.m_stack[0]:
            root = self.m_stack.m_stack[0][0][1]

            def per_node(n):
                # p_level is only needed during parsing; drop it
                if hasattr(n, 'p_level'):
                    del n.p_level

            walk(root, per_node)
            return root
开发者ID:khandelwal,项目名称:regulations-parser,代码行数:56,代码来源:appendices.py

示例15: set_letter

 def set_letter(self, appendix):
     """Scan the appendix's children for its header element, record the
     appendix letter found there, and return it."""
     matching = (child for child in appendix.getchildren()
                 if is_appendix_header(child))
     for header in matching:
         header_text = tree_utils.get_node_text(header)
         if self.appendix_letter:
             # Multiple headers found: the last one wins, but warn about it
             logging.warning("Found two appendix headers: %s and %s",
                             self.appendix_letter, header_text)
         self.appendix_letter = headers.parseString(header_text).appendix
     return self.appendix_letter
开发者ID:khandelwal,项目名称:regulations-parser,代码行数:10,代码来源:appendices.py


注:本文中的regparser.tree.xml_parser.tree_utils.get_node_text函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。