本文整理汇总了Python中regparser.tree.xml_parser.tree_utils.get_node_text函数的典型用法代码示例。如果您正苦于以下问题:Python get_node_text函数的具体用法?Python get_node_text怎么用?Python get_node_text使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了get_node_text函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: table_xml_to_data
def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above"""
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    header = [[] for _ in range(header_root.height())]

    def collect_header(node):
        # Bucket each header cell by its level in the header tree
        header[node.level].append({'text': node.text,
                                   'colspan': node.colspan,
                                   'rowspan': node.rowspan})
    struct.walk(header_root, collect_header)
    header = header[1:]     # drop the synthetic root level
    rows = [[tree_utils.get_node_text(ent, add_spaces=True).strip()
             for ent in row.xpath('./ENT')]
            for row in xml_node.xpath('./ROW')]
    table_data = {'header': header, 'rows': rows}
    caption_nodes = xml_node.xpath('./TTITLE')
    if caption_nodes:
        table_data["caption"] = tree_utils.get_node_text(
            caption_nodes[0]).strip()
    return table_data
示例2: test_get_node_text
def test_get_node_text(self):
    """get_node_text flattens XML into plain text; with add_spaces=True it
    also normalizes spacing around <E> emphasis tags and renders T="52"
    subscripts in the _{...} form."""
    # Without add_spaces, the pieces are concatenated as-is
    doc = etree.fromstring(
        '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>')
    # assertEqual, not the deprecated assertEquals (removed in Python 3.12)
    self.assertEqual('(a)Fruit.Apps, and pins',
                     tree_utils.get_node_text(doc))
    # With add_spaces=True, spacing is normalized around emphasis tags
    spaced_cases = [
        ('<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>',
         '(a) Fruit. Apps, and pins'),
        ('<P>(a) <E T="03">Fruit.</E> Apps, and pins</P>',
         '(a) Fruit. Apps, and pins'),
        ('<P>(a) ABC<E T="52">123</E>= 5</P>',
         '(a) ABC_{123} = 5'),
        ('<P>(a) <E>Keyterm.</E> ABC<E T="52">123</E>= 5</P>',
         '(a) Keyterm. ABC_{123} = 5'),
    ]
    for text, expected in spaced_cases:
        doc = etree.fromstring(text)
        self.assertEqual(expected,
                         tree_utils.get_node_text(doc, add_spaces=True))
示例3: table_xml_to_plaintext
def table_xml_to_plaintext(xml_node):
    """Markdown representation of a table. Note that this doesn't account
    for all the options needed to display the table properly, but works fine
    for simple tables. This gets included in the reg plain text"""
    def cell_text(node):
        # Normalized plain text for a single header/data cell
        return tree_utils.get_node_text(node, add_spaces=True).strip()

    header = [cell_text(hd)
              for hd in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    divider = ['---'] * len(header)
    rows = [[cell_text(td) for td in tr.xpath('./ENT')]
            for tr in xml_node.xpath('./ROW')]
    lines = ['|' + '|'.join(row) + '|'
             for row in [header, divider] + rows]
    return '\n'.join(lines)
示例4: build_header
def build_header(xml_nodes):
    """Builds a TableHeaderNode tree, with an empty root. Each node in the
    tree includes its colspan/rowspan"""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # synthetic root above level 1
    for xml_node in xml_nodes:
        depth = int(xml_node.attrib['H'])
        label = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(depth, TableHeaderNode(label, depth))
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]
    tree_height = root.height()

    def assign_rowspan(node):
        # A cell stretches down so every column reaches the deepest level
        node.rowspan = tree_height - node.height() - node.level + 1
    struct.walk(root, assign_rowspan)

    def assign_colspan(node):
        node.colspan = node.width()
    struct.walk(root, assign_colspan)
    return root
示例5: parse_amdpar
def parse_amdpar(par, initial_context):
    """Parse the <AMDPAR> tags into a list of paragraphs that have
    changed."""
    # Strip " and " inside <E> titles; it would otherwise throw off
    # and_token_resolution
    for em in par.xpath('./E'):
        if em.text:
            em.text = em.text.replace(' and ', ' ')
    text = get_node_text(par, add_spaces=True)
    tokenized = [match[0]
                 for match, _, _ in amdpar.token_patterns.scanString(text)]
    # Successive normalization passes over the token stream; order matters
    tokenized = compress_context_in_tokenlists(tokenized)
    tokenized = resolve_confused_context(tokenized, initial_context)
    tokenized = paragraph_in_context_moved(tokenized, initial_context)
    tokenized = remove_false_deletes(tokenized, text)
    tokenized = multiple_moves(tokenized)
    tokenized = switch_passive(tokenized)
    tokenized = and_token_resolution(tokenized)
    tokenized, subpart = deal_with_subpart_adds(tokenized)
    tokenized = context_to_paragraph(tokenized)
    tokenized = move_then_modify(tokenized)
    if not subpart:
        tokenized = separate_tokenlist(tokenized)
    initial_context = switch_context(tokenized, initial_context)
    tokenized, final_context = compress_context(tokenized, initial_context)
    return make_amendments(tokenized, subpart), final_context
示例6: nodes_from_interp_p
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars"""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)
    # Segment boundaries: each collapsed marker's text runs from just
    # before its match to just before the next (-2 accounts for matching
    # the character + period)
    boundaries = [m.end() - 2 for m in collapsed] + [len(node_text)]
    # Node for this paragraph itself
    head = Node(node_text[:boundaries[0]], label=[first_marker],
                node_type=Node.INTERP, tagged_text=text_with_tags)
    yield head
    if head.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])
    # One child node per collapsed marker
    for idx, match in enumerate(collapsed):
        marker = match.group(1)
        if marker == '1':
            marker = '<E T="03">1</E>'
        child = Node(node_text[boundaries[idx]:boundaries[idx + 1]],
                     label=[marker], node_type=Node.INTERP)
        yield child
        if child.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
示例7: process_inner_children
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    following = itertools.takewhile(lambda el: not is_title(el),
                                    xml_node.itersiblings())
    nodes = []
    for child in following:
        if child.tag not in ('P', 'STARS'):
            continue
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)
        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # No marker found: treat this as a continuation of the
            # previous paragraph rather than a new node
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if previous.tagged_text:
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            nodes.extend(nodes_from_interp_p(child))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()
    add_nodes_to_stack(nodes, inner_stack)
示例8: derive_nodes
def derive_nodes(self, xml, processor=None):
    """Render this element's children as a single fenced, markerless Node"""
    body = [tree_utils.get_node_text(child).strip() for child in xml]
    lines = ["```" + self.fence_type(xml)] + body + ["```"]
    return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
示例9: make_authority_instructions
def make_authority_instructions(auth_xml, cfr_part):
    """Creates an `EREGS_INSTRUCTIONS` element specific to the authority
    information"""
    instructions = etree.Element('EREGS_INSTRUCTIONS')
    authority = etree.SubElement(instructions, 'AUTHORITY', label=cfr_part)
    paragraphs = [get_node_text(p, add_spaces=True)
                  for p in auth_xml.xpath('./P')]
    authority.text = '\n'.join(paragraphs)
    return instructions
示例10: process
def process(self, appendix, part):
    """Build a Node tree from an appendix XML element, dispatching each
    child element to the appropriate handler. Returns the root Node, or
    None if nothing was produced."""
    self.m_stack = tree_utils.NodeStack()
    self.part = part
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # Subheaders: HDs without an initial marker (or with a dotted
        # one), or paragraphs that carry a title+label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1]))
                or (tag in ('P', 'FP')
                    and title_label_pair(text, self.appendix_letter,
                                         self.part)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        # .get() rather than attrib[...]: HD elements without a SOURCE
        # attribute would otherwise raise KeyError
        if ((child.tag == 'HD' and child.get('SOURCE') == 'HED')
                or child.tag == 'RESERVED'):
            self.end_group()
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.end_group()
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text,
                tree_utils.get_node_text_tags_preserved(child))
        elif child.tag == 'SEQUENCE':
            # Sequences close the current group but should not reset the
            # working depth
            old_depth = self.depth
            self.end_group()
            self.depth = old_depth
            self.process_sequence(child)
        elif child.tag in ('P', 'FP'):
            text = self.insert_dashes(child, text)
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))
    self.end_group()
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        return self.m_stack.m_stack[0][0][1]
示例11: process_appendix
def process_appendix(m_stack, current_section, child):
    """Walk an appendix XML element, pushing section-header and marked
    paragraph Nodes onto the node stack. Unmarked paragraph text is
    appended to the most recent node."""
    # NOTE: removed the unused `html_parser = HTMLParser.HTMLParser()`
    # local; nothing in this function referenced it
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)
            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)
            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)
            node_text = tree_utils.get_node_text(ch)
            if len(markers_list) > 0:
                if len(markers_list) > 1:
                    actual_markers = ['(%s)' % m for m in markers_list]
                    node_texts = tree_utils.split_text(
                        node_text, actual_markers)
                else:
                    node_texts = [node_text]
                for m, node_text in zip(markers_list, node_texts):
                    n = Node(
                        node_text, label=[str(m)], node_type=Node.APPENDIX)
                    last = m_stack.peek()
                    node_level = determine_level(m, last[0][0])
                    if m == 'i':
                        # This is a bit of a hack: we can't easily
                        # distinguish between the Roman numeral (i) and
                        # the letter (i) when determining the level, so
                        # we look ahead at the next paragraph's markers.
                        # Not a complete solution; circle back later.
                        next_text = ' '.join(
                            [ch.getnext().text] +
                            [c.tail for c in ch.getnext() if c.tail])
                        next_markers = tree_utils.get_paragraph_markers(
                            next_text)
                        # Guard against a marker-less next paragraph,
                        # which would make next_markers[0] raise
                        if next_markers and next_markers[0] == 'ii':
                            node_level = 5
                    tree_utils.add_to_stack(m_stack, node_level, n)
            else:
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
示例12: set_letter
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    for header in appendix_headers(appendix):
        header_text = tree_utils.get_node_text(header)
        if self.appendix_letter:
            # More than one header: warn, then let the last one win
            logger.warning("Found two appendix headers: %s and %s",
                           self.appendix_letter, header_text)
        self.appendix_letter = grammar.headers.parseString(
            header_text).appendix
    return self.appendix_letter
示例13: derive_nodes
def derive_nodes(self, xml, processor=None):
    """Render a CODE element as a single fenced, markerless Node, skipping
    children whose text is empty after stripping"""
    lines = ["```" + xml.get('LANGUAGE', 'code')]
    for child in xml:
        stripped = tree_utils.get_node_text(child).strip()
        if stripped:
            lines.append(stripped)
    lines.append("```")
    return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
示例14: process
def process(self, appendix, part):
    """Build a Node tree from an appendix XML element. Returns the root
    Node (with the temporary p_level attribute stripped), or None if no
    content was produced."""
    self.m_stack = tree_utils.NodeStack()
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # Subheaders: HDs without an initial marker (or with a dotted
        # one), or paragraphs that carry a title+label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1]))
                or (tag in ('P', 'FP')
                    and title_label_pair(text, self.appendix_letter)))

    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        # .get() rather than attrib[...]: HD elements without a SOURCE
        # attribute would otherwise raise KeyError
        if ((child.tag == 'HD' and child.get('SOURCE') == 'HED')
                or child.tag == 'RESERVED'):
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            if child.getnext() is None:
                next_text = ''
            else:
                next_text = self.find_next_text_with_marker(
                    child.getnext()) or ''
            texts = self.split_paragraph_text(text, next_text)
            # Pair each chunk with its successor so the handler can see
            # what follows it
            for text, next_text in zip(texts, texts[1:]):
                self.paragraph_with_marker(text, next_text)
        elif child.tag in ('P', 'FP'):
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        root = self.m_stack.m_stack[0][0][1]

        def per_node(n):
            # p_level is working state from depth resolution; strip it
            # from the finished tree
            if hasattr(n, 'p_level'):
                del n.p_level
        walk(root, per_node)
        return root
示例15: set_letter
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    header_nodes = (ch for ch in appendix.getchildren()
                    if is_appendix_header(ch))
    for header in header_nodes:
        header_text = tree_utils.get_node_text(header)
        if self.appendix_letter:
            # More than one header: warn, then let the last one win
            logging.warning("Found two appendix headers: %s and %s",
                            self.appendix_letter, header_text)
        self.appendix_letter = headers.parseString(header_text).appendix
    return self.appendix_letter