本文整理汇总了Python中nlmmanipulate.NlmManipulate.load_dom_tree方法的典型用法代码示例。如果您正苦于以下问题：Python NlmManipulate.load_dom_tree方法的具体用法？Python NlmManipulate.load_dom_tree怎么用？Python NlmManipulate.load_dom_tree使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nlmmanipulate.NlmManipulate的用法示例。
在下文中一共展示了NlmManipulate.load_dom_tree方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run(self):
    """Make sure every element named in the list below carries an ``id``.

    The DOM tree is loaded through ``NlmManipulate``; for each element name
    all matching nodes are visited and any node without an ``id`` attribute
    receives ``ID<uuid4>``. The tree is then written to
    ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
    """
    tag_names = ['abbrev', 'abstract', 'ack', 'address', 'aff', 'alt-text', 'app', 'app-group', 'array',
                 'article-title', 'attrib', 'author-comment', 'author-notes', 'award-group', 'bio', 'boxed-text',
                 'caption', 'chem-struct', 'chem-struct-wrap', 'col', 'colgroup', 'collab', 'compound-kwd',
                 'contrib', 'contrib-group', 'corresp', 'custom-meta', 'def', 'def-item', 'def-list', 'disp-formula',
                 'disp-formula-group', 'disp-quote', 'element-citation', 'ext-link', 'fig', 'fig-group', 'fn',
                 'fn-group', 'funding-source', 'glossary', 'glyph-data', 'graphic', 'inline-formula',
                 'inline-graphic', 'inline-supplementary-material', 'institution', 'kwd', 'kwd-group', 'list',
                 'list-item', 'long-desc', 'media', 'milestone-end', 'milestone-start', 'mixed-citation',
                 'named-content', 'nlm-citation', 'note', 'notes', 'p', 'person-group', 'preformat',
                 'product', 'ref', 'ref-list', 'related-article', 'related-object', 'response', 'sec', 'sig',
                 'sig-block', 'source', 'speech', 'statement', 'sub-article', 'supplementary-material', 'table',
                 'table-wrap', 'table-wrap-group', 'tbody', 'td', 'term', 'tex-math', 'tfoot', 'th', 'thead',
                 'title', 'tr', 'trans-abstract', 'trans-source', 'trans-title', 'trans-title-group', 'verse-group',
                 'xref']

    document = NlmManipulate(self.gv).load_dom_tree()

    for tag_name in tag_names:
        self.debug.print_debug(self, u'Assigning ID to all {0} elements'.format(tag_name))
        for node in document.xpath(u'//{0}'.format(tag_name)):
            if 'id' in node.attrib:
                # Existing identifiers are never overwritten.
                continue
            node.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

    # Persist once, after all element names have been processed.
    document.write(self.gv.nlm_file_path)
    document.write(self.gv.nlm_temp_file_path)
示例2: prune
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def prune(self):
    """Remove the remaining bibliography stubs from the article.

    Every ``xref`` whose ``ref-type`` is ``bibr`` and whose ``rid`` is still
    the placeholder ``TO_LINK`` is passed to ``self.extract_contents``;
    afterwards the tree is saved through ``NlmManipulate.save_tree``.
    """
    self.debug.print_debug(self, u'Deleting all stubs from article')

    manipulate = NlmManipulate(self.gv)
    document = manipulate.load_dom_tree()

    unlinked_stubs = document.xpath('//xref[@ref-type="bibr" and @rid="TO_LINK"]')
    for stub in unlinked_stubs:
        self.extract_contents(stub)

    manipulate.save_tree(document)
示例3: process_database_references
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def process_database_references(self, db):
    """Replace plain-text references with citations found in ``db``.

    For every ``//back/ref-list/ref`` element whose text contains a year
    (19xx/20xx) or the marker ``n.d.``, candidate lookup keys are built from
    that marker followed by permutations of the comma-separated parts of the
    reference text. The first key present in ``db`` wins: its citation is
    parsed into a new element that replaces the old reference, and every
    element whose ``rid`` pointed at the old ``id`` is re-pointed at the new
    one (the SHA-256 hex digest of the key).

    :param db: mapping from lookup key to an object exposing
        ``object_type()``, ``title`` and ``get_citation()``.
    :return: ``(manipulate, master_tree)``; the tree is modified in place
        but not saved here.
    """
    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Compiled once: the pattern is the same for every reference.
    year_test = re.compile(r'((19|20)\d{2})|(n\.d\.)')

    for element in master_tree.xpath('//back/ref-list/ref'):
        text = manipulate.get_stripped_text(element)
        match = year_test.search(text)
        if not match:
            continue

        # group(0) is the whole match, i.e. the year *or* "n.d.". The
        # previous ``groups(0)[0]`` produced the integer 0 for "n.d." and
        # then failed with a TypeError when concatenated with the text.
        key_prefix = match.group(0)

        # strip out elements in brackets that might scupper parsing
        text = re.sub(r'(.+?)(\(.+?\))(.*)', r'\1\3', text)
        list_split = [part.strip() for part in text.split(',')]

        # Permutations grow factorially, so long references are not tried.
        if len(list_split) >= 10:
            continue

        # NOTE(review): range() stops before len(list_split), so
        # permutations using *all* parts (and single-part references) are
        # never tried. Kept as in the original -- confirm this is intended.
        candidates = itertools.chain.from_iterable(
            itertools.permutations(list_split, length)
            for length in range(1, len(list_split))
        )

        for permute in candidates:
            key = key_prefix + ''.join(permute).strip()
            if isinstance(key, unicode):
                key = key.encode("utf-16le")
            if key not in db:
                continue

            obj = db[key]
            print('Found {0} in database "{1}"'.format(obj.object_type(), obj.title))

            new_element = etree.fromstring(obj.get_citation())
            hex_dig = hashlib.sha256(key).hexdigest()
            new_element.attrib['id'] = hex_dig

            if 'id' in element.attrib:
                current_id = element.attrib['id']
                # Pass the id as an XPath variable. Interpolating it
                # unquoted made XPath read it as a child-element name, so
                # no referrer was ever matched.
                referrers = master_tree.xpath('//*[@rid=$rid]', rid=current_id)
                for link in referrers:
                    link.attrib['rid'] = hex_dig

            element.addnext(new_element)
            element.getparent().remove(element)
            # Only the first database hit is used for a reference.
            break

    return manipulate, master_tree
示例4: run_prompt
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_prompt(self):
    """Interactively walk through every bibliographic ``xref`` marker.

    Runs the non-interactive pass first (``self.run(False)``), then shows
    each ``xref[@ref-type="bibr"]`` to the user, asks what to do with it and
    forwards the choice to ``self.handle_input``. The tree is saved when the
    loop finishes or when ``handle_input`` returns ``"abort"``; after a
    ``"delall"`` result every remaining marker is answered with ``"d"``
    without prompting.
    """
    self.run(False)
    self.debug.print_debug(self, u"Entering interactive mode")

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath("//back/ref-list/ref")

    menu = (
        "Skip",
        "Delete",
        "deleTe all",
        "Enter search",
        "Ibid",
        "enter Link id",
        "skip Rest",
        "show Context",
    )

    # note that we don't want to exit even if there are no references to link because the user may want to delete
    # some
    delete_all = False

    for marker in tree.xpath('//xref[@ref-type="bibr"]'):
        marker_text = manipulate.get_stripped_text(marker)
        target_id = marker.get("rid")

        if target_id == "TO_LINK":
            prompt.print_(u"Found an unhandled reference marker: {0}".format(marker_text))
        elif target_id is not None:
            # First reference whose id equals the marker's rid, else None.
            linked = None
            for candidate in ref_items:
                if candidate.get("id") == target_id:
                    linked = candidate
                    break
            # NOTE(review): ``linked`` is None when no reference matches the
            # rid; it is passed on unchanged, exactly as before.
            linked_text = manipulate.get_stripped_text(linked)
            prompt.print_(
                u'Found a handled reference marker: "{0}" which links to "{1}"'.format(marker_text, linked_text)
            )

        choice = "d" if delete_all else prompt.input_options(menu)
        outcome = self.handle_input(manipulate, menu, marker, prompt, ref_items, choice, tree=tree)

        if outcome == "abort":
            manipulate.save_tree(tree)
            return
        if outcome == "delall":
            delete_all = True

    manipulate.save_tree(tree)
示例5: link_items
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def link_items(self, source_id, dest_id, manipulate=None, tree=None):
    """Link the ``xref`` with id ``source_id`` to the ``ref`` with id ``dest_id``.

    :param source_id: value of the ``id`` attribute of the ``xref`` element.
    :param dest_id: value of the ``id`` attribute of the ``ref`` element.
    :param manipulate: ``NlmManipulate`` to use; a new one is created from
        ``self.gv`` when omitted.
    :param tree: DOM tree to work on; loaded via ``manipulate`` when omitted.
    :raises IndexError: if either id matches no element.
    """
    self.debug.print_debug(self, u'Attempting to link XREF {0} to REF {1}'.format(source_id, dest_id))

    manipulate = NlmManipulate(self.gv) if manipulate is None else manipulate
    tree = manipulate.load_dom_tree() if tree is None else tree

    xref_query = '//xref[@id="{0}"]'.format(source_id)
    xref_node = tree.xpath(xref_query)[0]

    ref_query = '//ref[@id="{0}"]'.format(dest_id)
    ref_node = tree.xpath(ref_query)[0]

    linker = ReplaceObject(self.gv, xref_node, ref_node)
    linker.link()

    manipulate.save_tree(tree)
示例6: run_ext_link_compliance
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_ext_link_compliance(self):
    """Move every ``graphic`` nested in an ``ext-link`` out of that link.

    Each ``//ext-link/graphic`` is re-inserted into the ``ext-link``'s own
    parent at the position right after the ``ext-link``. The tree is then
    written to ``self.gv.nlm_file_path`` and ``self.gv.nlm_temp_file_path``.
    """
    self.debug.print_debug(self, u'Attempting to correct any mis-nested graphics elements')

    document = NlmManipulate(self.gv).load_dom_tree()

    for graphic in document.xpath('//ext-link/graphic'):
        ext_link = graphic.getparent()
        container = ext_link.getparent()
        slot = container.index(ext_link) + 1
        # Inserting an element that already has a parent moves it.
        container.insert(slot, graphic)

    document.write(self.gv.nlm_file_path)
    document.write(self.gv.nlm_temp_file_path)
示例7: run_prompt
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_prompt(self):
    """Prompt the user for an action on every bibliographic ``xref``.

    After the non-interactive pass (``self.run(False)``), each
    ``xref[@ref-type="bibr"]`` is announced below a green separator line and
    the user's selection is handed to ``self.handle_input``. An ``'abort'``
    result saves the tree and returns immediately; ``'delall'`` makes every
    later marker use the selection ``'d'`` without asking. The tree is saved
    once the loop is exhausted.
    """
    self.run(False)
    self.debug.print_debug(self, u'Entering interactive mode')

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath('//back/ref-list/ref')

    choices = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id',
               'skip Rest', 'show Context')
    rule = "-" * 80

    # note that we don't want to exit even if there are no references to link because the user may want to delete
    # some
    delete_all = False

    for xref in tree.xpath('//xref[@ref-type="bibr"]'):
        xref_text = manipulate.get_stripped_text(xref)
        prompt.print_(prompt.colorize('green', rule))

        rid = xref.get('rid')
        if rid == 'TO_LINK':
            prompt.print_(u'Found an unhandled reference marker: {0}'.format(xref_text))
        elif rid is not None:
            # NOTE(review): ``remote`` stays None when no reference carries
            # this rid; it is forwarded as-is, matching the old behaviour.
            remote = next((ref for ref in ref_items if ref.get('id') == rid), None)
            remote_text = manipulate.get_stripped_text(remote)
            message = u'Found a handled reference marker: "{0}" which links to "{1}"'
            prompt.print_(message.format(xref_text, remote_text))

        if delete_all:
            selection = 'd'
        else:
            selection = prompt.input_options(choices)

        result = self.handle_input(manipulate, choices, xref, prompt, ref_items, selection, tree=tree)

        if result == 'delall':
            delete_all = True
        elif result == 'abort':
            manipulate.save_tree(tree)
            return

    manipulate.save_tree(tree)
示例8: process_zotero
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def process_zotero(self):
    """Replace references with formatted entries found in a Zotero library.

    For each ``//back/ref-list/ref`` the stripped text is reduced to a
    search term (see ``build_search_term`` below) and sent to
    ``zotero.search``. While nothing is found and the term has more than two
    words, the last word is dropped and the search repeated. Only an
    unambiguous result (exactly one hit) is used: its ``JATS_format()``
    output is parsed and inserted after the old reference, followed by an
    XML comment holding the original text, and the old reference is removed.
    The tree is saved at the end.
    """
    from zotero import libzotero

    zotero = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)
    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Regex clean-ups applied first, in this order (order matters).
    leading_substitutions = (
        # 1-3 digit numbers that are not part of a longer digit run
        (re.compile(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])'), r''),
        # punctuation
        (re.compile(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]'), ''),
        # "<word> Edition" / "<word> edition"
        (re.compile(r'[^\s]+?\s[Ee]dition'), u' '),
        # a single character surrounded by whitespace
        (re.compile(r'\s.\s'), u' '),
        # a full stop directly after a capital letter
        (re.compile(r'(?<=[A-Z])\.'), u' '),
    )

    # Literal replacements, applied after the regexes above, in this order.
    literal_replacements = (
        (u'“', u''),
        (u'\'s', u''),
        (u'’s', u''),
        (u'’', u''),
        (u' Ed. ', u' '),
        (u' Ed ', u' '),
        (u' Trans. ', u' '),
        (u' Trans ', u' '),
        (u' trans ', u' '),
        (u' trans. ', u' '),
        (u' by. ', u' '),
        (u' by ', u' '),
        (u' ed. ', u' '),
        (u' ed ', u' '),
        (u' In ', u' '),
        (u' in ', u' '),
        (u' print ', u' '),
        (u' Print ', u' '),
        (u' and ', u' '),
        (u'”', u''),
    )

    # Final regex clean-ups; the last one collapses whitespace runs.
    trailing_substitutions = (
        (re.compile(r'[Aa]ccessed'), ''),
        (re.compile(r'meTypesetbr'), ''),
        (re.compile(r'\s+'), ' '),
    )

    def build_search_term(text):
        """Return ``text`` with the clean-ups above applied in sequence."""
        for pattern, replacement in leading_substitutions:
            text = pattern.sub(replacement, text)
        for old, new in literal_replacements:
            text = text.replace(old, new)
        for pattern, replacement in trailing_substitutions:
            text = pattern.sub(replacement, text)
        return text

    for element in master_tree.xpath('//back/ref-list/ref'):
        original_term = manipulate.get_stripped_text(element)
        term = build_search_term(original_term)

        results = zotero.search(term.strip())

        while len(results) == 0 and len(term.strip().split(' ')) > 2:
            # no results found.
            # begin iterating backwards
            term = ' '.join(term.strip().split(' ')[:-1])
            results = zotero.search(term.strip())

        if len(results) != 1:
            continue

        res = results[0].JATS_format()
        if res is None:
            continue

        ref = etree.fromstring(res)
        if 'id' in element.attrib:
            ref.attrib['id'] = element.attrib['id']

        element.addnext(ref)

        # lxml rejects comment text that contains "--" or ends with "-";
        # the original only removed "--", so a trailing dash still raised.
        comment_text = re.sub(u'--', u'', original_term).rstrip(u'-')
        ref.addnext(etree.Comment(comment_text))

        # Mark for removal; all marked elements are stripped in one go below.
        element.tag = 'REMOVE'

    etree.strip_elements(master_tree, 'REMOVE')

    manipulate.save_tree(master_tree)
示例9: run
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run(self, interactive):
if interactive:
self.run_prompt()
return
manipulate = NlmManipulate(self.gv)
tree = manipulate.load_dom_tree()
# pre-cleanup: remove all empty ext-links as these break the linker
items_to_clean = tree.xpath('//ext-link')
count = 0
for item in items_to_clean:
if '{http://www.w3.org/1999/xlink}href' in item.attrib and \
item.attrib['{http://www.w3.org/1999/xlink}href'] == '':
count += 1
item.tag = 'REMOVE'
etree.strip_tags(item.getparent(), 'REMOVE')
if count > 0:
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))
ref_items = tree.xpath('//back/ref-list/ref')
self.clean_ref_items(tree, ref_items, manipulate)
# handle numbered reference items
references_and_numbers = {}
for ref in ref_items:
text = manipulate.get_stripped_text(ref)
ref_match = re.compile('^(?P<number>\d+)\.*')
result = ref_match.match(text)
if result:
references_and_numbers[result.group('number')] = ref
parsed = self.process_ibid_authors(ref_items)
if parsed > 0:
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))
to_link = []
to_stub = []
square_bracket_count = {}
for p in tree.xpath('//sec//p[not(mml:math)] | //td',
namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):
text = manipulate.get_stripped_text(p)
reference_test = re.compile('\((?P<text>[^%]+?)\)')
matches = reference_test.finditer(text)
# exclude any square brackets with numbers inside
sub_match = re.compile('\[(?P<square>\d*[,\-;\d\s]*)\]')
smatch = sub_match.search(text)
if smatch:
smatches = sub_match.finditer(text)
for smatch in smatches:
self.debug.print_debug(self, u'Handling references in square '
u'brackets: [{0}] '.format(smatch.group('square')))
for item in re.split(';|,', smatch.group('square')):
if '-' in item:
parent, tail = manipulate.find_text(p, item)
if parent is not None:
new_string = ''
try:
split_range = item.strip().split('-')
for no in range(int(split_range[0]), int(split_range[1]) + 1):
new_string += str(no) + ','
except:
self.debug.print_debug(self, u'Unable to parse reference '
u'number in range {0}'.format(item))
break
if new_string.endswith(',') and not item.endswith(','):
new_string = new_string[0:len(new_string) - 1]
if tail and new_string != '':
parent.tail = parent.tail.replace(item, new_string)
elif not tail and new_string != '':
parent.text = parent.text.replace(item, new_string)
try:
split_range = item.strip().split('-')
for no in range(int(split_range[0]), int(split_range[1]) + 1):
self.debug.print_debug(self, u'Parsing reference '
u'number in range {0}'.format(str(no)))
#.........这里部分代码省略.........
示例10: run_graphics_sibling
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_graphics_sibling(self):
# images are hard to handle because Word/OO puts them in different places
# for instance, the caption can come before or after;
# <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
# orientation="portrait" xlink:type="simple"/>
self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')
manipulate = NlmManipulate(self.gv)
tree = manipulate.load_dom_tree()
graphics = tree.xpath('//graphic')
graphic_titles = []
graphic_ids = []
graphic_regex_dot = re.compile('^.+?\s*\d+\..+')
graphic_regex_colon = re.compile('^.+?\s*\d+\:.+')
separator = ':'
for graphic in graphics:
use_next = False
use_previous = False
# get the next sibling
p = graphic.getparent().getnext()
pprev = graphic.getparent().getprevious()
if p is not None and p.tag == 'p':
text = manipulate.get_stripped_text(p)
if graphic_regex_colon.match(text):
use_next = True
separator = ':'
elif graphic_regex_dot.match(text):
use_next = True
separator = '.'
if not use_next:
if pprev is not None and pprev.tag == 'p':
text = manipulate.get_stripped_text(pprev)
if graphic_regex_colon.match(text):
use_previous = True
separator = ':'
elif graphic_regex_dot.match(text):
use_previous = True
separator = '.'
if not use_next or use_previous:
# see if the title in this section potentially contains text we can match
parent = graphic.getparent()
while parent is not None and not parent.tag.endswith('sec'):
parent = parent.getparent()
if parent is not None:
titles = parent.xpath('title')
else:
titles = []
if len(titles) > 0:
p = titles[0]
text = manipulate.get_stripped_text(p)
if graphic_regex_colon.match(text):
use_next = True
separator = ':'
elif graphic_regex_dot.match(text):
use_next = True
separator = '.'
if use_next or use_previous:
if use_next:
text = manipulate.get_stripped_text(p)
else:
text = manipulate.get_stripped_text(pprev)
p = pprev
# likely this is a table identifier
split_title = text.split(separator)
title = split_title[0].strip()
caption = (''.join(split_title[1:])).strip()
self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))
title_element = None
# use an existing title element if one exists
try:
title_element = graphic.xpath('label')[0]
except:
title_element = etree.Element('label')
graphic.insert(0, title_element)
title_element.text = title
#.........这里部分代码省略.........
示例11: run_tables
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_tables(self):
self.debug.print_debug(self, u'Attempting to classify captions for table objects')
manipulate = NlmManipulate(self.gv)
tree = manipulate.load_dom_tree()
tables = tree.xpath('//table-wrap')
table_titles = []
table_ids = []
table_regex_dot = re.compile('^.+?[\s\-]*\d+\..+')
table_regex_colon = re.compile('^.+?[\s\-]*\d+\:.+')
separator = ':'
for table in tables:
use_next = False
use_previous = False
used_title = False
# get the next sibling
p = table.getnext()
pprev = table.getprevious()
old_title = None
if p is not None and p.tag == 'p':
cont = True
for sub in p:
if sub.tag == 'graphic':
cont = False
if cont:
text = manipulate.get_stripped_text(p)
if table_regex_colon.match(text):
use_next = True
separator = ':'
elif table_regex_dot.match(text):
use_next = True
separator = '.'
if not use_next:
cont = True
for sub in pprev:
if sub.tag == 'graphic':
cont = False
if cont:
if pprev is not None and pprev.tag == 'p':
text = manipulate.get_stripped_text(pprev)
if table_regex_colon.match(text):
use_previous = True
separator = ':'
elif table_regex_dot.match(text):
use_previous = True
separator = '.'
if not use_next or use_previous:
# see if the title in this section potentially contains text we can match
parent = table.getparent()
titles = parent.xpath('title')
if len(titles) > 0:
p = titles[0]
text = manipulate.get_stripped_text(p)
if table_regex_colon.match(text):
use_next = True
separator = ':'
used_title = True
elif table_regex_dot.match(text):
use_next = True
separator = '.'
used_title = True
if use_next or use_previous:
if use_next:
text = manipulate.get_stripped_text(p)
else:
text = manipulate.get_stripped_text(pprev)
p = pprev
# likely this is a table identifier
split_title = text.split(separator)
title = split_title[0]
caption = (''.join(split_title[1:])).strip()
# strip all formatting from caption for ease of parsing
# TODO: preserve formatting (far harder)
new_p = etree.Element('p')
new_p.text = caption
if p.tag.endswith('title'):
new_title = etree.Element('title')
new_title.text = ''
#.........这里部分代码省略.........
示例12: run_tables
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_tables(self):
self.debug.print_debug(self, u'Attempting to classify captions for table objects')
manipulate = NlmManipulate(self.gv)
tree = manipulate.load_dom_tree()
tables = tree.xpath('//table-wrap')
table_titles = []
table_ids = []
table_regex_dot = re.compile('^.+?[\s\-]*\d+\..+')
table_regex_colon = re.compile('^.+?[\s\-]*\d+\:.+')
separator = ':'
for table in tables:
caption_element = None
use_next = False
use_previous = False
used_title = False
# get the next sibling
p = table.getnext()
pprev = table.getprevious()
old_title = None
if p is not None and p.tag == 'p':
cont = True
for sub in p:
if sub.tag == 'graphic':
cont = False
if cont:
text = manipulate.get_stripped_text(p)
if table_regex_colon.match(text):
use_next = True
separator = ':'
elif table_regex_dot.match(text):
use_next = True
separator = '.'
if not use_next:
cont = True
for sub in pprev:
if sub.tag == 'graphic':
cont = False
if cont:
if pprev is not None and pprev.tag == 'p':
text = manipulate.get_stripped_text(pprev)
if table_regex_colon.match(text):
use_previous = True
separator = ':'
elif table_regex_dot.match(text):
use_previous = True
separator = '.'
if not use_next or use_previous:
# see if the title in this section potentially contains text we can match
parent = table.getparent()
titles = parent.xpath('title')
if len(titles) > 0:
p = titles[0]
text = manipulate.get_stripped_text(p)
if table_regex_colon.match(text):
use_next = True
separator = ':'
used_title = True
elif table_regex_dot.match(text):
use_next = True
separator = '.'
used_title = True
if use_next or use_previous:
if use_next:
text = manipulate.get_stripped_text(p)
else:
text = manipulate.get_stripped_text(pprev)
p = pprev
# likely this is a table identifier
split_title = text.split(separator)
title = split_title[0].strip()
caption = (''.join(split_title[1:])).strip()
# strip all formatting from caption for ease of parsing
# TODO: preserve formatting (far harder)
new_p = etree.Element('p')
new_p.text = caption
if p.tag.endswith('title'):
new_title = etree.Element('title')
#.........这里部分代码省略.........
示例13: run
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
#.........这里部分代码省略.........
"article-title",
"attrib",
"author-comment",
"author-notes",
"award-group",
"bio",
"boxed-text",
"caption",
"chem-struct",
"chem-struct-wrap",
"col",
"colgroup",
"collab",
"compound-kwd",
"contrib",
"contrib-group",
"corresp",
"custom-meta",
"def",
"def-item",
"def-list",
"disp-formula",
"disp-formula-group",
"disp-quote",
"element-citation",
"ext-link",
"fig",
"fig-group",
"fn",
"fn-group",
"funding-source",
"glossary",
"glyph-data",
"graphic",
"inline-formula",
"inline-graphic",
"inline-supplementary-material",
"institution",
"kwd",
"kwd-group",
"list",
"list-item",
"long-desc",
"media",
"milestone-end",
"milestone-start",
"mixed-citation",
"named-content",
"nlm-citation",
"note",
"notes",
"p",
"person-group",
"preformat",
"product",
"ref",
"ref-list",
"related-article",
"related-object",
"response",
"sec",
"sig",
"sig-block",
"source",
"speech",
"statement",
"sub-article",
"supplementary-material",
"table",
"table-wrap",
"table-wrap-group",
"tbody",
"td",
"term",
"tex-math",
"tfoot",
"th",
"thead",
"title",
"tr",
"trans-abstract",
"trans-source",
"trans-title",
"trans-title-group",
"verse-group",
"xref",
]
manipulate = NlmManipulate(self.gv)
tree = manipulate.load_dom_tree()
for element in elements:
self.debug.print_debug(self, u"Assigning ID to all {0} elements".format(element))
for item in tree.xpath(u"//{0}".format(element)):
if not "id" in item.attrib:
item.attrib["id"] = u"ID{0}".format(unicode(uuid.uuid4()))
tree.write(self.gv.nlm_file_path)
tree.write(self.gv.nlm_temp_file_path)
示例14: run_graphics
# 需要导入模块: from nlmmanipulate import NlmManipulate [as 别名]
# 或者: from nlmmanipulate.NlmManipulate import load_dom_tree [as 别名]
def run_graphics(self):
    """Turn "<title><sep><caption>" paragraph text into graphic labels/captions.

    images are hard to handle because Word/OO puts them in different places
    for instance, the caption can come before or after;
    <p>Figure 1: Martin Eve at the pub<graphic xlink:href="media/image1.jpeg" position="float"
    orientation="portrait" xlink:type="simple"/>

    For every ``graphic`` whose parent is a ``p``, the parent's stripped
    text is tested against a "<text><number>:<text>" pattern and then a
    "<text><number>.<text>" pattern. On a match the text is split at the
    first separator into title and caption; the title goes into the
    graphic's ``label`` child (created if absent) and the caption into a new
    ``caption/p`` child. The graphic gets an ``id`` if it has none, the
    collected ids/titles are passed to ``self.link``, the tree is written to
    both NLM file paths, and ``self.run_graphics_sibling`` is run.
    """
    self.debug.print_debug(self, u"Attempting to classify captions for graphics objects [plain]")

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    graphic_titles = []
    graphic_ids = []

    graphic_regex_dot = re.compile(r"^.+?\s*\d+\..+")
    graphic_regex_colon = re.compile(r"^.+?\s*\d+\:.+")

    for graphic in tree.xpath("//graphic"):
        # The caption candidate is the paragraph that contains the graphic.
        parent = graphic.getparent()
        if parent is None or parent.tag != "p":
            continue

        text = manipulate.get_stripped_text(parent)

        # The colon form takes precedence over the dot form.
        if graphic_regex_colon.match(text):
            separator = ":"
        elif graphic_regex_dot.match(text):
            separator = "."
        else:
            continue

        # Split at the FIRST separator only. The previous
        # ``''.join(text.split(separator)[1:])`` dropped every later
        # separator from the caption (e.g. all full stops between sentences).
        title, _, caption = text.partition(separator)
        title = title.strip()
        caption = caption.strip()

        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        # use an existing title element if one exists
        labels = graphic.xpath("label")
        if labels:
            title_element = labels[0]
        else:
            title_element = etree.Element("label")
            graphic.insert(0, title_element)

        title_element.text = title

        caption_element = etree.Element("caption")
        new_p = etree.Element("p")
        new_p.text = caption

        NlmManipulate.append_safe(caption_element, new_p, self)
        NlmManipulate.append_safe(graphic, caption_element, self)

        if graphic.tail:
            # Remove the text that now lives in label/caption from the
            # text following the graphic.
            graphic.tail = graphic.tail.replace(title + separator, "")
            graphic.tail = graphic.tail.replace(caption + separator, "")
            graphic.tail = graphic.tail.replace(caption, "")

        if "id" not in graphic.attrib:
            graphic.attrib["id"] = u"ID{0}".format(unicode(uuid.uuid4()))

        graphic_titles.append(title)
        graphic_ids.append(graphic.attrib["id"])

    paragraphs = tree.xpath("//p")

    self.link(graphic_ids, graphic_titles, paragraphs, "fig")

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)

    self.run_graphics_sibling()