本文整理汇总了Python中bs4.NavigableString类的典型用法代码示例。如果您正苦于以下问题：Python bs4.NavigableString的具体用法？Python bs4.NavigableString怎么用？Python bs4.NavigableString使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块bs4的用法示例。
在下文中一共展示了bs4.NavigableString类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: print_content
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def print_content(contents):
    """Print a readable text rendering of a sequence of parsed nodes.

    String nodes are printed with newlines removed and surrounding
    whitespace stripped. For every other node a marker may be printed
    based on the node's ``name`` and then the node's ``contents`` are
    rendered recursively; ``noscript`` nodes are skipped entirely.

    Args:
        contents: Iterable of nodes; non-string nodes must expose
            ``name`` and ``contents``.
    """
    for content in contents:
        if isinstance(content, NavigableString):
            print(str(content).replace("\n", "").strip())
            continue

        name = content.name
        if name == "img":
            # Placeholder literal (Chinese for "[image]") printed in place
            # of the image itself.
            print("[图片]")
        elif name == "br":
            print("")
        elif name == "noscript":
            continue
        elif name == "li":
            # Keep the bullet on the same line as the item text that the
            # recursive call prints next.
            print("•", end=" ")
        print_content(content.contents)
示例2: normalize_text_sections
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def normalize_text_sections(div):
    """Flatten the direct children of *div* into a single line of text.

    Each child contributes its stripped text followed by one space:
    plain ``NavigableString`` children contribute themselves, ``Comment``
    children contribute nothing, and every other child contributes its
    ``.text``. In the joined result carriage returns are removed and
    newlines become ``', '``.

    Args:
        div: Node exposing ``contents``.

    Returns:
        The normalized paragraph string.
    """
    pieces = []
    for node in div.contents:
        node_type = type(node)
        # Exact type comparison is deliberate: only plain strings and
        # comments get special treatment, everything else goes via .text.
        if node_type == NavigableString:
            raw = node
        elif node_type == Comment:
            raw = ''
        else:
            raw = node.text
        pieces.append(raw.strip() + ' ')

    joined = ''.join(pieces).strip()
    return joined.replace('\r', '').replace('\n', ', ').strip()
示例3: normalize_text_sections
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def normalize_text_sections(div):
    """Collapse the direct children of *div* into one normalized string.

    Plain ``NavigableString`` children are used as-is, ``Comment``
    children are ignored, and any other child is represented by its
    ``.text``. Every piece is stripped and followed by a space; the joined
    text then has ``'\\r'`` removed and ``'\\n'`` replaced by ``', '``.

    Args:
        div: Node exposing ``contents``.

    Returns:
        The normalized paragraph string.
    """
    def _raw_text(node):
        # Exact type checks on purpose: subclasses fall through to .text.
        kind = type(node)
        if kind == NavigableString:
            return node
        if kind == Comment:
            return ''
        return node.text

    paragraph = ''.join(
        _raw_text(node).strip() + ' ' for node in div.contents
    ).strip()

    # NOTE(review): as it appears here, the last pair replaces a single
    # space with a single space (a no-op); it may originally have collapsed
    # double or non-breaking spaces -- confirm against the upstream source.
    for old, new in (('\r', ''), ('\n', ', '), (' ', ' ')):
        paragraph = paragraph.replace(old, new)
    return paragraph.strip()
示例4: __clear
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def __clear(parent_node,config):
    """Recursively build the text content found under *parent_node*.

    A ``NavigableString`` is returned through its ``.string`` attribute.
    For any other node the result starts with a newline when the node's
    name is listed in ``line_elements``; children named ``table`` are
    rendered by ``parse_table`` and all other children by recursing into
    this function.

    Args:
        parent_node: Node to render.
        config: Passed through unchanged to ``parse_table`` and to the
            recursive calls.

    Returns:
        The concatenated text.
    """
    if isinstance(parent_node, NavigableString):
        return parent_node.string

    content = "\n" if parent_node.name in line_elements else ""
    for child in parent_node.contents:
        render = parse_table if child.name == "table" else __clear
        content += render(child, config)
    return content
示例5: process_tag
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def process_tag(node):
    """Convert *node* and its children to text.

    String children are appended verbatim; other children are converted
    recursively unless *node* itself is a ``table``. Afterwards, if a
    module-level function named ``convert_<tagname>`` (lower-cased tag
    name) exists, it is called as ``convert_fn(node, text)`` and its
    return value replaces the collected text.

    Args:
        node: Node exposing ``children`` and ``name``.

    Returns:
        The converted text for *node*.
    """
    # Tags whose non-string children are not descended into.
    exceptions = ('table',)

    parts = []
    for element in node.children:
        if isinstance(element, NavigableString):
            parts.append(element)
        elif node.name not in exceptions:
            parts.append(process_tag(element))
    text = ''.join(parts)

    # Look the converter up without a try/except so that a KeyError raised
    # *inside* a converter is not mistaken for "no converter defined" and
    # silently discarded.
    convert_fn = globals().get("convert_%s" % node.name.lower())
    if convert_fn is not None:
        text = convert_fn(node, text)
    return text
示例6: get_children
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def get_children(descendants, parsed):
    """Split the children of *descendants* into parsed text and pending nodes.

    Children whose ``name`` is ``None`` that appear before the first named
    child are appended to *parsed*. The first named child and everything
    after it are collected into a deque, which is returned in reverse
    order. If *descendants* is itself a ``NavigableString`` it is appended
    to *parsed* and an empty deque is returned.

    Args:
        descendants: A node exposing ``children``, a ``NavigableString``,
            or ``None``.
        parsed: List that receives the leading name-less children
            (mutated in place).

    Returns:
        A ``deque`` of the remaining children, last child first; empty
        when *descendants* is ``None`` or a string.
    """
    pending = deque()
    if descendants is None:
        return pending
    if isinstance(descendants, NavigableString):
        parsed.append(descendants)
        return pending

    seen_named_child = False
    for child in descendants.children:
        if child.name is None and not seen_named_child:
            parsed.append(child)
            continue
        if child.name is not None:
            seen_named_child = True
        pending.append(child)

    pending.reverse()
    return pending
示例7: soup_strings
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def soup_strings(soup):
    """Yield whitespace-normalized text chunks from *soup* in document order.

    A tag whose name is in the paragraph set and which contains no nested
    paragraph-like tag is emitted as one chunk built from its full text;
    its descendants are then skipped. Other tags yield nothing themselves.
    Remaining string nodes are yielded individually, except comments.
    Empty chunks are never yielded.

    Args:
        soup: Node exposing ``descendants``.

    Yields:
        Non-empty strings with runs of whitespace collapsed to one space.
    """
    paragraph_tags = {
        "caption", "details", "h1", "h2", "h3", "h4", "h5",
        "h6", "li", "p", "td", "div", "span",
    }

    def _squash(raw):
        # Collapse every whitespace run into a single space.
        return " ".join(raw.split())

    # Descendants of the most recently emitted paragraph tag, or None.
    already_emitted = None
    for node in soup.descendants:
        if already_emitted is not None:
            try:
                covered = node in already_emitted
            except RecursionError:
                # The membership test relies on BeautifulSoup __eq__ checks,
                # which can recurse without bound; treat that as covered.
                covered = True
            if covered:
                continue
            already_emitted = None

        if isinstance(node, bs4.Tag):
            # Only paragraph-like tags with no nested paragraph-like tags
            # are treated as one contiguous chunk (inline tags such as
            # <a> or <b> inside them are flattened into the text).
            if node.name in paragraph_tags and not node.find_all(paragraph_tags):
                already_emitted = list(node.descendants)
                chunk = _squash(node.get_text(" ", strip=True))
                if chunk:
                    yield chunk
            continue

        if (isinstance(node, bs4.NavigableString)
                and not isinstance(node, bs4.Comment)):
            chunk = _squash(node.strip())
            if chunk:
                yield chunk
示例8: apply_correction_map
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def apply_correction_map(soup, tag, cor_map):
    """Apply character corrections to every string node beneath *tag*.

    Each mapping in *cor_map* is applied in order; every key found in a
    string is replaced by its value. A string node is swapped for a new
    string created via ``soup.new_string`` only when its text changed.

    Args:
        soup: Object providing ``new_string``.
        tag: Node whose ``descendants`` are scanned.
        cor_map: Iterable of dict-like objects mapping bad substrings to
            their replacements.
    """
    # Snapshot the descendants first because nodes are replaced below.
    for node in list(tag.descendants):
        if not isinstance(node, bs4.NavigableString):
            continue

        original = str(node)
        corrected = original
        for fontset in cor_map:
            for bad, good in fontset.items():
                if bad in corrected:
                    corrected = corrected.replace(bad, good)

        if corrected != original:
            node.replace_with(soup.new_string(corrected))
示例9: get_text_lines
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def get_text_lines(parent_node):
    """Split the text of *parent_node*'s direct children into lines at ``br``.

    String children and the ``.text`` of other children are appended to
    the current line; each child named ``br`` starts a new line.

    Args:
        parent_node: Node exposing ``children``.

    Returns:
        A list of line strings; always contains at least one entry.
    """
    finished = []
    current = ''
    for node in parent_node.children:
        if isinstance(node, bs4.NavigableString):
            current += str(node)
        elif node.name == 'br':
            finished.append(current)
            current = ''
        else:
            current += node.text
    finished.append(current)
    return finished
示例10: clean_node
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def clean_node(self, doc, node):
    """Clean *node* in place by dispatching on its BeautifulSoup type.

    ``NavigableString`` nodes go to ``clean_string_node``, ``Tag`` nodes to
    ``clean_tag_node`` and anything else to ``clean_unknown_node``.
    """
    if isinstance(node, NavigableString):
        cleaner = self.clean_string_node
    elif isinstance(node, Tag):
        cleaner = self.clean_tag_node
    # Fallback for a BeautifulSoup object that inherits from neither
    # NavigableString nor Tag. No example of such an object is known, so
    # this branch is untested.
    else:  # pragma: no cover
        cleaner = self.clean_unknown_node
    cleaner(doc, node)
示例11: make_catena_input
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def make_catena_input(src, dest):
    """Rewrite the XML document at *src* and save it under *dest*.

    The document is parsed as XML, then:
      * an empty ``TITLE`` tag is inserted after the ``DCT`` element;
      * a ``TIMEX3`` creation-time tag (``tid="t0"``, empty ``value``) is
        appended inside ``DCT``;
      * every ``event`` element is replaced by an ``EVENT`` element with
        the same attributes and the original text;
      * every ``TLINK`` element is removed.

    The result is written to ``dest + <basename of src> + '.tml'``.

    Args:
        src: Path of the input file; the part after the last ``/`` is used
            as the output base name.
        dest: Output directory prefix. It is concatenated directly with
            the file name, so it should end with a path separator.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(src) as source_file:
        text = source_file.read()

    soup = BeautifulSoup(text, 'xml')
    dct = soup.find('DCT')
    dct.insert_after(soup.new_tag('TITLE'))
    dct.append(soup.new_tag('TIMEX3', functionInDocument="CREATION_TIME", temporalFunction="false", tid="t0", type="DATE", value=""))

    for e in soup.find_all('event'):
        new_e = soup.new_tag('EVENT', **e.attrs)
        new_e.insert(0, NavigableString(e.get_text()))
        e.replace_with(new_e)

    for tlink in soup('TLINK'):
        tlink.extract()

    # Use the ``dest`` argument; the original read a module-level
    # ``args.dest`` and silently ignored this parameter.
    with open(dest + src.split('/')[-1] + '.tml', 'w') as f:
        f.write(str(soup))
示例12: is_navigable_string
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def is_navigable_string(obj):
    """Return ``True`` when *obj* is an instance of ``bs4.NavigableString``."""
    string_type = bs4.NavigableString
    return isinstance(obj, string_type)
示例13: _html2text
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def _html2text(elem):
    """Apply the registered element replacers beneath *elem*, children first.

    Every ``Tag`` child is processed recursively; string children are left
    untouched. Afterwards, if *elem* has a parent and its name is a key of
    ``_ELEMENT_REPLACER``, the registered callable is invoked on *elem*.
    """
    for child in elem.children:
        # Only tags can need rewriting; strings require no changes.
        if isinstance(child, Tag):
            _html2text(child)

    if elem.parent and elem.name in _ELEMENT_REPLACER:
        replacer = _ELEMENT_REPLACER[elem.name]
        replacer(elem)
示例14: wrap_elem_content
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def wrap_elem_content(elem, begin, end):
    """Surround the content of *elem* with the strings *begin* and *end*.

    *begin* is inserted as the first child and *end* appended as the last
    child, each wrapped in a ``NavigableString``.
    """
    leading = NavigableString(begin)
    elem.insert(0, leading)
    trailing = NavigableString(end)
    elem.append(trailing)
示例15: _insert_anchor
# 需要导入模块: import bs4 [as 别名]
# 或者: from bs4 import NavigableString [as 别名]
def _insert_anchor(el, anchor_id, prefix="xxanchor"):
    """Insert a textual anchor marker as the first child of *el*.

    The marker has the form ``' <prefix>-<anchor_id> '``, with one space
    on each side.
    """
    marker = NavigableString(f' {prefix}-{anchor_id} ')
    el.insert(0, marker)