本文整理匯總了Python中bs4.UnicodeDammit.replace方法的典型用法代碼示例。如果您正苦於以下問題:Python UnicodeDammit.replace方法的具體用法?Python UnicodeDammit.replace怎麽用?Python UnicodeDammit.replace使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類bs4.UnicodeDammit
的用法示例。
在下文中一共展示了UnicodeDammit.replace方法的3個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: remove_evernote_link
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
def remove_evernote_link(link, html):
    """Remove an Evernote link and the container markup it leaves empty.

    Parameters:
        link: object whose ``WholeRegexMatch`` attribute holds the raw link
              markup to strip (project type -- TODO confirm shape).
        html: str or bytes; coerced to unicode via bs4's UnicodeDammit.

    Returns the cleaned unicode HTML string.
    """
    html = UnicodeDammit(html, ["utf-8"], is_html=True).unicode_markup
    # Green " | " separator Evernote inserts between links.
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r"[^<]*"
    # {0} = tag name, {1} = inner pattern: matches "<tag ...>..inner..</tag>".
    regex_replace = r"<{0}[^>]*>[^<]*{1}[^<]*</{0}>"
    # Remove the link markup itself.
    html = html.replace(link.WholeRegexMatch, "")
    # Remove the now-empty <li> wrapper.
    html = re.sub(regex_replace.format("li", no_start_tag_regex), "", html)
    # Remove a dangling separator left next to an empty <span>.
    regex_span = (regex_replace.format("span", no_start_tag_regex)
                  + no_start_tag_regex + sep_regex)
    html = re.sub(regex_span, "", html)
    # Collapse a doubled separator into a single one.
    # BUG FIX: the replacement must be the literal separator `sep`, not its
    # regex-escaped form `sep_regex` -- backslashes in a re.sub replacement
    # are interpreted as escape sequences (e.g. '\|' raises "bad escape"),
    # and would otherwise leave stray backslashes in the output HTML.
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, sep, html)
    return html
示例2: getContent
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
def getContent(soup, source=''):
    """Extract the main article text from a parsed HTML tree.

    Heuristic: score each element by how many paragraph-like children it
    contains; elements scoring >= 3 with enough text are kept, otherwise
    the longest lower-scoring candidate is used as a fallback.

    Parameters:
        soup: a BeautifulSoup document (mutated in place by the cleaners).
        source: site identifier passed to the source-specific cleaner.

    Returns the extracted text as a unicode string, with smart quotes
    converted to ASCII and blank lines collapsed.
    """
    newContent = []
    # Cleaning phase (both helpers mutate `soup` in place).
    genericCleaning(soup)
    sourceSpecificcleaning(soup, source)
    # Finding content in the tree.
    bestElem = None
    bestText = ''
    for el in soup.findAll(True):
        score = 0.0
        hasTitle = False
        # Top-level headings/paragraphs get a strong head start.
        if el.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'] \
                and el.parent.name == '[document]':
            score += 3
        for c in el:
            if c.name == 'br':  # business insider style
                score += 0.5
            if c.name == 'p':
                score += 1.0
            # Count at most one heading per element.
            if not hasTitle and c.name in ['h1', 'h2', 'h3', 'h4', 'h5',
                                           'h6', 'h7']:
                score += 1.0
                hasTitle = True
        if score >= 3.0:  # at least 3 paragraphs
            textOutput = getText(el)
            # We need at least 20 characters per counted container.
            if float(len(textOutput)) / score > 20.0:
                newContent.append(textOutput)
        elif score >= 1.0:
            # Track the longest lower-scoring candidate as a fallback.
            # (Simplified: the original duplicated this logic across two
            # branches, one of which re-tested `bestElem is None` in a
            # position where it could never be True.)
            candidate = getText(el, False)
            if bestElem is None or len(candidate) > len(bestText):
                bestElem = el
                bestText = candidate
    # Nothing scored >= 3: fall back to the best >= 1 candidate, if any.
    if len(newContent) == 0 and bestElem is not None:
        newContent.append(bestText)
    finalText = UnicodeDammit(u'\n'.join(newContent),
                              smart_quotes_to='ascii').unicode_markup
    return finalText.replace('\n\n', '\n')
示例3: urlparse
# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
# (or default to 'config.ini')
parser = argparse.ArgumentParser(description='Create players for past '
'DF Game Club sessions')
parser.add_argument('config', nargs='?', default='config.ini',
help='configuration file with past session data')
namespace = parser.parse_args()
# Read configuration file
conf = configparser.ConfigParser(interpolation=None)
conf.read(namespace.config)
for section in conf.sections():
pastebin_url = PASTEBIN_RAW_PREFIX + \
urlparse(conf[section]["pastebin_url"]).path.strip('/')
log = UnicodeDammit(urlopen(pastebin_url).read()).unicode_markup
log = log.replace('\r', '').split('\n')
regex = re.compile(conf[section]['regex'])
timestamp_format = conf[section]['timestamp_format']
video_timestamp = conf[section]['video_timestamp']
twitch_url = urlparse(conf[section]['twitch_url'])\
.path.strip('/').split('/')
channel, archive_id = twitch_url[0], twitch_url[2]
ignore_lines = [
int(num) for num in [
item for item in
conf[section]['ignore_lines'].replace(' ','').split(',')
if item != ''
]