本文整理汇总了Python中html.parser.HTMLParser.replace方法的典型用法代码示例。如果您正苦于以下问题:Python HTMLParser.replace方法的具体用法?Python HTMLParser.replace怎么用?Python HTMLParser.replace使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类html.parser.HTMLParser的用法示例。(注意:HTMLParser 本身并没有 replace 方法;下面示例中的 replace 实际上是 str.replace,作用于 HTMLParser().unescape() 返回的字符串。)
在下文中一共展示了HTMLParser.replace方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: normalize
# 需要导入模块: from html.parser import HTMLParser [as 别名]
# 或者: from html.parser.HTMLParser import replace [as 别名]
def normalize(text, emoticon=False, repeat=None):
    """Normalize a piece of text before further processing.

    Steps, in order: decode HTML entities, turn carriage returns into
    newlines, optionally strip emoticons, run ``jaconv.h2z``, collapse a
    few elongated "よ" spellings, and optionally shorten repeated runs.

    Args:
        text: The input string.
        emoticon: Emoticons are removed only when this is exactly ``False``
            (the default); any other value leaves them in place.
        repeat: When truthy, passed to ``shorten_repeat`` as its second
            argument. NOTE(review): presumably the maximum allowed run
            length -- confirm against ``shorten_repeat``.

    Returns:
        The normalized string.
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Imported locally so this function does not
    # depend on the module's import block.
    from html import unescape

    text = unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
    # NOTE(review): presumably half-width -> full-width conversion; confirm
    # against the jaconv documentation.
    text = jaconv.h2z(text)
    # Each replacement is applied twice on purpose: a single pass over a
    # doubled trailing vowel leaves one vowel behind, which the second pass
    # then removes.
    text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
    text = text.replace('よお', 'よ').replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
示例2: get_fingerprint
# 需要导入模块: from html.parser import HTMLParser [as 别名]
# 或者: from html.parser.HTMLParser import replace [as 别名]
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is stripped of HTML entities and tags, bracketed annotations,
    dates, release counters, alternative titles after " /", punctuation,
    digits, a fixed list of Russian release keywords and month names. The
    result is lowercase, contains only Latin/Cyrillic letters and single
    spaces, and may be an empty string.

    Args:
        torrent_name: The raw torrent title.

    Returns:
        The fingerprint string.
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Imported locally so this function does not
    # depend on the module's import block.
    from html import unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each pattern removes one occurrence per pass, so repeat until a full
    # pass leaves the name unchanged.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Drop everything after the first whitespace-preceded slash.
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)

    # The quoted title must end at the first closing double quote. The
    # previous pattern excluded "»" here (copied from the guillemet case),
    # which let the match run up to the last double quote in the name.
    torrent_name = re.sub(
        r'^"([^"]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    allowed_characters = set(" " + characters)
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in allowed_characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): every "." has already been removed above, so this
    # replacement can never match. Kept as-is to preserve existing
    # fingerprints; decide whether a standalone "г" should be dropped.
    torrent_name = torrent_name.replace("г.", "")

    # Matches consume their surrounding whitespace, so adjacent keywords
    # that share a space need more than one pass.
    while True:
        new_torrent_name = re.sub(r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)", "", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Longer (genitive) forms go first: replacing "март" before "марта", or
    # "август" before "августа", would leave a stray "а" behind.
    for month in (
        "января", "январь",
        "февраля", "февраль",
        "марта", "март",
        "апреля", "апрель",
        "мая", "май",
        "июня", "июнь",
        "июля", "июль",
        "августа", "август",
        "сентября", "сентябрь",
        "октября", "октябрь",
        "ноября", "ноябрь",
        "декабря", "декабрь",
    ):
        torrent_name = torrent_name.replace(month, "")
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()