本文整理汇总了Python中html.parser.HTMLParser.strip方法的典型用法代码示例。如果您正苦于以下问题:Python HTMLParser.strip方法的具体用法?Python HTMLParser.strip怎么用?Python HTMLParser.strip使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类html.parser.HTMLParser的用法示例。
在下文中一共展示了HTMLParser.strip方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: transform_title
# 需要导入模块: from html.parser import HTMLParser [as 别名]
# 或者: from html.parser.HTMLParser import strip [as 别名]
def transform_title(title, site):
    """Apply miscellaneous title transformations.

    Unescapes HTML entities, strips surrounding whitespace and then hands the
    result to ``parse_fancy_titles`` together with ``site``.

    Args:
        title: Raw title text, possibly containing HTML entities.
        site: Passed through unchanged to ``parse_fancy_titles``.

    Returns:
        Whatever ``parse_fancy_titles`` returns for the cleaned title.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement and performs the same entity decoding.
    from html import unescape

    title = unescape(title).strip()
    return parse_fancy_titles(title, site)
示例2: zeroclick
# 需要导入模块: from html.parser import HTMLParser [as 别名]
# 或者: from html.parser.HTMLParser import strip [as 别名]
def zeroclick(irc, source, msgtarget, args):
    """Look up ``args[0]`` on DuckDuckGo Lite and send the answer to IRC.

    The last table cell matched in the response is unescaped, stripped of
    markup and collapsed to single spaces before being sent.

    Args:
        irc: Object exposing ``msg(target, text)``.
        source: Originator of the command (currently unused).
        msgtarget: Target passed to ``irc.msg`` for the reply.
        args: Sequence whose first element is the search query.
    """
    # HTMLParser.unescape() was removed in Python 3.9; use html.unescape().
    import html

    params = {"q": args[0]}
    url = "http://duckduckgo.com/lite/?"
    data = requests.get(url, params=params).content.decode()

    # Raw string avoids invalid-escape warnings for "\s"; "\t" is now
    # interpreted by the regex engine and still matches a literal tab.
    search = re.findall(r"\t<td>.\t\s+(.*?).</td>", data, re.M | re.DOTALL)
    if not search:
        irc.msg(msgtarget, "No results found.")
        return

    # Only the last matched cell is used as the answer.
    raw = search[-1].replace("<br>", " ").replace("<code>", " ").replace("</code>", " ")
    answer = html.unescape(raw)
    # Replace any remaining tags with a space, then collapse whitespace.
    answer = re.sub(r"<[^<]+?>", " ", answer)
    out = re.sub(r"\s+", " ", answer.strip())
    if not out:
        irc.msg(msgtarget, "No results")
        return

    # Keep the text before " More at" and after the last "}".
    reply = out.split(" More at")[0].split("}")[-1].strip()
    irc.msg(msgtarget, reply)
示例3: get_fingerprint
# 需要导入模块: from html.parser import HTMLParser [as 别名]
# 或者: from html.parser.HTMLParser import strip [as 别名]
def get_fingerprint(torrent_name):
    """Build a fingerprint that identifies the group (TV show) of a torrent.

    The name is HTML-unescaped, stripped of simple tags, bracketed notes,
    dates and release counters, lowercased, cut at the first sentence-like
    delimiter, and finally reduced to Latin/Cyrillic letters separated by
    single spaces.

    Args:
        torrent_name: Raw torrent title; may contain HTML entities and tags.

    Returns:
        The lowercase fingerprint string (possibly empty).
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    import html

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")
    # Unescape HTML entities
    torrent_name = html.unescape(torrent_name)
    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each regex removes at most one occurrence per pass, so repeat until a
    # full pass leaves the name unchanged.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name
        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Cut everything from the first whitespace-preceded slash onwards.
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    # NOTE(review): the class below excludes "»" rather than '"'; it looks
    # copied from the pattern above -- confirm before changing it.
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)
    # Keep only the shortest prefix (6+ characters) that ends with a
    # delimiter such as ":", ".", " - " or "|".
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop the "г." abbreviation. This must run while dots are still present;
    # it used to run after they were removed and therefore never matched.
    # The lookbehind keeps words that merely end in "г." intact.
    torrent_name = re.sub(r"(?<![a-zа-яё])г\.", "", torrent_name)

    # Drop all punctuation and other non-alphabet characters
    allowed = set(" abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя")
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Longer forms go first: "март"/"август" are prefixes of
    # "марта"/"августа", so removing the short form first would leave a
    # stray "а" behind.
    for month in (
        "января", "январь",
        "февраля", "февраль",
        "марта", "март",
        "апреля", "апрель",
        "мая", "май",
        "июня", "июнь",
        "июля", "июль",
        "августа", "август",
        "сентября", "сентябрь",
        "октября", "октябрь",
        "ноября", "ноябрь",
        "декабря", "декабрь",
    ):
        torrent_name = torrent_name.replace(month, "")
    # <--

    # Collapse runs of whitespace and trim the ends
    return re.sub(r"\s+", " ", torrent_name).strip()