This article collects typical usage examples of the html.parser.unescape method in Python. If you are wondering what exactly parser.unescape does, how to call it, or what real code using it looks like, the curated examples below should help. You can also explore further usage examples from the module the method lives in, html.parser.
A total of 9 code examples of the parser.unescape method are shown below, sorted by popularity by default.
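Note that HTMLParser.unescape was deprecated in Python 3.4 and removed in Python 3.9, so the examples below only run unmodified on older interpreters; the module-level html.unescape() function is the drop-in replacement. A minimal sketch contrasting the two spellings (the version check is an illustration, not part of any example below):

# Minimal sketch: html.unescape() replaces the removed HTMLParser.unescape() method.
import sys
import html

text = "&lt;b&gt;Fish &amp; Chips&lt;/b&gt;"
print(html.unescape(text))                 # <b>Fish & Chips</b>

if sys.version_info < (3, 9):
    from html.parser import HTMLParser
    print(HTMLParser().unescape(text))     # same result on older interpreters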
Example 1: mtranslate_google
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def mtranslate_google(word):
    import html.parser
    import re
    import urllib.request
    import urllib.parse

    agent = {'User-Agent':
             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36"}

    def unescape(text):
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the modern equivalent.
        parser = html.parser.HTMLParser()
        return parser.unescape(text)

    def translate(to_translate, to_language="auto", from_language="auto"):
        base_link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s"
        to_translate = urllib.parse.quote(to_translate)
        link = base_link % (to_language, from_language, to_translate)
        request = urllib.request.Request(link, headers=agent)
        raw_data = urllib.request.urlopen(request).read()
        data = raw_data.decode("utf-8")
        expr = r'class="t0">(.*?)<'
        re_result = re.findall(expr, data)
        if len(re_result) == 0:
            result = ""
        else:
            result = unescape(re_result[0])
        return result

    # lang_to and lang_from are expected to be defined at module level.
    return [[word, translate(word, lang_to, lang_from)]], ['', '']
# reverso.net
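A hypothetical call site for the function above; lang_from and lang_to are not defined anywhere in the excerpt, so they are assumed here to be module-level language codes:

# Hypothetical usage -- lang_from / lang_to are assumed module-level globals.
lang_from = "en"
lang_to = "de"

pairs, notes = mtranslate_google("hello")
print(pairs)   # e.g. [['hello', 'hallo']] if the request succeeds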
Example 2: getNews
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def getNews(symbol):
    # buildNewsUrl, urlopen, demjson and unescape are assumed to be imported at module level.
    url = buildNewsUrl(symbol)
    content = urlopen(url).read().decode('utf-8')
    content_json = demjson.decode(content)
    article_json = []
    news_json = content_json['clusters']
    for cluster in news_json:
        for article in cluster:
            if article == 'a':
                article_json.extend(cluster[article])
    return [[unescape(art['t']).strip(), art['u']] for art in article_json]
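The snippet relies on several names defined elsewhere in its module; the sketch below shows one plausible wiring, with buildNewsUrl left as an explicitly hypothetical placeholder because the real endpoint is not part of the excerpt:

# Assumed imports for the snippet above; demjson is a third-party package (pip install demjson).
from urllib.request import urlopen
from html import unescape
import demjson

def buildNewsUrl(symbol):
    # Hypothetical placeholder: the real URL template is not shown in the excerpt.
    return "https://example.com/news?q=" + symbol + "&output=json"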
Example 3: format_text
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def format_text(text):
    # `parser` is assumed to be a module-level html.parser.HTMLParser() instance.
    return parser.unescape(text).strip()
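A minimal sketch of how this helper could be wired up, assuming a module-level HTMLParser instance named parser (pre-Python-3.9 only; html.unescape would be the modern equivalent):

# Assumed module-level setup for format_text (HTMLParser.unescape exists only before Python 3.9).
from html.parser import HTMLParser

parser = HTMLParser()

def format_text(text):
    return parser.unescape(text).strip()

print(format_text("  Fish &amp; Chips  "))   # "Fish & Chips"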
Example 4: find_news
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def find_news():  # I'm adventuring with regular expressions for parsing!
    """Finds news for tweeting, along with their links."""
    # requests, latest_expr, news_block_expr, shorten_url and parser are assumed
    # to be defined at module level.
    nyTech = requests.get('https://nytimes.com/section/technology')
    latest = latest_expr.search(nyTech.text)
    news_blocks = news_block_expr.findall(latest.group(1))
    news = []
    for i in range(len(news_blocks)):
        item = (
            news_blocks[i][1].strip() + ' ' + shorten_url(news_blocks[i][0]),
            news_blocks[i][2].strip())  # This is img src.
        if item[1].startswith('Daily Report: '):
            # The original `item = item[14:]` slices the tuple itself and yields an
            # empty tuple; stripping the 14-character prefix from the checked string
            # is presumably what was intended.
            item = (item[0], item[1][len('Daily Report: '):])
        news.append(item)
    '''tv = requests.get('https://theverge.com', headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'})
    feed_patt = r'(?s)<div class="c-compact-river">(.*?)<div class="l-col__sidebar"'
    bunches = re.findall(feed_patt, tv.text)
    verge_news = []
    for cluster in bunches:
        snippets = re.findall(r'<h2.*?><a.*>(.*?)</a></h2>', cluster)
        verge_news.extend(snippets)
    for item in verge_news:
        news.append(parser.unescape(item))
    random.shuffle(news) #to bring a feel of randomness'''
    return news
Example 5: _unescape
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def _unescape(data):
    """ Return unescaped data such as &gt; -> >, &quot; -> ", etc. """
    # htmlparser is assumed to be a module-level HTMLParser instance (or compatible object).
    try:
        return htmlparser.unescape(data)
    except Exception:
        return data
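A sketch of the module-level htmlparser object this helper appears to rely on; the wiring below is an assumption, not part of the excerpt, and on Python 3.9+ html.unescape() gives the same effect:

# Assumed setup for _unescape; HTMLParser.unescape exists only before Python 3.9.
from html.parser import HTMLParser

htmlparser = HTMLParser()

print(_unescape('&gt; and &quot;quoted&quot;'))   # > and "quoted"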
Example 6: clean_html
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def clean_html(htmlFragment):
    """
    Resilient textarea fields return HTML fragments. This routine removes the HTML
    markup and joins the text contained within <div></div> blocks.
    :param htmlFragment:
    :return: cleaned up text
    """
    # BeautifulSoup (bs4), unescape and string_types (six) are assumed module-level imports.
    if not htmlFragment or not isinstance(htmlFragment, string_types):
        return htmlFragment
    s = BeautifulSoup(unescape(htmlFragment), "html.parser")
    return ' '.join(s.strings)
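A hedged usage sketch, assuming bs4 and six are installed; html.unescape is used here as the modern stand-in for the HTMLParser method this page documents:

# Assumed imports for clean_html; use an HTMLParser instance instead on pre-3.4 code.
from bs4 import BeautifulSoup
from six import string_types
from html import unescape

print(clean_html("<div>first line</div><div>second &amp; third</div>"))
# -> "first line second & third"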
Example 7: unescape
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def unescape(data):
    """ Return unescaped data such as &gt; -> >, &quot; -> ", etc. """
    # htmlparser is assumed to be a module-level HTMLParser instance (or compatible object).
    try:
        return htmlparser.unescape(data)
    except Exception:
        return data
Example 8: clean_html
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def clean_html(htmlFragment):
    """
    Resilient textarea fields return HTML fragments. This routine removes the HTML
    markup and returns the text contained within <div></div> blocks.
    :param htmlFragment:
    :return: cleaned up text
    """
    # BeautifulSoup (bs4), unescape and string_types (six) are assumed module-level imports.
    if not htmlFragment or not isinstance(htmlFragment, string_types):
        return htmlFragment
    return BeautifulSoup(unescape(htmlFragment), "html.parser").text
Example 9: run
# Required import: from html import parser [as alias]
# Or alternatively: from html.parser import unescape [as alias]
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    if not self._setup:
        raise Exception("Snapshill not ready yet!")
    submissions = self.reddit.front.new(limit=self.limit)
    for submission in submissions:
        debugTime = time.time()
        warned = False
        log.debug("Found submission.\n" + submission.permalink)
        if not should_notify(submission):
            log.debug("Skipping.")
            continue
        archives = [ArchiveContainer(fix_url(submission.url), submission.title)]
        if submission.is_self and submission.selftext_html is not None:
            log.debug("Found text post...")
            links = BeautifulSoup(unescape(submission.selftext_html)).find_all("a")
            finishedURLs = []
            for anchor in links:
                if time.time() > debugTime + WARN_TIME and not warned:
                    log.warn(
                        "Spent over {} seconds on post (ID: {})".format(
                            WARN_TIME, submission.name
                        )
                    )
                    warned = True
                log.debug("Found link in text post...")
                url = fix_url(anchor["href"])
                if skip_url(url):
                    continue
                if url in finishedURLs:
                    continue  # skip for sanity
                archives.append(ArchiveContainer(url, anchor.contents[0]))
                finishedURLs.append(url)
                ratelimit(url)
        Notification(
            self.reddit,
            submission,
            self._get_header(submission.subreddit),
            archives,
        ).notify()
        db.commit()
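The text-post branch above boils down to unescaping the reddit-supplied selftext_html and collecting its anchors; a standalone sketch of just that step, with html.unescape as the modern stand-in for the parser method and an explicit parser passed to BeautifulSoup to avoid its default-parser warning:

# Minimal sketch of the link-extraction step, independent of the bot plumbing.
from html import unescape
from bs4 import BeautifulSoup

selftext_html = '&lt;div&gt;&lt;a href="https://example.com"&gt;a link&lt;/a&gt;&lt;/div&gt;'
links = BeautifulSoup(unescape(selftext_html), "html.parser").find_all("a")
print([a["href"] for a in links])   # ['https://example.com']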