本文整理匯總了Python中core.decoder.Decoder.removeHTML方法的典型用法代碼示例。如果您正苦於以下問題：Python Decoder.removeHTML方法的具體用法？Python Decoder.removeHTML怎麼用？Python Decoder.removeHTML使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類core.decoder.Decoder的用法示例。
在下文中一共展示了Decoder.removeHTML方法的2個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: drawBbcCoUkNew
# 需要導入模塊: from core.decoder import Decoder [as 別名]
# 或者: from core.decoder.Decoder import removeHTML [as 別名]
def drawBbcCoUkNew(url):
    """Fetch a bbc.co.uk article page, reduce it to plain text and draw it.

    The page is obtained through ``Downloader.getContentFromUrl``. The article
    body is cut out with ``Decoder.extract`` using the first of three markers
    found in the HTML (``property="articleBody"``, ``class="text-wrapper"`` or
    ``<figcaption class="sp-media-asset``). The extracted fragment is passed
    through ``Decoder.removeHTML``, a line break is appended after every full
    stop, leftover ``>`` characters are dropped, and the text is handed to
    ``drawNew``.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None. The resulting text is passed to ``drawNew(textContent=...)``.
    """
    htmlContent = Downloader.getContentFromUrl(url=url)

    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract('property="articleBody"', " </div>", htmlContent)
        # Strip the fixed caption / copyright labels embedded in the article.
        for label in (
            '<span class="off-screen">Image copyright</span>',
            '<span class="story-image-copyright">AFP</span>',
            '<span class="story-image-copyright">Reuters</span>',
            '<span class="off-screen">Image caption</span>',
            '<span class="off-screen">Media caption</span>',
        ):
            body = body.replace(label, "")
        captionStart = '<span class="media-caption__text">'
        while captionStart in body:
            caption = Decoder.extractWithRegex(captionStart, "</span>", body)
            if not caption or caption not in body:
                # Nothing removable was returned (e.g. an unterminated span).
                # Replacing it would leave `body` unchanged and the loop
                # would never end, so stop here.
                break
            body = body.replace(caption, "")
    elif 'class="text-wrapper"' in htmlContent:
        # special content
        body = Decoder.extract('class="text-wrapper"', "</p>\n", htmlContent)
        # Both fragments are looked up before anything is removed, exactly
        # as the lookups were ordered originally.
        dates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        for fragment in (dates, lastUpdate):
            # An empty result has nothing to remove.
            if fragment:
                body = body.replace(fragment, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset', "</p><div ", htmlContent)
        if ">" in body:
            # Drop the remainder of the opening tag that the marker cut into.
            body = body[body.find(">") + 1:]
    else:
        # No known layout marker: fall back to an empty text instead of
        # failing with UnboundLocalError on the lines below.
        logger.debug("no known article layout found")
        body = ""

    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    # Single-argument concatenation is kept on purpose: the signature of the
    # project's logger is not visible from this block.
    logger.debug("body is: " + body)
    drawNew(textContent=body)
示例2: getChannels
# 需要導入模塊: from core.decoder import Decoder [as 別名]
# 或者: from core.decoder.Decoder import removeHTML [as 別名]
def getChannels(page):
    """Return Reuters headline entries, or the text of a single article.

    Args:
        page: ``'0'`` to list the latest headlines; any other value is used
            as the URL of an article page to fetch.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"`` and
        ``"thumbnail"``. For ``page == '0'`` there is one dict per entry of
        the feed's ``"headlines"`` array except the first one. Otherwise the
        list holds a single dict whose ``"title"`` is the extracted article
        text, ``"link"`` is ``page`` and ``"thumbnail"`` is empty.
    """
    x = []
    if page == '0':
        # The current time in milliseconds is appended to the feed URL.
        url = Reuters.LAST_NEWS_RSS + str(time.time() * 1000)
        logger.debug("news rss url is: " + url)
        bruteResult = Reuters.getContentFromUrl(url=url, launchLocation=False, ajax=True)
        logger.debug("brute ajax response: " + bruteResult)
        results = json.loads(bruteResult)
        for position, result in enumerate(results["headlines"]):
            if position == 0:
                # The first headline is skipped, as before.
                # NOTE(review): the reason is not visible here - confirm.
                continue
            img = result["mainPicUrl"]
            link = Reuters.MAIN_URL + result["url"]
            title = result["formattedDate"] + " - " + result["headline"]
            logger.debug("appending result: " + title + ", url: " + link + ", img: " + img)
            x.append({"title": title, "link": link, "thumbnail": img})
    else:
        html = Reuters.getContentFromUrl(url=page)
        # Two spellings of the article container id are handled.
        startRegex = '<span id="article-text">'
        if startRegex not in html:
            startRegex = '<span id="articleText">'
        body = Decoder.extract(startRegex, '<div class="linebreak"></div>', html)
        body = Decoder.removeHTML(body)
        if '|' in body:
            # Keep only what follows the first pipe character.
            body = body[body.find('|') + 1:]
        # Insert a line break one character before the first lowercase
        # letter. A missing match is tested explicitly rather than being
        # caught with a bare ``except``.
        firstLower = re.search("[a-z]", body)
        if firstLower is None:
            logger.error("No break for city was done. Something goes wrong")
        else:
            # Clamp at 0: with a match at index 0 the old ``index - 1`` was
            # -1, which sliced from the end of the string and put the break
            # before the last character.
            breakAt = max(firstLower.start() - 1, 0)
            body = body[:breakAt] + "\n" + body[breakAt:]
        x.append({"link": page, "title": body, "thumbnail": ''})
    return x