本文整理汇总了Python中calibre.ebooks.BeautifulSoup.BeautifulSoup.decode_contents方法的典型用法代码示例。如果您正苦于以下问题：Python BeautifulSoup.decode_contents方法的具体用法？Python BeautifulSoup.decode_contents怎么用？Python BeautifulSoup.decode_contents使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类calibre.ebooks.BeautifulSoup.BeautifulSoup的用法示例。
在下文中一共展示了BeautifulSoup.decode_contents方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _reformat
# 需要导入模块: from calibre.ebooks.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from calibre.ebooks.BeautifulSoup.BeautifulSoup import decode_contents [as 别名]
def _reformat(self, data, htmlpath):
    """Clean up the markup of a single HTML page and return it as text.

    The page is parsed, then tidied in several best-effort passes:
    ``<script>`` elements are dropped, a ``<body>`` nested inside
    ``<head>`` is moved under ``<html>``, navigation-bar tables and a
    leading ``<br>`` are removed, image ``src`` values that do not
    resolve on disk are rewritten, and a body consisting of a single
    one-cell table is replaced by that cell's contents.

    :param data: Raw page content. It is decoded with
        ``self.input_encoding`` first when that attribute is set.
    :param htmlpath: Path of the page on disk; image ``src`` values are
        resolved relative to its directory.
    :return: The cleaned markup from ``soup.decode_contents()``. When
        parsing raises ``ValueError`` or serialising raises
        ``RuntimeError``, ``data`` is returned instead (already decoded
        and converted, as far as those steps got).

    On success the absolute form of ``htmlpath`` is added to
    ``self.re_encoded_files``.
    """
    if self.input_encoding:
        data = data.decode(self.input_encoding)
    try:
        data = xml_to_unicode(data, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(data)
    except ValueError:
        # hit some strange encoding problems...
        self.log.exception("Unable to parse html for cleaning, leaving it")
        return data

    # nuke javascript...
    for script in soup('script'):
        script.extract()

    # See if everything is inside a <head> tag
    # https://bugs.launchpad.net/bugs/1273512
    body = soup.find('body')
    if body is not None and body.parent.name == 'head':
        html = soup.find('html')
        # find() yields None when there is no <html> element; in that
        # case there is nowhere to move the body to, so leave it alone.
        if html is not None:
            html.insert(len(html), body)

    def is_nav_table(table):
        """Return True if the table's first image has a nav-bar alt text."""
        try:
            alt = table.img['alt'].lower()
            return any(word in alt for word in ('prev', 'next', 'team'))
        except Exception:
            # No image, no alt attribute, or an unexpected alt value:
            # not a navigation bar as far as we can tell.
            return False

    # Remove forward and back nav bars from the top/bottom of each page,
    # because they disrupt the flow of the text and waste space.
    # Only the first and last table are considered, only when they sit
    # at (or one node away from) the edge of their parent, and only if
    # they hold an image with an alt attribute containing prev, next or
    # team.
    nav_candidates = soup('table')
    if nav_candidates:
        first, last = nav_candidates[0], nav_candidates[-1]
        if (first.previousSibling is None
                or first.previousSibling.previousSibling is None):
            if is_nav_table(first):
                first.extract()
        if (last.nextSibling is None
                or last.nextSibling.nextSibling is None):
            if is_nav_table(last):
                last.extract()
    # for some very odd reason each page's content appears to be in a table
    # too. and this table has sub-tables for random asides... grr.

    # remove br at top of page if present after nav bars removed
    line_breaks = soup('br')
    if line_breaks and check_all_prev_empty(line_breaks[0].previousSibling):
        line_breaks[0].extract()

    # some images seem to be broken in some chm's :/
    base = os.path.dirname(htmlpath)
    for img in soup('img', src=True):
        src = img['src']
        ipath = os.path.join(base, *src.split('/'))
        if os.path.exists(ipath):
            continue
        # Drop anything after the first ';' and try again.
        src = src.split(';')[0]
        if not src:
            continue
        ipath = os.path.join(base, *src.split('/'))
        if not os.path.exists(ipath):
            # Still missing: strip leading parent-directory segments.
            while src.startswith('../'):
                src = src[3:]
        img['src'] = src

    try:
        # if there is only a single table with a single element
        # in the body, replace it by the contents of this single element
        body_tables = soup.body.findAll('table', recursive=False)
        if len(body_tables) == 1:
            rows = body_tables[0].findAll('tr', recursive=False)
            if len(rows) == 1:
                cells = rows[0].findAll('td', recursive=False)
                if len(cells) == 1:
                    cell_contents = cells[0].contents
                    table_idx = soup.body.contents.index(body_tables[0])
                    body_tables[0].extract()
                    # Popping from the end and always inserting at the
                    # same index keeps the original child order.
                    while cell_contents:
                        soup.body.insert(table_idx, cell_contents.pop())
    except Exception:
        # Deliberately best-effort (e.g. the document has no <body>):
        # keep whatever state the tree is in and carry on.
        pass

    # do not prettify, it would reformat the <pre> tags!
    try:
        ans = soup.decode_contents()
        self.re_encoded_files.add(os.path.abspath(htmlpath))
        return ans
    except RuntimeError:
        # NOTE(review): presumably a recursion-depth failure on deeply
        # nested markup - confirm. Fall back to the uncleaned text.
        return data