本文整理汇总了Python中lib.BeautifulSoup.BeautifulSoup.renderContents方法的典型用法代码示例。如果您正苦于以下问题:Python BeautifulSoup.renderContents方法的具体用法?Python BeautifulSoup.renderContents怎么用?Python BeautifulSoup.renderContents使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lib.BeautifulSoup.BeautifulSoup的用法示例。
在下文中一共展示了BeautifulSoup.renderContents方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Items
# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from lib.BeautifulSoup.BeautifulSoup import renderContents [as 别名]
def Items(self):
    """Generator over the configured RSS feeds (full-text variant).

    Yields two kinds of tuples:
      - (imgmime, imgurl, filename, imgcontent) for each inline image kept,
        so the caller can store the image under *filename*;
      - (section, link, title, desc) for each article, deduplicated by title.

    Relies on project helpers (URLOpener, AutoDecoder, feedparser,
    BeautifulSoup) and instance configuration (self.feeds,
    self.feed_encoding, self.keep_image, ...) defined elsewhere.
    """
    itemsprocessed = []  # article titles already yielded, used for dedupe
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        content = None
        cnt4debug += 1
        # When running locally (debug), fetch only the first feed.
        if IsRunInLocal and cnt4debug > 1:
            break
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        # BUGFIX: was `status_code != 200 and content`, which let a failed
        # fetch with an EMPTY body fall through and crash in decode();
        # use the same guard as the full-text page fetcher.
        if status_code != 200 or not content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue
        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)
        content = self.preprocess(content)
        feed = feedparser.parse(content)
        for e in feed['entries']:
            # For full-content RSS, ads or other unwanted fragments can be
            # stripped in postprocess().
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
            if self.keep_image:
                # BUGFIX: was BeautifulSoup(content) — that parses the WHOLE
                # feed document instead of this entry's description, so the
                # renderContents() below would replace every article with the
                # full feed markup. Parse the entry fragment instead.
                soup = BeautifulSoup(desc)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    # Resolve relative image URLs against the feed URL.
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            # Random local filename; .jpg for the common case.
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None  # release the parse tree promptly
            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)
示例2: sanitize_contents
# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from lib.BeautifulSoup.BeautifulSoup import renderContents [as 别名]
def sanitize_contents(self, contents):
    """Strip scripting vectors from an HTML fragment.

    Removes <script>/<meta>/<head>/<link> tags, every on* event-handler
    attribute, and javascript: URLs in href/src attributes.

    contents -- HTML string; parsed with the project's BeautifulSoup 3.
    Returns the sanitized markup as produced by soup.renderContents().
    """
    soup = BeautifulSoup(contents)
    # Drop tags that can execute or load external code.
    # (Was a side-effect list comprehension; a plain loop is the idiom.)
    for tagname in ['script', 'meta', 'head', 'link']:
        for tag in soup.findAll(tagname):
            tag.extract()
    # Remove inline event handlers (onclick, onload, ...).
    attr_re = re.compile('^on.*', re.I)
    for tag in soup.findAll():
        # BUGFIX: iterate a snapshot of tag.attrs — `del tag[attr]` mutates
        # the live attrs list, and deleting while iterating it skips the
        # attribute immediately following each deleted one.
        for attr, _ in list(tag.attrs):
            if attr_re.match(attr):
                del tag[attr]
    # Neutralize javascript: URLs in links and embedded resources.
    for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['href']
    for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['src']
    sanitized_contents = soup.renderContents()
    return sanitized_contents
示例3: fulltext
# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 或者: from lib.BeautifulSoup.BeautifulSoup import renderContents [as 别名]
#.........这里部分代码省略.........
if status_code != 200 or not content:
logging.error('err(%d) to fetch %s.' % (status_code,url))
return
if self.page_encoding:
content = content.decode(self.page_encoding)
else:
content = decoder.decode(content)
content = self.preprocess(content)
soup = BeautifulSoup(content)
try:
title = soup.html.head.title.string
except AttributeError:
logging.error('object soup invalid!(%s)'%url)
return
title = self.processtitle(title)
soup.html.head.title.string = title
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
if isinstance(self.keep_only_tags, dict):
self.keep_only_tags = [self.keep_only_tags]
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass
def remove_beyond(tag, next): # inline helper
    """Extract every sibling of *tag* — and of each of its ancestors —
    in the direction named by *next* ('nextSibling' or
    'previousSibling'), stopping once the <body> element is reached.

    Used to cut away all markup before/after the article content.
    """
    while tag is not None and getattr(tag, 'name', None) != 'body':
        after = getattr(tag, next)
        while after is not None:
            # NOTE(review): this re-reads *tag*'s sibling rather than
            # *after*'s; it appears to work only because extract()
            # relinks tag's sibling chain each pass (each node ends up
            # extracted redundantly) — confirm before restructuring.
            ns = getattr(tag, next)
            after.extract()
            after = ns
        tag = tag.parent
if self.remove_tags_after:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')
if self.remove_tags_before:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')
remove_tags = self.insta_remove_tags + self.remove_tags
remove_ids = self.insta_remove_ids + self.remove_ids
remove_classes = self.insta_remove_classes + self.remove_classes
remove_attrs = self.insta_remove_attrs + self.remove_attrs
for tag in soup.findAll(remove_tags):
tag.extract()
for id in remove_ids:
for tag in soup.findAll(attrs={"id":id}):
tag.extract()
for cls in remove_classes:
for tag in soup.findAll(attrs={"class":cls}):
tag.extract()
for attr in remove_attrs:
for tag in soup.findAll(attrs={attr:True}):
del tag[attr]
for tag in soup.findAll(attrs={"type":"text/css"}):
tag.extract()
for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)):
cmt.extract
if self.keep_image:
self.soupbeforeimage(soup)
for img in soup.findAll('img'):
imgurl = img['src']
if not imgurl.startswith('http') and not imgurl.startswith('www'):
imgurl = self.urljoin(url, imgurl)
imgresult = opener.open(imgurl)
imgcontent = imgresult.content if imgresult.status_code == 200 else None
if imgcontent:
imgtype = imghdr.what(None, imgcontent)
if imgtype:
imgmime = r"image/" + imgtype
if imgtype == 'jpeg':
fnimg = "%d.jpg" % random.randint(10000,99999999)
else:
fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
img['src'] = fnimg
yield (imgmime, imgurl, fnimg, imgcontent)
else:
for img in soup.findAll('img'):
img.extract()
self.soupprocessex(soup)
content = soup.renderContents('utf-8').decode('utf-8')
soup = None
yield (title, None, None, content)