当前位置: 首页>>代码示例>>Python>>正文


Python BeautifulSoup.renderContents方法代码示例

本文整理汇总了 Python 中 lib.BeautifulSoup.BeautifulSoup.renderContents 方法的典型用法代码示例。如果您正苦于以下问题：Python BeautifulSoup.renderContents 方法的具体用法？Python BeautifulSoup.renderContents 怎么用？Python BeautifulSoup.renderContents 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在的类 lib.BeautifulSoup.BeautifulSoup 的用法示例。


在下文中一共展示了BeautifulSoup.renderContents方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Items

# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 注意: renderContents 是 BeautifulSoup 对象的实例方法, 不能单独导入; 请先创建对象再调用, 例如 soup.renderContents()
 def Items(self):
     """Fetch every feed in ``self.feeds`` and yield its images and articles.

     This is a generator that produces two kinds of 4-tuples:

     * ``(imgmime, imgurl, fnimg, imgcontent)`` for each image that was
       downloaded from an entry (only when ``self.keep_image`` is true);
     * ``(section, e.link, e.title, desc)`` for each entry whose title has
       not been yielded before and whose description is non-empty.

     A feed whose fetch does not return status 200 with a non-empty body is
     logged and skipped.
     """
     seen_titles = set()  # titles already yielded; used to drop duplicates
     cnt4debug = 0
     opener = URLOpener(self.host)
     decoder = AutoDecoder()
     for section, url in self.feeds:
         cnt4debug += 1
         # Only the first feed is processed when IsRunInLocal is set.
         if IsRunInLocal and cnt4debug > 1:
             break

         result = opener.open(url)
         status_code, content = result.status_code, result.content
         # Skip on a bad status OR an empty body; decoding an empty/None
         # body below would otherwise fail.
         if status_code != 200 or not content:
             logging.error('err(%d) to fetch %s.' % (status_code, url))
             continue

         if self.feed_encoding:
             content = content.decode(self.feed_encoding)
         else:
             content = decoder.decode(content)

         content = self.preprocess(content)

         feed = feedparser.parse(content)
         for e in feed['entries']:
             # For full-text RSS, ads or other unwanted content can be
             # stripped in postprocess.
             desc = self.postprocess(e.description)
             desc = self.FragToXhtml(desc, e.title, self.feed_encoding)

             if self.keep_image:
                 # Parse this entry's own markup (not the whole feed), so
                 # that only its images are fetched and rewritten.
                 soup = BeautifulSoup(desc)
                 self.soupbeforeimage(soup)
                 for img in soup.findAll('img'):
                     imgurl = img['src']
                     if not imgurl.startswith(('http', 'www')):
                         imgurl = self.urljoin(url, imgurl)
                     imgresult = opener.open(imgurl)
                     imgcontent = imgresult.content if imgresult.status_code == 200 else None
                     if not imgcontent:
                         continue
                     imgtype = imghdr.what(None, imgcontent)
                     if not imgtype:
                         continue
                     imgmime = r"image/" + imgtype
                     # 'jpeg' images get the conventional '.jpg' extension.
                     ext = 'jpg' if imgtype == 'jpeg' else imgtype
                     fnimg = "%d.%s" % (random.randint(10000, 99999999), ext)
                     # Point the tag at the generated file name that is
                     # yielded together with the image data.
                     img['src'] = fnimg
                     yield (imgmime, imgurl, fnimg, imgcontent)
                 self.soupprocessex(soup)
                 desc = soup.renderContents('utf-8').decode('utf-8')
                 soup = None  # drop the parse tree before the next entry

             if e.title not in seen_titles and desc:
                 seen_titles.add(e.title)
                 yield (section, e.link, e.title, desc)
开发者ID:lovejoy,项目名称:KindleEar,代码行数:58,代码来源:base.py

示例2: sanitize_contents

# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 注意: renderContents 是 BeautifulSoup 对象的实例方法, 不能单独导入; 请先创建对象再调用, 例如 soup.renderContents()
 def sanitize_contents(self, contents):
   """Return ``contents`` re-rendered with script-capable markup removed.

   The markup is parsed with BeautifulSoup and then:

   * every ``script``, ``meta``, ``head`` and ``link`` element is removed;
   * every attribute whose name starts with ``on`` (case-insensitive) is
     deleted;
   * ``href`` and ``src`` attributes whose value starts with
     ``javascript:`` (ignoring leading whitespace and case) are deleted.

   The result is whatever ``soup.renderContents()`` returns.
   """
   soup = BeautifulSoup(contents)
   for tagname in ['script', 'meta', 'head', 'link']:
     for tag in soup.findAll(tagname):
       tag.extract()

   attr_re = re.compile('^on.*', re.I)
   for tag in soup.findAll():
     # Take a snapshot of the attribute names first: ``del tag[attr]``
     # mutates ``tag.attrs``, and deleting while iterating over it would
     # skip the attribute that follows each removed one.
     attr_names = [attr for attr, _ in tag.attrs]
     for attr in attr_names:
       if attr_re.match(attr):
         del tag[attr]
   for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['href']
   for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
     del tag['src']

   return soup.renderContents()
开发者ID:battlehorse,项目名称:snippy-backend,代码行数:19,代码来源:api.py

示例3: fulltext

# 需要导入模块: from lib.BeautifulSoup import BeautifulSoup [as 别名]
# 注意: renderContents 是 BeautifulSoup 对象的实例方法, 不能单独导入; 请先创建对象再调用, 例如 soup.renderContents()

#.........这里部分代码省略.........
     if status_code != 200 or not content:
         logging.error('err(%d) to fetch %s.' % (status_code,url))
         return
     
     if self.page_encoding:
         content = content.decode(self.page_encoding)
     else:
         content = decoder.decode(content)
     
     content = self.preprocess(content)
     soup = BeautifulSoup(content)
     
     try:
         title = soup.html.head.title.string
     except AttributeError:
         logging.error('object soup invalid!(%s)'%url)
         return
         
     title = self.processtitle(title)
     soup.html.head.title.string = title
     
     if self.keep_only_tags:
         body = Tag(soup, 'body')
         try:
             if isinstance(self.keep_only_tags, dict):
                 self.keep_only_tags = [self.keep_only_tags]
             for spec in self.keep_only_tags:
                 for tag in soup.find('body').findAll(**spec):
                     body.insert(len(body.contents), tag)
             soup.find('body').replaceWith(body)
         except AttributeError: # soup has no body element
             pass
     
     def remove_beyond(tag, next):  # nested helper
         """Detach all siblings on one side of ``tag`` and of its ancestors.

         Starting at ``tag`` and climbing through ``.parent`` until a node
         named 'body' (or ``None``) is reached, every node reachable through
         the ``next`` attribute is removed with ``extract()``.

         tag  -- node to start from; ``None`` makes the call a no-op.
         next -- name of the sibling attribute to follow; the callers pass
                 'nextSibling' or 'previousSibling'. (The name shadows the
                 builtin but is kept so existing calls are unaffected.)
         """
         while tag is not None and getattr(tag, 'name', None) != 'body':
             after = getattr(tag, next)
             while after is not None:
                 # Read the following sibling from the node about to be
                 # removed, before extract() is called on it, so each
                 # sibling is extracted exactly once.
                 ns = getattr(after, next)
                 after.extract()
                 after = ns
             tag = tag.parent
     
     if self.remove_tags_after:
         rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
         for spec in rt:
             tag = soup.find(**spec)
             remove_beyond(tag, 'nextSibling')
     
     if self.remove_tags_before:
         tag = soup.find(**self.remove_tags_before)
         remove_beyond(tag, 'previousSibling')
     
     remove_tags = self.insta_remove_tags + self.remove_tags
     remove_ids = self.insta_remove_ids + self.remove_ids
     remove_classes = self.insta_remove_classes + self.remove_classes
     remove_attrs = self.insta_remove_attrs + self.remove_attrs
     
     for tag in soup.findAll(remove_tags):
         tag.extract()
     for id in remove_ids:
         for tag in soup.findAll(attrs={"id":id}):
             tag.extract()
     for cls in remove_classes:
         for tag in soup.findAll(attrs={"class":cls}):
             tag.extract()
     for attr in remove_attrs:
         for tag in soup.findAll(attrs={attr:True}):
             del tag[attr]
     for tag in soup.findAll(attrs={"type":"text/css"}):
         tag.extract()
     for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)):
         cmt.extract
     
     if self.keep_image:
         self.soupbeforeimage(soup)
         for img in soup.findAll('img'):
             imgurl = img['src']
             if not imgurl.startswith('http') and not imgurl.startswith('www'):
                 imgurl = self.urljoin(url, imgurl)
             imgresult = opener.open(imgurl)
             imgcontent = imgresult.content if imgresult.status_code == 200 else None
             if imgcontent:
                 imgtype = imghdr.what(None, imgcontent)
                 if imgtype:
                     imgmime = r"image/" + imgtype
                     if imgtype == 'jpeg':
                         fnimg = "%d.jpg" % random.randint(10000,99999999)
                     else:
                         fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype)
                     img['src'] = fnimg
                     yield (imgmime, imgurl, fnimg, imgcontent)
     else:
         for img in soup.findAll('img'):
             img.extract()
     
     self.soupprocessex(soup)
     content = soup.renderContents('utf-8').decode('utf-8')
     soup = None
     
     yield (title, None, None, content)
开发者ID:lovejoy,项目名称:KindleEar,代码行数:104,代码来源:base.py


注:本文中的lib.BeautifulSoup.BeautifulSoup.renderContents方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。