

Python BeautifulSoup.BeautifulSoup Class Code Examples

This article collects typical usage examples of the Python class lib.BeautifulSoup.BeautifulSoup. If you have been wondering what exactly the BeautifulSoup class does, how to use it, or what real-world code that uses it looks like, the curated class examples below should help.


Fifteen code examples of the BeautifulSoup class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
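
Before the examples, a minimal orientation sketch of the BeautifulSoup 3 call pattern that every example below relies on. The lib.BeautifulSoup import path follows the source projects; a plain BeautifulSoup 3 install would use `from BeautifulSoup import BeautifulSoup`, and the sample markup here is invented for illustration:

from lib.BeautifulSoup import BeautifulSoup

html = '<div id="main"><a class="item" href="/a">A</a><a class="item" href="/b">B</a></div>'
soup = BeautifulSoup(html)                    # parse an HTML string
div = soup.find('div', {'id': 'main'})        # first matching tag, or None
links = div.findAll('a', {'class': 'item'})   # all matching tags, as a list
hrefs = [a['href'] for a in links]            # attributes are read like dict keys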

Example 1: get_organic_data

def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})  # container of the organic results
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})  # one <li class="g"> per result
        return contents
    return None
Developer: nava45, Project: gpage_crawler, Lines: 7, Source: webpage_splitter.py
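
A hypothetical invocation of get_organic_data, for illustration only (the id="ires" container and class="g" items mirror the old Google results markup this crawler targets; the sample string is invented):

sample = '<div id="ires"><ol><li class="g">First result</li><li class="g">Second result</li></ol></div>'
results = get_organic_data(sample)
if results:
    print len(results)  # prints 2 for this invented sample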

Example 2: fetch_trains

def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key) #@UndefinedVariable
    if data is not None:
        return data

    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content
    soup = BeautifulSoup(html)
    list_node = soup.find("ul", {"class": "b-holster b-search-result"})
    if list_node is not None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except (ValueError, IndexError):
                # skip <b> nodes that do not hold a pair of HH:MM times
                pass
        memcache.add(key, result, 60*60)  #@UndefinedVariable
        return result
Developer: bordanton, Project: Youth, Lines: 28, Source: bot.py

Example 3: handler

def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
Developer: cylonbrain, Project: FullTextRss, Lines: 7, Source: mitbbs.py

Example 4: parse_summary

    def parse_summary(self, summary, link):
        """Clean up an article's HTML and download/localize its images."""

        soup = BeautifulSoup(summary)

        for span in list(soup.findAll(attrs={"style": "display: none;"})):
            span.extract()

        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]

        for tag in soup.findAll(self.remove_tags):
            tag.extract()

        img_count = 0
        for img in list(soup.findAll('img')):
            if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or not img.has_key('src') \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
                img.extract()
            else:
                try:
                    localimage = self.down_image(img['src'], link)

                    if localimage:
                        img['src'] = localimage
                        img_count += 1
                    else:
                        img.extract()
                except Exception, e:
                    print e
                    img.extract()
Developer: userid, Project: kindlereader, Lines: 35, Source: kindlereader.py

Example 5: get_episodes

def get_episodes():
	"""docstring for get_episodes"""
	
	html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania")
	soup = BeautifulSoup(html, fromEncoding="utf-8")
	print soup.originalEncoding
	episodesHtml = soup.findAll("div", attrs={"class" : "video-img-cont-catchup cont-first"})
	
	""" result
	
	<div class="video-img-cont-catchup cont-first" id="5217">
		<div class="video-date">okt 24.<span>12:15</span></div>
		<a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img">
			<img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" />
		</a>
		<a href="javascript:void(0)" class="video-add" id="5217-0">
			<img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" />
		</a>
		<div class="img-height-wide"></div>
		<h2>
			<a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a>
		</h2>
		<p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p>
	</div>
	
	"""
	
	episodes = []
	#print len(episodesHtml)
	for episode in episodesHtml:
		episodes.append({"title":episode.h2.a.string, "url":episode.h2.a['href'], "thumb":episode.a.img['src']})
	#print episodes
	return episodes
Developer: Xmister, Project: rtlmost-xbmc, Lines: 33, Source: test.py

Example 6: handler

def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    # keyword arguments match tag attributes in BeautifulSoup 3, so this finds
    # the <td> whose id matches postmessage_<digits>
    content = soup.find("td", id=re.compile(r"postmessage_\d+"))
    if content is None:
        return "failed to read content"
    return unicode(content)
Developer: cylonbrain, Project: FullTextRss, Lines: 7, Source: powerapple.py

Example 7: render

    def render(self):
        # TODO: fix and enable caching
        # content         =  cache.get(self.content_url)
        content = None

        url = self.content_url

        # If the page is not cached, retrieve it
        if content is None:
            opener      = urllib2.build_opener()
            content     = opener.open(url, timeout=5).read()
            
            # Save the page in cache
            # cache.set(self.content_url, content)
        
        soup            = BeautifulSoup(content)

        # TODO: Disabled. Add GET parameter support and enable.
        # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
        #for tag in soup.findAll('a', href=True):
        #    tag['href'] = urlparse.urljoin(self.content_url, tag['href'])
        
        # If there's no element specified, use the BODY. 
        # Otherwise find the element with given id.
        if self.element_id == "":
            html        = soup.find("body").renderContents()
        else:
            html        = str(soup.find(id=self.element_id))
        
        return html
Developer: kahaeia1, Project: a-plus, Lines: 30, Source: models.py

Example 8: parse_response

    def parse_response(self):
        soup = BeautifulSoup(self.response)

        head = soup.find("head")

        self.max_points = int(_get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))

        if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
            self.is_accepted = True

        meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
        if meta_title:
            self.meta["title"] = meta_title
        else:
            title = soup.find("title")
            if title:
                self.meta["title"] = title.contents

        self.meta["description"] = _get_value_from_soup(head, "meta", "content", {"name": "DC.Description"}, "")

        points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
        if points is not None:
            self.points = int(points)
            self.is_graded = True
            self.is_accepted = True

        exercise_div = soup.body.find("div", {"id": "exercise"})

        if exercise_div is not None:
            self.content = exercise_div.renderContents()
        else:
            self.content = soup.body.renderContents()
Developer: OpenDSA, Project: OpenDSA-devserver, Lines: 32, Source: exercise_page.py
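
The _get_value_from_soup helper is defined elsewhere in exercise_page.py. A plausible sketch of it, inferred only from the call sites above (the name appears in the source, but this signature and body are assumptions, not the project's actual code):

def _get_value_from_soup(soup, tag_name, attribute, attrs, default=None):
    # Assumed behavior: return one attribute of the first matching tag,
    # falling back to `default` when the tag or attribute is missing.
    element = soup.find(tag_name, attrs)
    if element is None:
        return default
    return element.get(attribute, default)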

Example 9: parse

    def parse(property_id, ratecode='SPGCP'):
        valid_property = False
        hotel_props = {'id': property_id}

        property_url = "%s?propertyID=%s" % (starwood_url, property_id)
        logging.debug("Property URL: %s" % property_url)
        starwood_response = urlfetch.fetch(url=property_url, deadline=10)
        if starwood_response:
            try:
                soup = BeautifulSoup(starwood_response.content).find(attrs={'id': 'propertyHighlight'}).find(attrs={'class': 'propertyContainer'})
            except AttributeError:  # either container is missing from the page
                soup = None

            if soup:
                try:
                    hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                    hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])

                    valid_property = True
                except (AttributeError, IndexError, ValueError):
                    pass

                if valid_property:
                    hotel_props['address'] = StarwoodParser.parse_address(soup)
                    #hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                    hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))

        # return the populated dict only when parsing succeeded
        return valid_property and hotel_props or None
Developer: mshafrir, Project: awardr, Lines: 28, Source: parsers.py

Example 10: getRes

	def getRes(self):
		url = self.getResURL()
		page = urllib2.urlopen(url).read()#.decode('GBK').encode('utf-8')
		soup = BeautifulSoup(page)
		main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
		#print main_wrapper.prettify()
		clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
		#print clr_after.prettify()
		items = clr_after.findAll('div', {'class': 'main'})[0]
		#print items.prettify()
		items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
		print items1.prettify().decode('utf-8').encode('gbk')
		items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
		#print items2.prettify().decode('utf-8').encode('gbk')
		
		for item in items2:
			if not hasattr(item, 'findAll'):  # skip bare text nodes between the tags
				continue
			print item
			inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
			fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
			_time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
			_discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
			_price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
			
			print inc#.decode('utf-8').encode('gbk')
			print fly_time#.decode('utf-8').encode('gbk')
			print _time#.decode('utf-8').encode('gbk')
			print _discount.decode('utf-8').encode('gbk')
			print _price.decode('utf-8').encode('gbk')
Developer: zuojie, Project: KKT, Lines: 28, Source: Ticket.py

Example 11: view_page

def view_page(slug):
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    page.content = str(content)
    return render_template('cms_view_page.html', page=page)
Developer: joemarct, Project: flask-gae-app, Lines: 8, Source: views.py

Example 12: location

	def location(self,ip):
		try:
			self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
		except Exception:
			return "Earth"
		soup = BeautifulSoup(self.current_page)
		lo = soup.findAll('div', { "id" : "output" })[0].findAll('b')[1].text.encode('utf-8','ignore')
		return lo
Developer: knarfytrebil, Project: MF-MANA, Lines: 8, Source: spider.py

Example 13: getPresentation

	def getPresentation(self):
		base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_'
		for i in range(1, 35): # fetch the next two days' info sessions for all 34 provinces [1-34]
			url = base_url + str(i) + '.html'
			#print url
			try:
				page = self.getRes(url)
				soup = BeautifulSoup(page)
			except: # failed to open the URL
				continue
			# collect all the countdown blocks
			try: # the current province may have no info sessions in the near future
				countdowns = soup.findAll('div', {'class': 'list_topic'})
				y_m_d2, y_m_d3 = '', '' # dates of the sessions held tomorrow and the day after
				first, second = -1, -1 # indices of the campusTalk tables for tomorrow and the day after; each table sits one position after its countdown div
				# table 0 with class campusTalk is the header row and real data starts at table 1, so day starts at 1
				day = 1
				for countdown in countdowns:
					cd = string.atoi(countdown.contents[0].contents[2].string)
					if cd > 2: # ignore sessions more than two days away
						break
					elif cd == 1: # session held tomorrow (countdown: 1 day)
						first = day
						y_m_d2 = countdown.contents[1].string
					elif cd == 2: # session held the day after tomorrow (countdown: 2 days)
						second = day
						y_m_d3 = countdown.contents[1].string
					day = day + 1
				# first holds day-2 info, second holds day-3 info; -1 means no session that day
				if first != -1:
					tables = soup.findAll('table', {'class':'campusTalk'})[first]
					trs = tables.findAll('tr')
					for tr in trs:
						tds = tr.findAll('td')
						city = tds[0].a.string.strip()
						school = tds[1].a.string.strip()
						addr = tds[2].string.strip()
						inc = tds[3].a.string.strip()
						try: # some sessions do not list a start time
							pdate = y_m_d2 + ' ' + tds[4].string
						except Exception, e:
							pdate = y_m_d2 # fall back to the date alone
						self.presentations.append(CPresentation(city, inc, school, pdate, addr))
				if second != -1:
					tables = soup.findAll('table', {'class':'campusTalk'})[second]
					trs = tables.findAll('tr')
					for tr in trs:
						tds = tr.findAll('td')
						city = tds[0].a.string.strip()
						school = tds[1].a.string.strip()
						addr = tds[2].string.strip()
						inc = tds[3].a.string.strip()
						try:
							pdate = y_m_d3 + ' ' + tds[4].string
						except:
							pdate = y_m_d3
						self.presentations.append(CPresentation(city, inc, school, pdate, addr))
			except:
				pass # handler body truncated in the original excerpt
Developer: zuojie, Project: KKT, Lines: 58, Source: Presentation.py

Example 14: assert_no_error_message_in_response

    def assert_no_error_message_in_response(self, response):
        """Check that the response has no error messages."""
        soup = BeautifulSoup(response)
        el = soup.find("p", "alert-error")
        if el:
            self.fail("error message found in response unexpectedly: {}".format(el.contents))
        # findAll returns a list, so report each offending label individually
        for el in soup.findAll("label", "alert-error"):
            self.fail("error message found in response unexpectedly: {}".format(el.contents))
Developer: JElbourne, Project: PubCart, Lines: 9, Source: test_helpers.py

Example 15: Items

    def Items(self):
        itemsprocessed = []
        cnt4debug = 0
        opener = URLOpener(self.host)
        decoder = AutoDecoder()
        for section, url in self.feeds:
            content = None
            cnt4debug += 1
            if IsRunInLocal and cnt4debug > 1:
                break

            result = opener.open(url)
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                logging.error('err(%d) to fetch %s.' % (status_code, url))
                continue

            if self.feed_encoding:
                content = content.decode(self.feed_encoding)
            else:
                content = decoder.decode(content)

            content = self.preprocess(content)

            feed = feedparser.parse(content)
            for e in feed['entries']:
                # ads or other unwanted content in full-text RSS can be stripped in postprocess
                desc = self.postprocess(e.description)
                desc = self.FragToXhtml(desc, e.title, self.feed_encoding)

                if self.keep_image:
                    soup = BeautifulSoup(desc)  # parse the article fragment, not the whole feed
                    self.soupbeforeimage(soup)
                    for img in soup.findAll('img'):
                        imgurl = img['src']
                        if not imgurl.startswith('http') and not imgurl.startswith('www'):
                            imgurl = self.urljoin(url, imgurl)
                        imgresult = opener.open(imgurl)
                        imgcontent = imgresult.content if imgresult.status_code == 200 else None
                        if imgcontent:
                            imgtype = imghdr.what(None, imgcontent)
                            if imgtype:
                                imgmime = r"image/" + imgtype
                                if imgtype == 'jpeg':
                                    fnimg = "%d.jpg" % random.randint(10000, 99999999)
                                else:
                                    fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                                img['src'] = fnimg
                                yield (imgmime, imgurl, fnimg, imgcontent)
                    self.soupprocessex(soup)
                    desc = soup.renderContents('utf-8').decode('utf-8')
                    soup = None

                if e.title not in itemsprocessed and desc:
                    itemsprocessed.append(e.title)
                    yield (section, e.link, e.title, desc)
Developer: lovejoy, Project: KindleEar, Lines: 56, Source: base.py
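
Items() interleaves two tuple shapes: image tuples that start with a MIME type, and article tuples that start with a section name. A hypothetical consumer loop, for illustration (feed stands for an instance of this class; save_image and add_article are invented placeholders, not KindleEar functions):

for item in feed.Items():
    if item[0].startswith('image/'):
        mime, imgurl, filename, payload = item
        save_image(filename, payload)             # placeholder: persist the image bytes
    else:
        section, link, title, html = item
        add_article(section, title, link, html)   # placeholder: append the article to the output book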


Note: the lib.BeautifulSoup.BeautifulSoup class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code fragments were selected from open-source projects contributed by many programmers; copyright in the source code remains with the original authors, and distribution and use must follow the corresponding project's license. Do not republish without permission.