This page collects typical usage examples of the BeautifulSoup class from Python's lib.BeautifulSoup module. If you have been wondering what the BeautifulSoup class does, how to use it, or want to see it in real code, the curated class examples here may help.
Fifteen code examples of the BeautifulSoup class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
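Before the examples, here is a minimal sketch of the BeautifulSoup 3 workflow they all build on (parse a markup string, then query it with find / findAll). The import path and the sample HTML are illustrative assumptions, not taken from any project on this page:
# Minimal BeautifulSoup 3 sketch (Python 2 era API, matching the examples below).
# Assumption: BS3 is importable as `from BeautifulSoup import BeautifulSoup`;
# the projects quoted here bundle it as `lib.BeautifulSoup` instead.
from BeautifulSoup import BeautifulSoup

html = '<div id="ires"><li class="g"><a href="http://example.com">hit</a></li></div>'
soup = BeautifulSoup(html)
div = soup.find('div', {'id': 'ires'})      # first matching tag, or None
items = div.findAll('li', {'class': 'g'})   # list of all matching tags
print items[0].a['href']                    # -> http://example.com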
Example 1: get_organic_data
def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})
        return contents
    return None
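A hedged usage sketch for get_organic_data above; the HTML fragment is a made-up stand-in for a Google results page, since the real input is not shown here:
# Hypothetical input shaped like Google's organic-results container.
sample = '<div id="ires"><li class="g">first hit</li><li class="g">second hit</li></div>'
results = get_organic_data(sample)
if results:
    for li in results:
        print li.renderContents()   # BS3: inner markup of each result node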
Example 2: fetch_trains
def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key) #@UndefinedVariable
    if data != None:
        return data
    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content
    soup = BeautifulSoup(html)
    list_node = soup.find("ul", { "class" : "b-holster b-search-result" })
    if list_node != None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except:
                pass
        memcache.add(key, result, 60*60) #@UndefinedVariable
        return result
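The regex split above strips whatever markup sits inside each <b> node before the two times are parsed; the exact markup is not shown on this page, so the sample string below is only an assumption about its shape:
# Assumed shape of b_node.renderContents(): two HH:MM times separated by a tag.
regex = re.compile(r'<.*?>')
parts = [p for p in regex.split('08:15<br />09:02') if p]   # -> ['08:15', '09:02']
times = [datetime.datetime.strptime(p, '%H:%M').time() for p in parts]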
Example 3: handler
def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
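As an aside, instead of decoding GB18030 by hand and re-encoding to UTF-8 as above, BeautifulSoup 3 can be handed the raw bytes together with a fromEncoding hint (the same pattern Example 5 uses); a minimal sketch under that assumption:
raw = sock.read()                                    # bytes as received
soup = BeautifulSoup(raw, fromEncoding='gb18030')    # let BS3 do the decoding
content = soup.find("td", {"class": "jiawenzhang-type"})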
Example 4: parse_summary
def parse_summary(self, summary, link):
    """Process the article summary."""
    soup = BeautifulSoup(summary)
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    for img in list(soup.findAll('img')):
        if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
            img.extract()
        else:
            try:
                localimage = self.down_image(img['src'], link)
                if localimage:
                    img['src'] = localimage
                    img_count = img_count + 1
                else:
                    img.extract()
            except Exception, e:
                print e
                img.extract()
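The BS3 calls used in the loop above (extract(), deleting a single attribute, rewriting img['src']) are easier to see in isolation; a minimal standalone sketch with made-up HTML:
snippet = '<p style="display: none;">ad</p><img src="a.jpg" onclick="track()" />'
s = BeautifulSoup(snippet)
for hidden in s.findAll(attrs={"style": "display: none;"}):
    hidden.extract()                 # remove the tag (and its children) from the tree
for tag in s.findAll(attrs={"onclick": True}):
    del tag["onclick"]               # drop just the attribute, keep the tag
print s.renderContents()             # only the cleaned <img src="a.jpg" /> remains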
Example 5: get_episodes
def get_episodes():
    """docstring for get_episodes"""
    html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania")
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    print soup.originalEncoding
    episodesHtml = soup.findAll("div", attrs={"class": "video-img-cont-catchup cont-first"})
    """ result
    <div class="video-img-cont-catchup cont-first" id="5217">
        <div class="video-date">okt 24.<span>12:15</span></div>
        <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img">
            <img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" />
        </a>
        <a href="javascript:void(0)" class="video-add" id="5217-0">
            <img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" />
        </a>
        <div class="img-height-wide"></div>
        <h2>
            <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a>
        </h2>
        <p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p>
    </div>
    """
    episodes = []
    #print len(episodesHtml)
    for episode in episodesHtml:
        episodes.append({"title": episode.h2.a.string, "url": episode.h2.a['href'], "thumb": episode.a.img['src']})
    #print episodes
    return episodes
Example 6: handler
def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    content = soup.find(id=re.compile("postmessage_\d+"), name="td")
    if content is None:
        return "failed to read content"
    return unicode(content)
Example 7: render
def render(self):
    # TODO: fix and enable caching
    # content = cache.get(self.content_url)
    content = None
    url = self.content_url
    # If the page is not cached, retrieve it
    if content == None:
        opener = urllib2.build_opener()
        content = opener.open(url, timeout=5).read()
        # Save the page in cache
        # cache.set(self.content_url, content)
    soup = BeautifulSoup(content)
    # TODO: Disabled. Add GET parameter support and enable.
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    #for tag in soup.findAll('a', href=True):
    #    tag['href'] = urlparse.urljoin(self.content_url, tag['href'])
    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))
    return html
Example 8: parse_response
def parse_response(self):
    soup = BeautifulSoup(self.response)
    head = soup.find("head")
    self.max_points = int(_get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))
    if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
        self.is_accepted = True
    meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents
    self.meta["description"] = _get_value_from_soup(head, "meta", "content", {"name": "DC.Description"}, "")
    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points != None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True
    exercise_div = soup.body.find("div", {"id": "exercise"})
    if exercise_div != None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
Example 9: parse
def parse(property_id, ratecode='SPGCP'):
    valid_property = False
    hotel_props = {'id': property_id}
    property_url = "%s?propertyID=%s" % (starwood_url, property_id)
    logging.debug("Property URL: %s" % property_url)
    starwood_response = urlfetch.fetch(url=property_url, deadline=10)
    if starwood_response:
        try:
            soup = BeautifulSoup(starwood_response.content).find(attrs={'id': 'propertyHighlight'}).find(attrs={'class': 'propertyContainer'})
        except:
            soup = None
        if soup:
            try:
                hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])
                valid_property = True
            except:
                pass
            if valid_property:
                hotel_props['address'] = StarwoodParser.parse_address(soup)
                #hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))
    return valid_property and hotel_props or None
Example 10: getRes
def getRes(self):
    url = self.getResURL()
    page = urllib2.urlopen(url).read()  #.decode('GBK').encode('utf-8')
    soup = BeautifulSoup(page)
    main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
    #print main_wrapper.prettify()
    clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
    #print clr_after.prettify()
    items = clr_after.findAll('div', {'class': 'main'})[0]
    #print items.prettify()
    items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
    print items1.prettify().decode('utf-8').encode('gbk')
    items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
    #print items2.prettify().decode('utf-8').encode('gbk')
    for item in items2:
        print item
        inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
        fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
        _time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
        _discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
        _price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
        print inc  #.decode('utf-8').encode('gbk')
        print fly_time  #.decode('utf-8').encode('gbk')
        print _time  #.decode('utf-8').encode('gbk')
        print _discount.decode('utf-8').encode('gbk')
        print _price.decode('utf-8').encode('gbk')
Example 11: view_page
def view_page(slug):
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    page.content = str(content)
    return render_template('cms_view_page.html', page=page)
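A minimal sketch of the replaceWith() call the example relies on, outside of the GAE/template context; prettify_code is replaced here by a plain string just to show the mechanics:
doc = BeautifulSoup('<pre>x = 1</pre>')
pre = doc.find('pre')
pre.contents[0].replaceWith('x = 1  # highlighted')   # swap the text node in place
print str(doc)                                        # <pre>x = 1  # highlighted</pre>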
Example 12: location
def location(self, ip):
    try:
        self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
    except Exception:
        return "Earth"
    soup = BeautifulSoup(self.current_page)
    lo = soup.findAll('div', {"id": "output"})[0].findAll('b')[1].text.encode('utf-8', 'ignore')
    return lo
Example 13: getPresentation
def getPresentation(self):
    base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_'
    for i in range(1, 35):  # pull the next two days' campus-talk info for the 34 provinces [1-34]
        url = base_url + str(i) + '.html'
        #print url
        try:
            page = self.getRes(url)
            soup = BeautifulSoup(page)
        except:  # failed to open the url
            continue
        # collect all the countdown blocks
        try:  # the current city may have no presentations scheduled for a while
            countdowns = soup.findAll('div', {'class': 'list_topic'})
            y_m_d2, y_m_d3 = '', ''  # dates of tomorrow's and the day after tomorrow's presentations
            first, second = -1, -1  # indexes of the campusTalk tables for those two days; each table is offset by one position from its countdown div
            # table 0 of class campusTalk is the header row; real data starts at table 1, so day starts at 1
            day = 1
            for countdown in countdowns:
                cd = string.atoi(countdown.contents[0].contents[2].string)
                if cd > 2:  # ignore presentations more than two days out
                    break
                elif cd == 1:  # presentations held tomorrow (1 day left on the countdown)
                    first = day
                    y_m_d2 = countdown.contents[1].string
                elif cd == 2:  # presentations held the day after tomorrow (2 days left)
                    second = day
                    y_m_d3 = countdown.contents[1].string
                day = day + 1
            # first holds tomorrow's table index, second the day after's; -1 means no presentations that day
            if first != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[first]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:  # some presentations do not list a start time [H:M:S]
                        pdate = y_m_d2 + ' ' + tds[4].string
                    except Exception, e:
                        pdate = y_m_d2  # then record only the date
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
            if second != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[second]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:
                        pdate = y_m_d3 + ' ' + tds[4].string
                    except:
                        pdate = y_m_d3
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
        except:
            continue  # the page loaded but its structure did not match; move on to the next province
Example 14: assert_no_error_message_in_response
def assert_no_error_message_in_response(self, response):
    """Check that the response has no error messages."""
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    el = soup.findAll("label", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
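A sketch of how a helper like this is typically called from a test case, assuming it is defined on (or mixed into) the TestCase class; the URL, the form data, and the use of a Django-style test client are assumptions:
class FormViewTest(TestCase):
    def test_submit_has_no_errors(self):
        # Hypothetical view under test; pass the rendered markup to the helper above.
        response = self.client.post('/some/form/', {'field': 'value'})
        self.assert_no_error_message_in_response(response.content)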
Example 15: Items
def Items(self):
    itemsprocessed = []
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        content = None
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 and content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue
        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)
        content = self.preprocess(content)
        feed = feedparser.parse(content)
        for e in feed['entries']:
            # For full-text RSS, ads or other unwanted content can be stripped in postprocess
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
            if self.keep_image:
                soup = BeautifulSoup(content)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None
            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)