本文整理汇总了Python中BeautifulSoup.BeautifulSoup方法的典型用法代码示例。如果您正苦于以下问题：Python BeautifulSoup.BeautifulSoup方法的具体用法？Python BeautifulSoup.BeautifulSoup怎么用？Python BeautifulSoup.BeautifulSoup使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类BeautifulSoup的用法示例。
在下文中一共展示了BeautifulSoup.BeautifulSoup方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_context
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def extract_context(html, url):
    """Record a crawled page as a ``Content`` row and return its outbound links.

    Args:
        html: HTML markup of the crawled page.
        url: Address the markup belongs to; stored on the created row.

    Returns:
        A list with the ``href`` (as ``str``) of every anchor whose target
        starts with ``http://``.
    """
    soup = BeautifulSoup(html)

    # A page without a <title> makes ``soup.title`` None; reading ``.string``
    # from it used to raise AttributeError and abort the whole extraction.
    # Fall back to None, which is also what ``.string`` yields for a title
    # that has no single text child.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None

    # Insert into Content (under this domain).  ``visible`` is an external
    # predicate applied to every text node -- presumably it drops script/style
    # text; verify against its definition.
    texts = soup.findAll(text=True)
    summary = helpers.strip_tags(" \n".join(filter(visible, texts)))[:4000]
    try:
        Content.objects.create(
            url=url,
            title=title,
            summary=summary,
            last_crawled_at=datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        )
    except IntegrityError:
        println('%s - already existed in Content' % url)

    http_href = re.compile("^http://")
    return [str(anchor['href'])
            for anchor in soup.findAll('a', attrs={'href': http_href})
            if anchor['href']]
示例2: parse
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def parse(self, html):
    """
    Parse the given HTML, extract the page-wide attributes, then walk
    every tag accepted by ScholarArticleParser._tag_results_checker.
    Each such tag is parsed and cleaned into self.article, and articles
    with a truthy title are reported through handle_article.
    """
    self.soup = BeautifulSoup(html)

    # Page-level (non-itemized) attributes are handled first.
    self._parse_globals()

    # Then every listed article, one result tag at a time.
    is_result = ScholarArticleParser._tag_results_checker
    for result_tag in self.soup.findAll(is_result):
        self._parse_article(result_tag)
        self._clean_article()
        if not self.article['title']:
            continue
        self.handle_article(self.article)
示例3: list_of_all_href
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def list_of_all_href(self,html):
    '''
    Return [href, name] pairs for the anchors of class "touch" found in
    the mr-jatt page. The name is the anchor's markup with the listed tag
    fragments and any leading "<digits>." prefix removed.
    '''
    tag_noise = r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>'
    leading_number = r'^[0-9]+\.'

    anchors = BeautifulSoup(html).findAll('a','touch')
    links = []
    # NOTE(review): the last matching anchor is skipped, exactly as the
    # original index loop over len(a_list)-1 did -- confirm this is intended.
    for anchor in anchors[:-1]:
        label = re.sub(tag_noise, '', str(anchor))
        label = re.sub(leading_number, '', label)
        links.append([anchor.get('href'), label])
    return links
示例4: parse_checkerproxy
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def parse_checkerproxy(self, html):
    ''' Only get elite proxies from checkerproxy

    Scans every <tr> whose len() is exactly 19 and walks its <td> cells.
    A row contributes its "ip:port" string once a cell starting with an
    ip:port pattern has been seen and a cell containing 'Elite' has been
    seen; the rest of that row is then skipped.

    Returns a list of "ip:port" strings.
    '''
    # Raw string: the pattern relies on \d and \. which are invalid escape
    # sequences in a plain string literal.  Compiled once, outside the loops.
    ip_port_pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}:\d{1,5}')

    ips = []
    soup = BeautifulSoup(html)
    for tr in soup.findAll('tr'):
        if len(tr) != 19:
            continue
        ip_port = None
        elite = False
        for td in tr.findAll('td'):
            cell_text = td.text
            # The pattern itself requires a ':', so cells without one simply
            # do not match; no separate "ip found" flag is needed.
            match = ip_port_pattern.match(cell_text)
            if match:
                ip_port = match.group()
            if 'Elite' in cell_text:
                elite = True
            if ip_port is not None and elite:
                ips.append(str(ip_port))
                break
    return ips
示例5: sanitize_html
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def sanitize_html(value):
    """Strip disallowed tags, attributes and comments from an HTML fragment.

    Args:
        value: HTML markup to clean; any falsy value yields ''.

    Returns:
        The cleaned markup as text. Tags not named in ``ALLOWED_TAGS`` are
        hidden (their contents are kept), only the ``href``, ``src``,
        ``target`` and ``alt`` attributes survive, every ``javascript:``
        occurrence is removed, and newlines become paragraph breaks inside an
        enclosing ``<p>...</p>``. Markup containing a ``div`` of class
        ``not-sanitize`` is returned re-rendered but otherwise untouched.
    """
    import re  # local import so the block does not depend on file-level imports

    if not value:
        return ''

    valid_tags = ALLOWED_TAGS.split()
    valid_attrs = 'href src target alt'.split()
    soup = BeautifulSoup(value)

    # NOTE(review): security -- any input can opt out of sanitization simply
    # by containing <div class="not-sanitize">. Left as-is because callers may
    # rely on it; confirm that only trusted content can reach this branch.
    if soup.find('div', 'not-sanitize'):
        return soup.renderContents().decode('utf8')

    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs]

    rendered = soup.renderContents().decode('utf8')

    # A single case-sensitive replace is defeated by "JavaScript:" or by a
    # nested payload such as "javajavascript:script:", which collapses back
    # into "javascript:" after one pass. Strip case-insensitively and repeat
    # until nothing changes.
    # NOTE(review): entity-encoded or whitespace-split schemes are still not
    # covered; validating href/src values would be the thorough fix.
    scheme = re.compile(r'javascript:', re.IGNORECASE)
    previous = None
    while previous != rendered:
        previous = rendered
        rendered = scheme.sub('', rendered)

    return '<p>' + rendered.replace("\n", '</p><p>') + '</p>'
示例6: get_soup
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_soup(content):
    """Parse ``content`` and return the soup only if its title looks right.

    Args:
        content: Page markup to parse.

    Returns:
        The BeautifulSoup object when the first <title> element contains the
        expected Addic7ed marker text; ``False`` when the title does not match
        or the content cannot be parsed / has no <title>.
    """
    # check if page content can be used
    pattern = "subtitles from the source! - Addic7ed.com"
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
    except Exception:
        # ``Exception`` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False

    if pattern in title:
        return soup
    log("bad page, maybe index after 404")
    return False
示例7: get_soup
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_soup(content):
    """Parse ``content`` and return the soup only if its title looks right.

    Args:
        content: Page markup to parse.

    Returns:
        The BeautifulSoup object when the first <title> element contains the
        expected TVsubtitles marker text; ``False`` when the title does not
        match or the content cannot be parsed / has no <title>.
    """
    # check if page content can be used
    pattern = "TVsubtitles.net - "
    try:
        soup = BeautifulSoup(content)
        title = str(soup.findAll("title")[0])
    except Exception:
        # ``Exception`` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        log("badly formatted content")
        if self_notify:
            xbmc.executebuiltin((u'Notification(%s,%s,%s,%s)' % (__addonname__, __language__(30009), 750, __icon__)).encode('utf-8', 'ignore'))
        return False

    if pattern in title:
        return soup
    log("bad page, maybe index after 404")
    return False
示例8: ubuntu_url
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def ubuntu_url(start_date, end_date):
    """
    Args:
        start_date (date object): Starting date from which logs need to be fetched
        end_date (date object) : Last date for which logs need to be fetched
    Returns:
        Yields channel name, current_date, and url at which log for returned
        channel and current_date is present.
    """
    # The dot is escaped so it matches a literal ".txt" rather than any
    # character followed by "txt"; compiled once instead of once per day.
    txt_href = re.compile(r"\.txt")

    for current_date in rrule(freq=DAILY, dtstart=start_date, until=end_date):
        url = UBUNTU_ENDPOINT.format(current_date.year, month=current_date.month, day=current_date.day)
        soup = BeautifulSoup(send_request(url))
        for link in soup.findAll(href=txt_href):
            channel = link.string
            if channel is None:
                # The link has no single text child, so there is no channel
                # name to slice; skip it instead of raising TypeError.
                continue
            # The channel URL uses the name without its first character
            # (presumably the leading '#'; verify against the index page).
            channel_ = channel[1:]
            yield channel, current_date, UBUNTU_CHANNEL_ENDPOINT.format(current_date.year, month=current_date.month, day=current_date.day, channel=channel_)
示例9: run
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def run(self):
    """Query urlvoid.com for the artifact and record the sites that flag it.

    When the page shows the warning callout, ``artifact['data']['urlvoid']``
    is reset to a dict mapping each alerting site to its link. Nothing is
    recorded when the request fails or the info callout is present. Any
    exception is reported through ``warning`` instead of propagating.
    """
    scan_url = 'http://urlvoid.com/scan/%s/' % self.artifact['name']
    try:
        status, response = get(scan_url, headers=self.headers)
        if not status:
            return

        page = BeautifulSoup(response.text)
        # The info callout takes precedence: nothing to record.
        if page.findAll('div', attrs={'class': 'bs-callout bs-callout-info'}):
            return
        if not page.findAll('div', attrs={'class': 'bs-callout bs-callout-warning'}):
            return

        self.artifact['data']['urlvoid'] = {}
        for alert_img in page.findAll('img', alt='Alert'):
            site = alert_img.parent.parent.td.text.lstrip()
            link = alert_img.parent.a['href']
            self.artifact['data']['urlvoid'][site] = link
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
示例10: ip
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def ip(self):
    """Fetch the bgp.he.net page for the artifact and read its DNS tab.

    Any exception is reported through ``warning`` instead of propagating.
    """
    url = 'http://bgp.he.net/ip/%s#_dns' % self.artifact['name']
    headers = {'User-Agent': 'OSINT Omnibus (https://github.com/InQuest/Omnibus)'}
    try:
        status, response = get(url, headers=headers)
        if not status:
            return

        page = BeautifulSoup(response.text)
        dns_nodes = page.findAll(attrs={'id': 'dns', 'class': 'tabdata hidden'})
        # NOTE(review): the collected text is never stored on the artifact
        # nor returned, so this lookup currently has no observable result --
        # confirm where it was meant to go.
        result = [node.text.strip() for node in dns_nodes]
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
示例11: run
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def run(self):
    """Query ipvoid.com for the artifact and record the sites that flag it.

    When the page shows the danger label and at least one alert image,
    ``artifact['data']['ipvoid']`` is set to a dict mapping every alerting
    site to its link. Nothing is recorded when the request fails, the
    success label is present, or no alert is listed. Any exception is
    reported through ``warning`` instead of propagating.
    """
    url = 'http://www.ipvoid.com/scan/%s/' % self.artifact['name']
    try:
        status, response = get(url, headers=self.headers)
        if not status:
            return

        data = BeautifulSoup(response.text)
        if data.findAll('span', attrs={'class': 'label label-success'}):
            return
        if not data.findAll('span', attrs={'class': 'label label-danger'}):
            return

        # Accumulate all alerts. Previously the dict was reassigned on every
        # iteration, so only the last alerting site was kept.
        detections = {}
        for each in data.findAll('img', alt='Alert'):
            site = each.parent.parent.td.text.lstrip()
            detections[site] = each.parent.a['href']
        if detections:
            self.artifact['data']['ipvoid'] = detections
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
示例12: update_planet_fleet
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def update_planet_fleet(self, planet):
    """Refresh ``planet.ships`` from the planet's fleet page.

    Args:
        planet: Object whose fleet page is opened and whose ``ships``
            attribute is replaced.

    For every ``(key, value)`` pair in ``self.SHIPS`` the element with id
    ``'button' + value`` is looked up and the text following its
    ``span.textlabel`` is read as the count. A ship whose count cannot be
    found or parsed is recorded as 0.
    """
    resp = self.br.open(self._get_url('fleet', planet))
    soup = BeautifulSoup(resp)
    ships = {}
    for k, v in self.SHIPS.iteritems():
        try:
            button = soup.find(id='button' + v)
            label = button.find('span', 'textlabel')
            # Dots are removed before conversion (presumably thousands
            # separators, e.g. "1.250" -- verify against the page).
            available = int(label.nextSibling.replace('.', ''))
        except Exception:
            # Best effort: a missing or malformed entry counts as zero.
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.
            available = 0
        ships[k] = available
    planet.ships = ships
示例13: get_player_status
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def get_player_status(self, destination, origin_planet=None):
    """Look up the player occupying ``destination`` in the galaxy view.

    Args:
        destination: Colon-separated "galaxy:system:position" string; a falsy
            value makes the method return None immediately.
        origin_planet: Planet used to build the request URL; defaults to
            ``self.get_closest_planet(destination)``.

    Returns:
        A dict with ``'name'`` (text of the span inside the ``playername``
        cell) and ``'inactive'`` (whether 'inactive' occurs in that cell's
        class attribute), or None for a falsy destination.
    """
    if not destination:
        return

    origin_planet = origin_planet or self.get_closest_planet(destination)
    galaxy, system, position = destination.split(':')

    response = self.br.open(
        self._get_url('galaxyCnt', origin_planet),
        data=urlencode({'galaxy': galaxy, 'system': system}),
    )
    soup = BeautifulSoup(response)
    # NOTE(review): the result of this lookup is discarded; the row search
    # below runs on the whole document. Kept to preserve behavior.
    soup.find(id='galaxytable')

    rows = soup.findAll('tr', {'class': 'row'})
    # ``position`` is 1-based in the destination string.
    player_cell = rows[int(position) - 1].find('td', 'playername')
    return {
        'name': player_cell.find('span').text,
        'inactive': 'inactive' in player_cell.get('class', ''),
    }
示例14: spider_image
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def spider_image(url):
    """Fetch ``url`` and extract the wallpaper image address and its size.

    Args:
        url: Page address to download.

    Returns:
        A dict with ``'url'`` (the ``src`` of the first <img> whose class
        matches "wall") and, when a ``div`` of class ``l1`` sits directly
        inside an <a>, ``'size'`` (that div's text).

    Raises:
        IndexError: if the page has no matching <img>.
    """
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)

    # Close the HTTP response even if reading fails; it was previously leaked.
    response = urllib2.urlopen(request)
    try:
        markup = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(markup)
    img = soup.findAll("img", attrs={"class": re.compile("wall")}, limit=1)
    result = {'url': img[0]['src']}
    for size in soup.findAll("div", "l1"):
        if size.parent.name == "a":
            result['size'] = size.text
            break
    return result
示例15: getLinks
# 需要导入模块: import BeautifulSoup [as 别名]
# 或者: from BeautifulSoup import BeautifulSoup [as 别名]
def getLinks():
    """Append the href of every anchor on http://www.example.com to ``linkArray``.

    ``linkArray`` is a list defined outside this function; it is extended in
    place and its resulting length is printed.
    """
    req = urllib2.urlopen('http://www.example.com')
    # Close the HTTP response even if reading fails; it was previously leaked.
    try:
        markup = req.read()
    finally:
        req.close()

    soup = BeautifulSoup(markup)
    linkArray.extend(link.get('href') for link in soup.findAll('a'))
    print(len(linkArray))