This article collects typical usage examples of the BeautifulSoup class from the Python module imdb.parser.http.bsouplxml._bsoup. If you are wondering what the BeautifulSoup class does, or how to use it, the curated examples below may help.
The following 14 code examples of the BeautifulSoup class are shown, sorted by popularity by default.
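Nearly all of the examples share one pattern: fetch an HTML page, parse it (optionally pre-filtered with SoupStrainer for speed), then navigate the tree with find/findAll. Here is a minimal, self-contained sketch of that pattern in the same Python 2 style as the examples; the sample HTML is made up, and it assumes SoupStrainer is importable from the same vendored module, as it is in stock BeautifulSoup 3:

# Minimal sketch (Python 2). Assumptions: the vendored module also exports
# SoupStrainer, as stock BeautifulSoup 3 does; the HTML string is illustrative.
from imdb.parser.http.bsouplxml._bsoup import BeautifulSoup, SoupStrainer

data = '<table class="chart"><tr><td><a href="/title/123">Movie</a></td></tr></table>'

# Full parse, then navigate the tree.
soup = BeautifulSoup(data)
table = soup.find('table', {'class': 'chart'})
for a in table.findAll('a'):
    print a['href'], a.contents[0]

# Cheaper partial parse: only <table> tags are built into the tree.
tables = SoupStrainer('table')
partial = BeautifulSoup(data, parseOnlyThese = tables)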
Example 1: getItems
def getItems(self, data):
    results = []
    soup = BeautifulSoup(data)
    table = soup.find("table", { "class" : "chart" })
    try:
        for tr in table.findAll("tr"):
            item = {}
            for td in tr.findAll('td'):
                # Get title and ID from <a>
                if td.a and not td.a.img:
                    item['id'] = int(td.a['href'].split('/')[-1])
                    item['name'] = str(td.a.contents[0])
                # Get year from <td>
                if not td.h3 and not td.a:
                    if len(td.contents) == 1:
                        for y in td.contents:
                            try:
                                item['year'] = int(y)
                            except ValueError:
                                pass
            if item:
                results.append(item)
    except AttributeError:
        log.error('No search results.')
    return results
Example 2: checkForUpdateWindows
def checkForUpdateWindows(self):
    try:
        data = urllib2.urlopen(self.downloads, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % self.downloads)
        return False

    try:
        tables = SoupStrainer('table')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultTable = html.find('table', attrs = {'id':'s3_downloads'})
        latestUrl = 'http://github.com' + resultTable.find('a')['href'].replace(' ', '%20')

        try:
            latest = urllib2.urlopen(latestUrl, timeout = self.timeout)
        except (IOError, URLError):
            log.error('Failed to open %s.' % latestUrl)
            return False

        downloadUrl = latest.geturl()

        # Already on the latest release
        if 'r' + str(version.windows) in downloadUrl:
            return False

        return downloadUrl
    except AttributeError:
        log.debug('Nothing found.')
        return False
Example 3: checkForUpdateWindows
def checkForUpdateWindows(self):
    try:
        data = urllib2.urlopen(self.downloads, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % self.downloads)
        return False

    try:
        html = BeautifulSoup(data)
        results = html.findAll('a', attrs = {'href':re.compile('/downloads/')})

        downloadUrl = None
        for link in results:
            if 'windows' in str(link.parent).lower():
                downloadUrl = 'http://github.com' + link.get('href').replace(' ', '%20')
                break

        # No Windows download linked on the page
        if not downloadUrl:
            return False

        # Already on the latest release
        if 'r' + str(version.windows) in downloadUrl:
            return False

        return downloadUrl
    except AttributeError:
        log.debug('Nothing found.')
        return False
Example 4: findViaAlternative
def findViaAlternative(self, movie):
    results = {'480p':[], '720p':[], '1080p':[]}

    arguments = urlencode({
        's': movie
    })
    url = "%s?%s" % (self.backupUrl, arguments)
    log.info('Searching %s' % url)

    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % url)
        return results

    try:
        tables = SoupStrainer('div')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultTable = html.findAll('h2', text = re.compile(movie))
        for h2 in resultTable:
            if 'trailer' in h2.lower():
                parent = h2.parent.parent.parent
                trailerLinks = parent.findAll('a', text = re.compile('480p|720p|1080p'))
                try:
                    for trailer in trailerLinks:
                        results[trailer].insert(0, trailer.parent['href'])
                except:
                    pass
    except AttributeError:
        log.debug('No trailers found in via alternative.')

    return results
Example 5: findByProvider
def findByProvider(self, data, provider):
    results = {'480p':[], '720p':[], '1080p':[]}
    try:
        tables = SoupStrainer('table')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultTable = html.find('table', attrs = {'class':'bottomTable'})
        for tr in resultTable.findAll('tr'):
            trtext = str(tr).lower()
            if 'clips' in trtext:
                break
            if 'trailer' in trtext and not 'clip' in trtext and provider in trtext:
                nr = 0
                resolutions = tr.findAll('td', attrs = {'class':'bottomTableResolution'})
                #sizes = tr.findNext('tr').findAll('td', attrs = {'class':'bottomTableFileSize'})
                for res in resolutions:
                    results[str(res.a.contents[0])].insert(0, res.a['href'])
                    #int(sizes[nr].contents[0].replace('MB', ''))
                    nr += 1
        return results
    except AttributeError:
        log.debug('No trailers found in provider %s.' % provider)
        results['404'] = True
        return results
Example 6: find
def find(self, movie, quality, type):
    results = []
    if not self.enabled() or not self.isAvailable(self.searchUrl):
        return results

    url = self.searchUrl % quote_plus(self.toSearchString(movie.name + ' ' + quality))
    log.info('Searching: %s' % url)
    data = urllib.urlopen(url)

    try:
        tables = SoupStrainer('table')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultTable = html.find('table', attrs = {'class':'requests'})
        for result in resultTable.findAll('tr', attrs = {'class':'req_filled'}):
            new = self.feedItem()

            id = result.find('td', attrs = {'class':'reqid'})
            new.id = id.contents[0]
            name = result.find('td', attrs = {'class':'release'})
            new.name = self.toSaveString(name.contents[0])
            new.size = 9999
            new.content = 'x264'
            new.type = 'nzb'
            new.url = self.downloadUrl % (new.id)
            new.date = time.time()
            new.score = self.calcScore(new, movie)

            if self.isCorrectMovie(new, movie, type):
                results.append(new)
                log.info('Found: %s' % new.name)
        return results
    except AttributeError:
        log.debug('No search results found.')
        return results
Example 7: getItems
def getItems(self, data):
    results = []
    soup = BeautifulSoup(data)
    table = soup.find("table", { "class" : "filmSubtitleList" })
    try:
        for tr in table.findAll("tr"):
            item = {}
            for td in tr.findAll('td'):
                if td.a:
                    spans = td.a.findAll('span')
                    if len(spans) == 2:
                        item['id'] = int(spans[1].get('id').replace('r', ''))
                        item['name'] = str(spans[1].contents[0]).strip()
                        item['rating'] = int(spans[0].get('class', '0').replace('r', ''))

                        # Language
                        lang = str(spans[0].contents[0]).strip()
                        item['language'] = self.languages.get(lang, lang)
                if td.div:
                    item['hi'] = td.div.get('id') == 'imgEar'
            if item.get('name'):
                results.append(item)
    except AttributeError:
        log.error('No search results.')
    return results
Example 8: getDetails
def getDetails(self, id):
    url = self.detailUrl + str(id)
    log.info('Scanning %s.' % url)

    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % url)
        return False

    # Search for theater release
    theaterDate = 0
    try:
        theaterLink = SoupStrainer('a', href = re.compile('/month_theaters.html\?'))
        theater = BeautifulSoup(data, parseOnlyThese = theaterLink)
        theaterDate = int(time.mktime(parse(theater.a.contents[0]).timetuple()))
    except AttributeError:
        log.debug('No Theater release info found.')

    # Search for dvd release date
    dvdDate = 0
    try:
        try:
            dvdLink = SoupStrainer('a', href = re.compile('/month_video.html\?'))
            dvd = BeautifulSoup(data, parseOnlyThese = dvdLink)
            dvdDate = int(time.mktime(parse(dvd.a.contents[0]).timetuple()))
        except:
            pass

        # Try left column
        if not dvdDate:
            dvdReleases = SoupStrainer('p', text = re.compile('Released'))
            dvd = BeautifulSoup(data, parseOnlyThese = dvdReleases)
            for date in dvd:
                foundDate = int(time.mktime(parse(date.replace('Released', '')).timetuple()))
                dvdDate = foundDate if foundDate > dvdDate else dvdDate
    except AttributeError:
        log.debug('No DVD release info found.')

    # Does it have blu-ray release?
    bluray = []
    try:
        bees = SoupStrainer('b')
        soup = BeautifulSoup(data, parseOnlyThese = bees)
        bluray = soup.findAll('b', text = re.compile('Blu-ray'))
    except AttributeError:
        log.info('No Bluray release info found.')

    dates = {
        'id': id,
        'dvd': dvdDate,
        'theater': theaterDate,
        'bluray': len(bluray) > 0
    }
    log.debug('Found: %s' % dates)
    return dates
Example 9: getInfo
def getInfo(self, url):
    log.debug('Getting info: %s' % url)
    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except IOError:
        log.error('Failed to open %s.' % url)
        return ''

    div = SoupStrainer('div')
    html = BeautifulSoup(data, parseOnlyThese = div)
    html = html.find('div', attrs = {'class':'nfo'})
    return str(html).decode("utf-8", "replace")
Example 10: getInfo
def getInfo(self, url):
    log.debug('Getting info: %s' % url)
    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except IOError:
        log.error('Failed to open %s.' % url)
        return ''

    html = BeautifulSoup(data)
    movieInformation = html.find('div', attrs = {'class':'i_info'})
    return str(movieInformation).decode("utf-8", "replace")
Example 11: find
def find(self, movie, quality, type):
    results = []
    if not self.enabled() or not self.isAvailable(self.searchUrl):
        return results

    url = self.searchUrl % quote_plus(self.toSearchString(movie.name + ' ' + quality))
    log.info('Searching: %s' % url)

    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % url)
        return results

    try:
        tables = SoupStrainer('table')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultable = html.find('table', attrs = {'class':'t'})
        for result in resultable.findAll('span', attrs = {'class':'cname'}):
            new = self.feedItem()

            a = result.find('a')
            id = re.search('(?<=detail\?c\=)\w+', a['href'])
            new.id = id.group(0)

            # Concatenate all text nodes of the link into one name
            words = ''
            for text in a.findAll(text = True):
                words = words + unicode(text).encode('utf-8')
            new.name = words

            new.size = 9999
            new.content = 'mysterbin'
            new.type = 'nzb'
            new.url = self.downloadUrl % (new.id)
            new.date = time.time()
            new.score = self.calcScore(new, movie)

            if self.isCorrectMovie(new, movie, type):
                results.append(new)
                log.info('Found: %s' % new.name)
        return results
    except AttributeError:
        log.debug('No search results found.')

    return results
Example 12: find
def find(self, movie, quality, type):
    results = []
    if not self.enabled() or not self.isAvailable(self.apiUrl):
        return results

    url = self.apiUrl % (quote_plus(self.toSearchString(movie.name + ' ' + quality) + self.makeIgnoreString(type)), self.getCatId(type))
    log.info('Searching: %s' % url)

    try:
        data = urllib2.urlopen(url, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % url)
        return results

    try:
        tables = SoupStrainer('table')
        html = BeautifulSoup(data, parseOnlyThese = tables)
        resultTable = html.find('table', attrs = {'id':'searchResult'})
        for result in resultTable.findAll('tr'):
            details = result.find('a', attrs = {'class':'detLink'})
            if details:
                href = re.search('/(?P<id>\d+)/', details['href'])
                id = href.group('id')
                name = self.toSaveString(details.contents[0])
                desc = result.find('font', attrs = {'class':'detDesc'}).contents[0].split(',')
                date = ''
                size = 0
                for item in desc:
                    # Weird date stuff
                    if 'uploaded' in item.lower():
                        date = item.replace('Uploaded', '')
                        date = date.replace('Today', '')

                        # Do something with yesterday
                        yesterdayMinus = 0
                        if 'Y-day' in date:
                            date = date.replace('Y-day', '')
                            yesterdayMinus = 86400

                        # Normalise the non-breaking spaces used in the date column
                        datestring = date.replace('&nbsp;', ' ').strip()
                        date = int(time.mktime(parse(datestring).timetuple())) - yesterdayMinus
                    # size
                    elif 'size' in item.lower():
                        size = item.replace('Size', '')

                seedleech = []
                for td in result.findAll('td'):
                    try:
                        seedleech.append(int(td.contents[0]))
                    except ValueError:
                        pass

                seeders = 0
                leechers = 0
                if len(seedleech) == 2 and seedleech[0] > 0 and seedleech[1] > 0:
                    seeders = seedleech[0]
                    leechers = seedleech[1]

                # to item
                new = self.feedItem()
                new.id = id
                new.type = 'torrent'
                new.name = name
                new.date = date
                new.size = self.parseSize(size)
                new.seeders = seeders
                new.leechers = leechers
                new.url = self.downloadLink(id, name)
                new.score = self.calcScore(new, movie) + self.uploader(result) + (seeders / 10)

                if seeders > 0 and (new.date + (int(self.conf('wait')) * 60 * 60) < time.time()) and Qualities.types.get(type).get('minSize') <= new.size:
                    new.detailUrl = self.detailLink(id)
                    new.content = self.getInfo(new.detailUrl)
                    if self.isCorrectMovie(new, movie, type):
                        results.append(new)
                        log.info('Found: %s' % new.name)
        return results
    except AttributeError:
        log.debug('No search results found.')

    return []
Example 13: download
def download(self, subtitle):
    subtitle = subtitle['subtitles'].pop()
    url = self.downloadUrl % subtitle['id']

    try:
        data = self.urlopen(url, timeout = self.timeout).read()
    except (IOError, URLError):
        log.error('Failed to open %s.' % url)
        return False

    soup = BeautifulSoup(data)
    postUrl = self.siteUrl + soup.find("a", {'id' : 's_lc_bcr_downloadLink' }).get('href').split('"')[-2]
    typeId = soup.find("input", {"name" : "typeId" }).get('value')
    params = urllib.urlencode({
        '__EVENTTARGET': 's$lc$bcr$downloadLink',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': soup.find("input", {"id" : "__VIEWSTATE" }).get('value'),
        '__PREVIOUSPAGE': soup.find("input", { "id" : "__PREVIOUSPAGE" }).get('value'),
        'subtitleId': soup.find("input", {"id" : "subtitleId" }).get('value'),
        'typeId': typeId,
        'filmId': soup.find("input", {"name" : "filmId" }).get('value')
    })

    # No unrarring yet
    if 'rar' in typeId:
        log.error('Unrar not supported yet.')
        return False

    req = urllib2.Request(postUrl, headers = {
        'Referer' : url,
        'User-Agent' : 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8'
    })

    subtitleFiles = []
    try:
        self.wait()
        data = urllib2.urlopen(req, params)
        self.lastUse = time.time()

        hash = hashlib.md5(url).hexdigest()
        tempdir = cherrypy.config.get('cachePath')
        tempSubtitleFile = os.path.join(tempdir, hash + '.' + typeId)

        # Remove the old
        if os.path.isfile(tempSubtitleFile):
            os.remove(tempSubtitleFile)

        with open(tempSubtitleFile, 'wb') as f:
            f.write(data.read())

        if 'zip' in typeId:
            zip = ZipFile(tempSubtitleFile)
            extract = []
            for name in zip.namelist():
                for ext in self.extensions:
                    if ext.replace('*', '') in name:
                        subtitleFiles.append(os.path.join(tempdir, name))
                        extract.append(name)
            zip.extractall(tempdir, extract)
            os.remove(tempSubtitleFile)
        else:
            subtitleFiles.append(tempSubtitleFile)

        log.info('Subtitle download "%s" finished. %dKB.' % (subtitle['name'], int(data.info().getheaders("Content-Length")[0]) / 1024))
        return subtitleFiles
    except:
        log.error('Subtitle download %s failed.' % subtitle['name'])
        return False
Example 14: urlencode
# Fragment of a findViaAlternative-style search (compare Example 4);
# `results`, `movie`, `self.backupUrl` and `self.timeout` come from the
# enclosing method. Note the old Python 2 `except ..., e` syntax.
arguments = urlencode({
    's': movie
})
url = "%s?%s" % (self.backupUrl, arguments)
log.debug('Searching %s' % url)

try:
    data = urllib2.urlopen(url, timeout = self.timeout).read()
except (IOError, URLError), e:
    log.debug('Failed to open %s. %s' % (url, e))
    return results

try:
    tables = SoupStrainer('div')
    html = BeautifulSoup(data, parseOnlyThese = tables)
    resultTable = html.findAll('h2', text = re.compile(movie))
    for h2 in resultTable:
        if 'trailer' in h2.lower():
            parent = h2.parent.parent.parent
            trailerLinks = parent.findAll('a', text = re.compile('480p|720p|1080p'))
            try:
                for trailer in trailerLinks:
                    results[trailer].insert(0, trailer.parent['href'])
            except:
                pass
except AttributeError:
    log.debug('No trailers found in via alternative.')