本文整理汇总了Python中salts_lib.scraper_utils.cleanse_title函数的典型用法代码示例。如果您正苦于以下问题:Python cleanse_title函数的具体用法?Python cleanse_title怎么用?Python cleanse_title使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cleanse_title函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: search
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site for *title* and return matching results.

    For TV shows, TAGS links inside each post are harvested (deduplicated
    by URL); for movies, post headings are parsed and filtered by
    normalized-title overlap and, when given, by *year*.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = '/search/' + urllib.quote_plus(title)
    html = self._http_get(search_url, require_debrid=True, cache_limit=1)
    if video_type == VIDEO_TYPES.TVSHOW:
        seen_urls = {}
        for _attr, post in dom_parser2.parse_dom(html, 'div', {'id': re.compile(r'post-\d+')}):
            if CATEGORIES[video_type] not in post: continue
            match = re.search(r'<span>\s*TAGS:\s*</span>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', post, re.I)
            if match:
                show_url, match_title = match.groups()
                # the same show can be tagged in several posts; keep the first
                if show_url in seen_urls: continue
                result = {'url': scraper_utils.pathify_url(show_url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
                seen_urls[show_url] = result
                results.append(result)
    elif video_type == VIDEO_TYPES.MOVIE:
        norm_title = scraper_utils.normalize_title(title)
        headings = re.findall(r'<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = [result.content for result in dom_parser2.parse_dom(html, 'div', {'id': re.compile(r'post-\d+')})]
        for heading, post in zip(headings, posts):
            if CATEGORIES[video_type] not in post or self.__too_old(post): continue
            post_url, post_title = heading
            meta = scraper_utils.parse_movie_link(post_title)
            full_title = '%s [%s] (%sp)' % (meta['title'], meta['extra'], meta['height'])
            match_year = meta['year']
            match_norm_title = scraper_utils.normalize_title(meta['title'])
            # accept a match when either normalized title contains the other
            if (match_norm_title in norm_title or norm_title in match_norm_title) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(post_url), 'title': scraper_utils.cleanse_title(full_title), 'year': match_year}
                results.append(result)
    return results
示例2: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    If the response is a results page, each table cell is parsed for a
    title/year and a /watch/ link; otherwise a ``window.location``
    redirect indicates a single direct match.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/index.php?search=%s&image.x=0&image.y=0')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    # Are we on a results page?
    if not re.search(r'window\.location', html):
        pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            match_title_year, match_url = match.groups('')
            # skip porn
            if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year: continue
            match_title_year = re.sub('</?.*?>', '', match_title_year)  # strip html tags
            match = re.search(r'(.*?)\s+\(?(\d{4})\)?', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': match_url, 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    else:
        # the site redirected us straight to a single match
        match = re.search(r'window\.location\s+=\s+"([^"]+)', html)
        if match:
            url = match.group(1)
            if url != 'movies.php':
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': year}
                results.append(result)
    return results
示例3: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Variant of the index.php search that passes the query as request
    params and uses ``scraper_utils.extra_year`` to split title/year.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/index.php')
    params = {'search': title, 'image.x': 0, 'image.y': 0}
    html = self._http_get(search_url, params=params, cache_limit=1)
    # Are we on a results page?
    if not re.search(r'window\.location', html):
        pattern = '<td[^>]+class="movieText"[^>]*>(.*?)</p>.*?href="(/watch/[^"]+)'
        for match in re.finditer(pattern, html, re.DOTALL):
            match_title_year, match_url = match.groups('')
            # skip porn
            if '-XXX-' in match_url.upper() or ' XXX:' in match_title_year: continue
            match_title_year = re.sub('</?.*?>', '', match_title_year)  # strip html tags
            match_title, match_year = scraper_utils.extra_year(match_title_year)
            if not year or not match_year or year == match_year:
                result = {'url': match_url, 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    else:
        # the site redirected us straight to a single match
        match = re.search(r'window\.location\s+=\s+"([^"]+)', html)
        if match:
            url = match.group(1)
            if url != 'movies.php':
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(title), 'year': year}
                results.append(result)
    return results
示例4: __get_ok
def __get_ok(self, embed, flashvars):
    """Resolve an ok.ru embed into a hoster entry.

    Pulls the metadata URL out of the flashvars value, fetches it with
    the embed's data attribute as the referer, and reads the stream URL
    from the returned JSON. Returns a (possibly empty) list of hosters.
    """
    hosters = []
    flash_value = flashvars[0].attrs['value']
    meta_match = re.search('metadataUrl=([^"]+)', flash_value)
    if not meta_match:
        return hosters
    referer = scraper_utils.cleanse_title(urllib.unquote(embed[0].attrs['data']))
    ok_url = scraper_utils.cleanse_title(urllib.unquote(meta_match.group(1)))
    html = self._http_get(ok_url, data='ok', headers={'Referer': referer}, cache_limit=.25)
    js_data = scraper_utils.parse_json(html, ok_url)
    stream_url = js_data.get('movie', {}).get('url')
    if stream_url is not None:
        host = urlparse.urlparse(stream_url).hostname
        hosters.append({'multi-part': False, 'host': host, 'class': self, 'quality': QUALITIES.HD720, 'views': None, 'rating': None, 'url': stream_url, 'direct': False, 'subs': 'Turkish Subtitles'})
    return hosters
示例5: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Fix: the original type filter read ``... or is_season and
    VIDEO_TYPES.SEASON`` — ``VIDEO_TYPES.SEASON`` is a truthy constant,
    so any "Season N" row passed even during a MOVIE search. It now
    compares ``video_type == VIDEO_TYPES.SEASON`` (matching the correct
    form used by the sibling scrapers).

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search/%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'name_top'}):
        match = re.search('href="([^"]+)[^>]+>([^<]+)', item)
        if match:
            url, match_title_year = match.groups()
            is_season = re.search(r'Season\s+(\d+)', match_title_year, re.IGNORECASE)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                else:
                    match_title = match_title_year
                    # only keep the requested season number
                    if season and int(is_season.group(1)) != int(season):
                        continue
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                    results.append(result)
    return results
示例6: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    TV shows are matched against the full /series/all/ listing; other
    types use the site search, honoring the site's "search again in N
    seconds" throttle (capped at ``self.timeout``) by sleeping and
    retrying once with caching disabled.

    Returns a list of dicts with 'url', 'title' and 'year' keys
    ('year' is always empty here).
    """
    results = []
    if video_type == VIDEO_TYPES.TVSHOW:
        url = urlparse.urljoin(self.base_url, '/series/all/')
        html = self._http_get(url, cache_limit=8)
        # NOTE: 'underilne' is the site's own misspelled css class
        links = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')
        titles = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
        items = zip(links, titles)
    else:
        url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
        data = {'q': title, 'go': 'Search'}
        html = self._http_get(url, data=data, cache_limit=8)
        match = re.search('you can search again in (\\d+) seconds', html, re.I)
        if match:
            # respect the site's rate limit, then retry uncached
            wait = int(match.group(1))
            if wait > self.timeout: wait = self.timeout
            time.sleep(wait)
            html = self._http_get(url, data=data, cache_limit=0)
        pattern = 'class="movie_box.*?href="([^"]+).*?<h1>([^<]+)'
        items = re.findall(pattern, html, re.DOTALL)
    norm_title = scraper_utils.normalize_title(title)
    for item in items:
        url, match_title = item
        if norm_title in scraper_utils.normalize_title(match_title):
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
            results.append(result)
    return results
示例7: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Fix: the loop variable was named ``title``, shadowing the *title*
    parameter; it is now ``match_title``.

    Rows tagged ``(TVshow)`` are treated as shows; the tag is stripped
    from the displayed title. Returns a list of dicts with 'url',
    'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/movies.php?list=search&search=')
    search_url += urllib.quote_plus(title)
    cookies = {'onlylanguage': 'en', 'lang': 'en'}
    html = self._http_get(search_url, cookies=cookies, cache_limit=.25)
    results = []
    pattern = r'id="tdmovies">\s*<a\s+href="([^"]+)">([^<]+).*?id="f7">(.*?)</TD>'
    for match in re.finditer(pattern, html, re.DOTALL):
        url, match_title, extra = match.groups('')
        # keep only rows whose show/movie tag matches the requested type
        if (video_type == VIDEO_TYPES.MOVIE and '(TVshow)' in match_title) or (video_type == VIDEO_TYPES.TVSHOW and '(TVshow)' not in match_title):
            continue
        match_title = match_title.replace('(TVshow)', '')
        match_title = match_title.strip()
        r = re.search(r'>(\d{4})<', extra)
        if r:
            match_year = r.group(1)
        else:
            match_year = ''
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results
示例8: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Items with a "status" div are seasons; for movies the year is
    scraped from the URL slug. Matches require normalized-title overlap
    in either direction and, when given, a matching *year*.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser2.parse_dom(html, 'ul', {'class': 'cfv'})
    if not fragment: return results
    norm_title = scraper_utils.normalize_title(title)
    for _attrs, item in dom_parser2.parse_dom(fragment[0].content, 'li'):
        is_season = dom_parser2.parse_dom(item, 'div', {'class': 'status'})
        if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
            match = dom_parser2.parse_dom(item, 'a', req=['href', 'title'])
            if not match: continue
            match_title = match[0].attrs['title']
            match_url = match[0].attrs['href']
            match_year = ''
            if video_type == VIDEO_TYPES.SEASON:
                # only keep the requested season number
                if season and not re.search(r'Season\s+%s$' % (season), match_title, re.I):
                    continue
            else:
                match = re.search(r'-(\d{4})[-.]', match_url)
                if match:
                    match_year = match.group(1)
            match_norm_title = scraper_utils.normalize_title(match_title)
            title_match = (norm_title in match_norm_title) or (match_norm_title in norm_title)
            if title_match and (not year or not match_year or year == match_year):
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results
示例9: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Strips markup and "Watch Movie" noise from the title, then tries a
    trailing "(YYYY)" for the year, falling back to a year span inside
    the result fragment.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    for fragment in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
        name = dom_parser.parse_dom(fragment, 'div', {'class': 'name'})
        if name:
            match = re.search('href="([^"]+)[^>]+>(.*?)</a>', name[0])
            if match:
                match_url, match_title_year = match.groups()
                if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE: continue
                match_title_year = re.sub('</?[^>]*>', '', match_title_year)  # strip html tags
                match_title_year = re.sub(r'[Ww]atch\s+[Mm]ovie\s*', '', match_title_year)
                match_title_year = match_title_year.replace('’', "'")
                match = re.search(r'(.*?)\s+\((\d{4})[^)]*\)$', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not match_year:
                    # fall back to the year span inside the fragment
                    year_span = dom_parser.parse_dom(fragment, 'span', {'class': 'year'})
                    if year_span:
                        year_text = dom_parser.parse_dom(year_span[0], 'a')
                        if year_text:
                            match_year = year_text[0].strip()
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'url': scraper_utils.pathify_url(match_url), 'year': match_year}
                    results.append(result)
    return results
示例10: __alt_search
def __alt_search(self, video_type, title, year, season=''):
    """Alternate site search: build a keyword query from title, year and
    season, then match results by URL slug (``-season-N``, ``-YYYY``).

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    params = title.lower()
    if year: params += ' %s' % (year)
    if video_type == VIDEO_TYPES.SEASON and season:
        params += ' Season %s' % (season)
    params = {'key': params}
    search_url = urlparse.urljoin(self.base_url, '/search')
    html = self._http_get(search_url, params=params, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'caption'}):
        match = re.search('href="([^"]+)[^>]+>(.*?)<span[^>]*>', item)
        if match:
            match_url, match_title = match.groups()
            is_season = re.search(r'-season-\d+', match_url)
            if (video_type == VIDEO_TYPES.MOVIE and not is_season) or (video_type == VIDEO_TYPES.SEASON and is_season):
                if video_type == VIDEO_TYPES.SEASON:
                    # only keep the requested season number
                    if season and not re.search('season-0*%s$' % (season), match_url): continue
                match_title = re.sub('</?[^>]*>', '', match_title)  # strip html tags
                match_title = re.sub(r'\s+Full\s+Movie', '', match_title)
                match = re.search(r'-(\d{4})(?:$|-)', match_url)
                if match:
                    match_year = match.group(1)
                else:
                    match_year = ''
                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                    results.append(result)
    return results
示例11: search
def search(self, video_type, title, year, season=''):
    """Search the site's advanced-search page for *title*.

    TV show / episode searches go through the /tvshow path; the year is
    passed to the site and also re-checked against the parsed result
    title.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = self.base_url
    if video_type in [VIDEO_TYPES.TVSHOW, VIDEO_TYPES.EPISODE]:
        search_url += '/tvshow'
    search_url += '/advanced-search.php?search='
    search_url += urllib.quote_plus(title)
    search_url += '&year=' + urllib.quote_plus(str(year))
    search_url += '&advanced_search=Search'
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for element in dom_parser.parse_dom(html, 'div', {'class': 'list_box_title'}):
        match = re.search(r'href="([^"]+)"\s+title="(?:Watch )?([^"]+)', element)
        if match:
            url, match_title_year = match.groups()
            match = re.search(r'(.*?)(?:\s+\(?\s*(\d{4})\s*\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results
示例12: search
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    """Search the site for *title* and return matching results.

    Rows are ``coverPreview`` table rows; a ``(tvshow)`` tag in the link
    text marks shows. The year is taken from the last 4-digit div inside
    the row.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/movies.php')
    cookies = {'onlylanguage': 'en', 'lang': 'en'}
    params = {'list': 'search', 'search': title}
    html = self._http_get(search_url, params=params, cookies=cookies, cache_limit=8)
    for _attrs, content in dom_parser2.parse_dom(html, 'TR', {'id': re.compile(r'coverPreview\d+')}):
        match = dom_parser2.parse_dom(content, 'a', req='href')
        if not match: continue
        match_url, match_title = match[0].attrs['href'], match[0].content
        is_show = re.search(r'\(tvshow\)', match_title, re.I)
        # keep only rows whose show/movie tag matches the requested type
        if (video_type == VIDEO_TYPES.MOVIE and is_show) or (video_type == VIDEO_TYPES.TVSHOW and not is_show):
            continue
        match_title = match_title.replace('(TVshow)', '')
        match_title = match_title.strip()
        match_year = ''
        for _attrs, div in dom_parser2.parse_dom(content, 'div'):
            match = re.match(r'\s*(\d{4})\s*', div)
            if match:
                match_year = match.group(1)
        if not year or not match_year or year == match_year:
            result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
            results.append(result)
    return results
示例13: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Fixes: (1) the original type filter read ``... or is_season and
    VIDEO_TYPES.SEASON`` — a truthy constant — so any "Season N" thumb
    passed even during a MOVIE search; it now compares ``video_type ==
    VIDEO_TYPES.SEASON``. (2) When the status-year div was absent,
    ``match_year`` was left as ``parse_dom``'s empty list instead of
    ``''``; the year is now only assigned on a non-empty result.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=0)
    results = []
    for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
        match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
        url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
        if match_title and url:
            match_title, url = match_title[0], url[0]
            is_season = re.search(r'Season\s+(\d+)$', match_title, re.I)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    year_frag = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
                    if year_frag:
                        match_year = year_frag[0]
                else:
                    # only keep the requested season number
                    if season and int(is_season.group(1)) != int(season):
                        continue
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results
示例14: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    Episode-style "NxM" titles are skipped; the year comes from
    ``extra_year`` or a year span; a trailing resolution ("720p" etc.)
    is trimmed from the title and any quality badge is appended in
    brackets.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    results = []
    html = self._http_get(self.base_url, params={'s': title}, cache_limit=8)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+)', item)
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        if match and match_title:
            url = match.group(1)
            match_title = match_title[0]
            if re.search(r'\d+\s*x\s*\d+', match_title): continue  # exclude episodes
            match_title, match_year = scraper_utils.extra_year(match_title)
            if not match_year and year_frag:
                match_year = year_frag[0]
            # drop a trailing resolution (e.g. "720p") from the title
            match = re.search(r'(.*?)\s+\d{3,}p', match_title)
            if match:
                match_title = match.group(1)
            extra = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
            if extra:
                match_title += ' [%s]' % (extra[0])
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                results.append(result)
    return results
示例15: search
def search(self, video_type, title, year, season=''):
    """Search the site for *title* and return matching results.

    The query is sanitized to alphanumerics/spaces first. Items with an
    episode-count badge ("mli-eps") are seasons; matched result URLs are
    joined with 'watching.html'.

    Returns a list of dicts with 'url', 'title' and 'year' keys.
    """
    search_url = urlparse.urljoin(self.base_url, '/movie/search/')
    title = re.sub('[^A-Za-z0-9 ]', '', title)
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        match_url = re.search('href="([^"]+)', item, re.DOTALL)
        match_year = re.search(r'class="jt-info">(\d{4})<', item)
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
        if (video_type == VIDEO_TYPES.MOVIE and not is_episodes) or (video_type == VIDEO_TYPES.SEASON and is_episodes):
            if match_title and match_url:
                match_title = match_title[0]
                match_title = re.sub('</?h2>', '', match_title)
                match_title = re.sub(r'\s+\d{4}$', '', match_title)  # drop a trailing year
                if video_type == VIDEO_TYPES.SEASON:
                    # only keep the requested season number
                    if season and not re.search(r'Season\s+%s$' % (season), match_title): continue
                url = urlparse.urljoin(match_url.group(1), 'watching.html')
                match_year = match_year.group(1) if match_year else ''
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                    results.append(result)
    return results