This article collects typical code examples of the urljoin function from Python's urlparse module. If you have been wondering exactly how urljoin is used, what it does, or what real-world urljoin calls look like, the hand-picked examples below may help.
A total of 15 code examples of the urljoin function are shown, ordered by popularity by default.
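Before diving into the examples, it helps to keep urljoin's resolution rules in mind: the second argument is resolved against the first the way a browser resolves a link, so a relative path replaces the last segment of the base unless the base ends in "/", a root-relative path replaces the whole path, and a fully absolute URL is returned as-is. A minimal sketch (Python 2, where the function lives in the urlparse module; in Python 3 it moved to urllib.parse):

import urlparse

print urlparse.urljoin('http://example.com/a/b', 'c')    # http://example.com/a/c
print urlparse.urljoin('http://example.com/a/b/', 'c')   # http://example.com/a/b/c
print urlparse.urljoin('http://example.com/a/b/', '/c')  # http://example.com/c
print urlparse.urljoin('http://example.com/a/b', 'http://other.example/x')  # http://other.example/x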
Example 1: get_sub
def get_sub(self):
    """Fetch the subtitles from addic7ed, using the url stored in the given database (db) for this episode."""
    url_split = urlparse.urlsplit(self.url)
    head, tail = url_split.path.rsplit('/', 1)
    new_path = head, 'addic7ed'
    referer = urlparse.urlunsplit(url_split._replace(path=urlparse.urljoin(*new_path)))
    domain = self.url
    response = urllib2.urlopen(domain)  # open the url
    html = response.read()  # load the html code
    soup = BeautifulSoup(html)  # parse the html code
    links = []
    for x in soup.find_all(class_="buttonDownload"):
        links.append(x.attrs['href'])
    domain = 'http://www.addic7ed.com/'
    urls = []
    for link in links:
        urls.append(urlparse.urljoin(domain, link))
    page = urls[0]
    req = urllib2.Request(page, headers={'User-Agent': 'Mozilla 5.10', 'Referer': referer})
    response = urllib2.urlopen(req)
    data = response.read()
    test = response.info()
    print test
    if response.info().has_key('Content-Disposition'):
        with open(os.path.join(self.db.env.subs_dir, '%s.srt' % self.title), 'wb') as f:
            f.write(data)
    else:
        return response.info()

Example 2: handle_captcha
def handle_captcha(self, response, solver):
    sel = scrapy.Selector(response)
    iframe_src = sel.xpath(self.CAPTCHA_XPATH).extract()[0]
    iframe_url = urljoin(response.url, iframe_src)
    iframe_request = scrapy.Request(iframe_url)
    iframe_response = yield download(self.crawler, iframe_request)
    iframe_sel = scrapy.Selector(iframe_response)
    img_src, = iframe_sel.xpath('//img/@src').extract()[:1] or [None]
    if img_src is None:
        raise DecaptchaError('No //img/@src found on CAPTCHA page')
    img_url = urljoin(iframe_response.url, img_src)
    img_request = scrapy.Request(img_url)
    img_response = yield download(self.crawler, img_request)
    scrapy.log.msg('CAPTCHA image downloaded, solving')
    captcha_text = yield solver.solve(img_response.body)
    scrapy.log.msg('CAPTCHA solved: %s' % captcha_text)
    challenge_request = scrapy.FormRequest.from_response(
        iframe_response, formxpath='//form',
        formdata={'recaptcha_response_field': captcha_text}
    )
    challenge_response = yield download(self.crawler, challenge_request)
    challenge_sel = scrapy.Selector(challenge_response)
    challenge, = challenge_sel.xpath(
        '//textarea/text()'
    ).extract()[:1] or [None]
    if not challenge:
        raise DecaptchaError('Bad challenge from reCAPTCHA API:\n%s' %
                             challenge_response.body)
    scrapy.log.msg('CAPTCHA solved, submitting challenge')
    submit_request = scrapy.FormRequest.from_response(
        response, formxpath='//form[.%s]' % self.CAPTCHA_XPATH,
        formdata={'recaptcha_challenge_field': challenge}
    )
    yield download(self.crawler, submit_request)

Example 3: sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        url = urlparse.urljoin(self.base_link, url)
        for i in range(3):
            result = client.request(url, timeout=10)
            if result is not None: break
        dom = dom_parser.parse_dom(result, 'div', attrs={'class': 'links', 'id': 'noSubs'})
        result = dom[0].content
        links = re.compile('<tr\s*>\s*<td><i\s+class="fa fa-youtube link-logo"></i>([^<]+).*?href="([^"]+)"\s+class="watch', re.DOTALL).findall(result)
        for link in links[:5]:
            try:
                url2 = urlparse.urljoin(self.base_link, link[1])
                for i in range(2):
                    result2 = client.request(url2, timeout=3)
                    if result2 is not None: break
                r = re.compile('href="([^"]+)"\s+class="action-btn').findall(result2)[0]
                valid, hoster = source_utils.is_host_valid(r, hostDict)
                if not valid: continue
                urls, host, direct = source_utils.check_directstreams(r, hoster)
                for x in urls:
                    sources.append({'source': host, 'quality': x['quality'], 'language': 'en',
                                    'url': x['url'], 'direct': direct, 'debridonly': False})
            except:
                # traceback.print_exc()
                pass
        return sources
    except:
        return sources

Example 4: novedades
def novedades(item):
    logger.info("[serieonline.py] novedades")
    # Download the page
    data = scrapertools.cachePage(item.url)
    # Extract the entries
    patronvideos = '<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" alt="([^"]+)" class="captify" /></a>'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    itemlist = []
    for match in matches:
        scrapedtitle = match[1] + " " + match[3]
        scrapedplot = ""
        scrapedurl = urlparse.urljoin(item.url, match[0])
        scrapedthumbnail = urlparse.urljoin(item.url, match[2])
        if DEBUG: logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], thumbnail=[" + scrapedthumbnail + "]")
        # Add to the XBMC listing
        itemlist.append(Item(channel=CHANNELNAME, action="findvideos", title=scrapedtitle, url=scrapedurl,
                             thumbnail=scrapedthumbnail, plot=scrapedplot, folder=True))
    # Extract the pagination link
    patronvideos = '<div class="paginacion-num"><a href="([^"]+)">'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)
    scrapertools.printMatches(matches)
    if len(matches) > 0:
        scrapedtitle = "Página siguiente"
        scrapedurl = urlparse.urljoin(item.url, matches[0])
        itemlist.append(Item(channel=CHANNELNAME, action="novedades", title=scrapedtitle, url=scrapedurl, folder=True))
    return itemlist

Example 5: getL10nRepositories
def getL10nRepositories(changesets, l10nRepoPath, relbranch=None):
    """Parses a list of locale names and revisions for their associated
    repository from the 'changesets' string passed in."""
    # urljoin() will strip the last part of l10nRepoPath if it doesn't end
    # with "/"
    if not l10nRepoPath.endswith('/'):
        l10nRepoPath = l10nRepoPath + '/'
    repositories = {}
    try:
        for locale, data in json.loads(changesets).iteritems():
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': data['revision'],
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    except (TypeError, ValueError):
        for locale, revision in parsePlainL10nChangesets(changesets).iteritems():
            if revision == 'FIXME':
                raise Exception('Found FIXME in changesets for locale "%s"' % locale)
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': revision,
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    return repositories

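The comment at the top of this function points at behaviour worth seeing in isolation: without a trailing slash, urljoin treats the last segment of the base as a resource name and replaces it rather than appending to it. A minimal sketch, using a hypothetical repository path:

import urlparse

l10nRepoPath = 'releases/l10n-central'            # hypothetical value
print urlparse.urljoin(l10nRepoPath, 'de')        # releases/de -- last segment replaced
print urlparse.urljoin(l10nRepoPath + '/', 'de')  # releases/l10n-central/de -- appended
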
Example 6: __init__
def __init__(self, layer, mapfile, fonts=None):
    """ Initialize Mapnik provider with layer and mapfile.

        XML mapfile keyword arg comes from TileStache config,
        and is an absolute path by the time it gets here.
    """
    maphref = urljoin(layer.config.dirpath, mapfile)
    scheme, h, path, q, p, f = urlparse(maphref)
    if scheme in ('file', ''):
        self.mapfile = path
    else:
        self.mapfile = maphref
    self.layer = layer
    self.mapnik = None
    engine = mapnik.FontEngine.instance()
    if fonts:
        fontshref = urljoin(layer.config.dirpath, fonts)
        scheme, h, path, q, p, f = urlparse(fontshref)
        if scheme not in ('file', ''):
            raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
        for font in glob(path.rstrip('/') + '/*.ttf'):
            engine.register_font(str(font))

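The pattern above resolves a possibly relative mapfile against the configuration directory and then keeps only the filesystem path when the result is a file URL. A minimal sketch of that resolution, with hypothetical paths:

from urlparse import urljoin, urlparse

dirpath = 'file:///etc/tilestache/'           # hypothetical config directory
maphref = urljoin(dirpath, 'style.xml')       # file:///etc/tilestache/style.xml
scheme, netloc, path, params, query, fragment = urlparse(maphref)
mapfile = path if scheme in ('file', '') else maphref
print mapfile                                 # /etc/tilestache/style.xml
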
Example 7: processJob
def processJob(jobDetails):
    try:
        job = {}
        url = urljoin(rootUrl, jobDetails.a['href'])
        soup = thisInstitution.getSoup(url)
        subLinks = soup.select('.pinkbox_heading a')
        if subLinks:
            for link in subLinks:
                job['url'] = urljoin(rootUrl, link['href'])
                job['title'] = link.get_text()
                print job['title']
                job["language"] = 'de'
                jobPage = thisInstitution.getSoup(job['url'])
                content = jobPage.find(id='contentblock')
                job['text'] = unicode(content)
                thisInstitution.addRecord(job)
        else:
            job['url'] = url
            job['title'] = jobDetails.a.get_text()
            print job['title']
            job["language"] = 'de'
            content = soup.find(id='contentblock')
            job['text'] = unicode(content)
            thisInstitution.addRecord(job)
    except Exception as e:
        print e
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False

Example 8: parse
def parse(self, response):
    self._logger.info("start response in parse -> response type:%s" % type(response).__name__)
    item_urls = [
        urljoin(response.url, x) for x in list(set(
            response.xpath('//div[@id="resultsCol"]//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract()
        ))
    ]
    self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'],
                                       response.meta['appid'], len(item_urls))
    for item_url in item_urls:
        yield Request(url=item_url,
                      callback=self.parse_item,
                      meta=response.meta)
    workers = response.meta.get('workers', {})
    for worker in workers.keys():
        workers[worker] = 0
    if "if_next_page" in response.meta: del response.meta["if_next_page"]
    next_page_urls = [
        urljoin(response.url, x) for x in list(set(
            response.xpath('//div[@id="pagn"]//span[@class="pagnRA"]/a/@href').extract()
        ))
    ]
    response.meta["if_next_page"] = True
    for next_page_url in next_page_urls:
        yield Request(url=next_page_url,
                      callback=self.parse,
                      meta=response.meta)

Example 9: __search
def __search(self, titles, type, year, season=0, episode=False):
    try:
        years = [str(year), str(int(year) + 1), str(int(year) - 1)]
        years = ['&veroeffentlichung[]=%s' % i for i in years]
        query = self.search_link % (type, urllib.quote_plus(cleantitle.query(titles[0])))
        query += ''.join(years)
        query = urlparse.urljoin(self.base_link, query)
        t = [cleantitle.get(i) for i in set(titles) if i]
        r = self.__proceed_search(query)
        r = [i[0] for i in r if cleantitle.get(i[1]) in t and int(i[2]) == int(season)][0]
        url = source_utils.strip_domain(r)
        if episode:
            r = client.request(urlparse.urljoin(self.base_link, url))
            r = dom_parser.parse_dom(r, 'div', attrs={'class': 'season-list'})
            r = dom_parser.parse_dom(r, 'li')
            r = dom_parser.parse_dom(r, 'a', req='href')
            r = [i.attrs['href'] for i in r if i and int(i.content) == int(episode)][0]
            url = source_utils.strip_domain(r)
        return url
    except:
        return

Example 10: mainlist
def mainlist(item):
    logger.info()
    thumb_series = get_thumb("squares", "thumb_canales_series.png")
    thumb_series_az = get_thumb("squares", "thumb_canales_series_az.png")
    thumb_buscar = get_thumb("squares", "thumb_buscar.png")
    itemlist = []
    itemlist.append(Item(channel=item.channel, title="Listado alfabético", action="series_listado_alfabetico",
                         thumbnail=thumb_series_az))
    itemlist.append(Item(channel=item.channel, title="Todas las series", action="series",
                         url=urlparse.urljoin(HOST, "listado/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Capítulos de estreno", action="homeSection", extra=CAPITULOS_DE_ESTRENO_STR,
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Último actualizado", action="homeSection", extra="Último Actualizado",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series más vistas", action="homeSection", extra="Series Más vistas",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series menos vistas", action="homeSection", extra="Series Menos vistas",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Últimas fichas creadas", action="series",
                         url=urlparse.urljoin(HOST, "fichas_creadas/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Buscar...", action="search", url=HOST, thumbnail=thumb_buscar))
    if filtertools.context:
        itemlist = filtertools.show_option(itemlist, item.channel, list_idiomas, CALIDADES)
    return itemlist

Example 11: parseImgLinks
def parseImgLinks(self, depth=1):
    url_response = None
    try:
        url_response = urllib2.urlopen(self.scrap_url, timeout=self._timeout)
    except Exception as e:
        print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url, e.reason))
        return self.img_list
    html_parse = BeautifulSoup(url_response)
    unique_images_found = 0
    total_images_found = 0
    self.visited[self.scrap_url] = 1
    for img in html_parse.findAll('img'):
        try:
            abs_url = urljoin(self.scrap_url, img['src']) if urlparse(img['src']).netloc == "" else img['src']
            if abs_url not in self.img_list:
                self.img_list.add(abs_url)
                unique_images_found += 1
            total_images_found += 1
        except:
            pass
    print(" [Found %d images / %d new]: %s" % (total_images_found, unique_images_found, self.scrap_url))
    if depth > 1:
        for a in html_parse.findAll('a'):
            try:
                if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
                    self.scrap_url = urljoin(self.scrape_url_orig, a['href'])
                    if self.scrap_url in self.visited: continue
                    self.parseImgLinks(depth - 1)
            except:
                pass
    return self.img_list

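A note on the conditional above: urljoin already returns the second argument unchanged when it carries its own scheme and host, so the netloc check mainly changes the outcome for scheme-relative links such as //cdn.example.com/x.png, which urljoin would resolve against the base scheme instead of leaving untouched. A minimal sketch:

from urlparse import urljoin

page = 'http://example.com/gallery/index.html'
print urljoin(page, 'img/photo.jpg')                 # http://example.com/gallery/img/photo.jpg
print urljoin(page, 'http://cdn.example.com/x.png')  # http://cdn.example.com/x.png (unchanged)
print urljoin(page, '//cdn.example.com/x.png')       # http://cdn.example.com/x.png (scheme added)
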
Example 12: episodios
def episodios(item):
    logger.info("{0} - {1}".format(item.title, item.url))
    itemlist = []
    # Download the page
    data = scrapertools.cache_page(item.url)
    fanart = scrapertools.find_single_match(data, "background-image[^'\"]+['\"]([^'\"]+)")
    plot = scrapertools.find_single_match(data, "id=['\"]profile2['\"]>\s*(.*?)\s*</div>")
    logger.debug("fanart: {0}".format(fanart))
    logger.debug("plot: {0}".format(plot))
    episodes = re.findall("<tr.*?href=['\"](?P<url>[^'\"]+).+?>(?P<title>.+?)</a>.*?<td>(?P<flags>.*?)</td>", data, re.MULTILINE | re.DOTALL)
    for url, title, flags in episodes:
        idiomas = " ".join(["[{0}]".format(IDIOMAS.get(language, "OVOS")) for language in re.findall("banderas/([^\.]+)", flags, re.MULTILINE)])
        displayTitle = "{show} - {title} {languages}".format(show=item.show, title=title, languages=idiomas)
        logger.debug("Episode found {0}: {1}".format(displayTitle, urlparse.urljoin(HOST, url)))
        itemlist.append(item.clone(title=displayTitle, url=urlparse.urljoin(HOST, url),
                                   action="findvideos", plot=plot, fanart=fanart, language=idiomas,
                                   list_idiomas=list_idiomas, list_calidad=CALIDADES, context=filtertools.context))
    if len(itemlist) > 0 and filtertools.context:
        itemlist = filtertools.get_links(itemlist, item.channel)
    if config.get_library_support() and len(itemlist) > 0:
        itemlist.append(item.clone(title="Añadir esta serie a la biblioteca", action="add_serie_to_library", extra="episodios"))
    return itemlist

Example 13: choose_reference
def choose_reference(experiment, biorep_n, server, keypair, sex_specific):
    replicates = [common.encoded_get(urlparse.urljoin(server, rep_uri), keypair, frame='embedded')
                  for rep_uri in experiment['replicates']]
    replicate = next(rep for rep in replicates if rep.get('biological_replicate_number') == biorep_n)
    logging.debug('Replicate uuid %s' % (replicate.get('uuid')))
    organism_uri = replicate.get('library').get('biosample').get('organism')
    organism_obj = common.encoded_get(urlparse.urljoin(server, organism_uri), keypair)
    try:
        organism_name = organism_obj['name']
    except:
        logging.error('%s:rep%d Cannot determine organism.' % (experiment.get('accession'), biorep_n))
        raise
        return None
    else:
        logging.debug("Organism name %s" % (organism_name))
    if sex_specific:
        try:
            sex = replicate.get('library').get('biosample').get('sex')
            assert sex in ['male', 'female']
        except:
            logging.warning('%s:rep%d Sex is %s. Mapping to male reference.' % (experiment.get('accession'), biorep_n, sex))
            sex = 'male'
        logging.debug('Organism %s sex %s' % (organism_name, sex))
    else:
        sex = 'male'
    genome_assembly = args.assembly
    reference = next((ref.get('file') for ref in REFERENCES
                      if ref.get('organism') == organism_name and ref.get('sex') == sex and ref.get('assembly') == genome_assembly), None)
    logging.debug('Found reference %s' % (reference))
    return reference

Example 14: check_page
def check_page(self, page):
    self.marionette.navigate(urlparse.urljoin(self.server_prefix, page))
    try:
        self.marionette.find_element("id", 'complete')
    except NoSuchElementException:
        fullPageUrl = urlparse.urljoin(self.relPath, page)
        details = "%s: 1 failure encountered\n%s" % \
                  (fullPageUrl,
                   self.get_failure_summary(
                       fullPageUrl, "Waiting for Completion",
                       "Could not find the test complete indicator"))
        raise AssertionError(details)
    fail_node = self.marionette.find_element("css selector",
                                             '.failures > em')
    if fail_node.text == "0":
        return
    # This may want to be in a more general place triggerable by an env
    # var some day if it ends up being something we need often:
    #
    # If you have browser-based unit tests which work when loaded manually
    # but not from marionette, uncomment the two lines below to break
    # on failing tests, so that the browsers won't be torn down, and you
    # can use the browser debugging facilities to see what's going on.
    # from ipdb import set_trace
    # set_trace()
    raise AssertionError(self.get_failure_details(page))

Example 15: as_obi_serialization
def as_obi_serialization(self, request=None):
    """Produce an Open Badge Infrastructure serialization of this badge"""
    if request:
        base_url = request.build_absolute_uri('/')
    else:
        base_url = 'http://%s' % (Site.objects.get_current().domain,)

    # see: https://github.com/brianlovesdata/openbadges/wiki/Assertions
    if not self.creator:
        issuer = SITE_ISSUER
    else:
        issuer = {
            # TODO: Get from user profile instead?
            "origin": urljoin(base_url, self.creator.get_absolute_url()),
            "name": self.creator.username,
            "contact": self.creator.email
        }

    data = {
        # The version of the spec/hub this manifest is compatible with. Use
        # "0.5.0" for the beta.
        "version": OBI_VERSION,
        # TODO: truncate more intelligently
        "name": self.title[:128],
        # TODO: truncate more intelligently
        "description": self.description[:128],
        "criteria": urljoin(base_url, self.get_absolute_url()),
        "issuer": issuer
    }
    if self.image:
        data['image'] = urljoin(base_url, self.image.url)
    return data