This article collects typical usage examples of the urlparse.urlparse function in Python. If you have been asking yourself how the Python urlparse function is actually used in practice, the curated examples below should help.
15 code examples of the urlparse function are shown below, sorted by popularity by default.
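As a quick reference for the examples that follow: in Python 2, urlparse lives in the urlparse module (Python 3 moved it to urllib.parse) and splits a URL into six components, available both as named attributes and as a plain 6-tuple. A minimal sketch:

from urlparse import urlparse

parts = urlparse('http://example.com/path;params?q=1#frag')
print parts.scheme    # 'http'
print parts.netloc    # 'example.com'
print parts.path      # '/path'
print parts.params    # 'params'
print parts.query     # 'q=1'
print parts.fragment  # 'frag'

Several of the examples below unpack the result as a tuple instead of using the attributes; both forms are interchangeable.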
Example 1: sitemap_parse
def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    website_host = urlparse(website_url).scheme
    if website_host != '':
        website_url = urlparse(website_url).scheme + "://" + urlparse(website_url).netloc
        if website_url[-1] != '/':
            website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':
            resp = requests.get(astring)
            soup = Soup(resp.content)
        elif sitemap_option == 'upload_sitemap':
            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
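Example 1 normalizes website_url down to its scheme://netloc/ root before comparing locations. The same normalization in isolation, with a single parse instead of three (the URL is illustrative):

from urlparse import urlparse

parsed = urlparse('http://example.com/sitemap.xml')
if parsed.scheme:
    website_url = parsed.scheme + '://' + parsed.netloc + '/'
print website_url  # 'http://example.com/'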
Example 2: ConfigureHostnames
def ConfigureHostnames(config):
    """This configures the hostnames stored in the config."""
    if flags.FLAGS.external_hostname:
        hostname = flags.FLAGS.external_hostname
    else:
        try:
            hostname = socket.gethostname()
        except (OSError, IOError):
            print "Sorry, we couldn't guess your hostname.\n"
        hostname = RetryQuestion("Please enter your hostname e.g. "
                                 "grr.example.com", "^[\\.A-Za-z0-9-]+$", hostname)

    print """\n\n-=Server URL=-
The Server URL specifies the URL that the clients will connect to
communicate with the server. For best results this should be publicly
accessible. By default this will be port 8080 with the URL ending in /control.
"""
    frontend_url = RetryQuestion("Frontend URL", "^http://.*/$",
                                 "http://%s:8080/" % hostname)
    config.Set("Client.server_urls", [frontend_url])

    frontend_port = urlparse.urlparse(frontend_url).port or config_lib.CONFIG.Get(
        "Frontend.bind_port")
    config.Set("Frontend.bind_port", frontend_port)

    print """\n\n-=AdminUI URL=-:
The UI URL specifies where the Administrative Web Interface can be found.
"""
    ui_url = RetryQuestion("AdminUI URL", "^http[s]*://.*$",
                           "http://%s:8000" % hostname)
    config.Set("AdminUI.url", ui_url)
    ui_port = urlparse.urlparse(ui_url).port or config_lib.CONFIG.Get(
        "AdminUI.port")
    config.Set("AdminUI.port", ui_port)
Example 3: __call__
def __call__(self, url, count_of_crawler):
    """
    Fetch the content from the given URL, collect all the URLs found in
    the content, and pass the first unvisited URL back to the crawler.
    """
    try:
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        links_on_page = map(lambda anchor: anchor.get('href'),
                            soup.find_all('a'))
        cleaned_url = map(lambda link: link if urlparse(link).scheme
                          and urlparse(url).netloc else (urlparse(url)
                          .scheme + "://" + urlparse(url).netloc + link if
                          link[0] == "/" else url + link), links_on_page)
        visited_url.append(url)
        total_collected_url.append(cleaned_url)
        next_url_to_visit = [next_url for next_url in cleaned_url
                             if next_url not in visited_url
                             and "#" not in next_url][0]
        if count_of_crawler and next_url_to_visit:
            count_of_crawler = crawler(next_url_to_visit,
                                       count_of_crawler - 1)
    except:
        print "It seems there is some issue in URL " + url
    return count_of_crawler
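The link cleaning above relies on relative links parsing with an empty scheme. A small illustration (URLs are made up):

from urlparse import urlparse

print urlparse('http://example.com/a').scheme  # 'http' -- already absolute, kept as-is
print urlparse('/a').scheme                    # ''     -- needs the base URL prepended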
Example 4: absolute_url
def absolute_url(url, base_href):
    """
    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'
    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'
    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'
    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'
    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    proto = urlparse(url)[0]
    if proto:
        return url

    base_url_parts = urlparse(base_href)
    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        return base_server + url
    else:
        path = base_url_parts[2]
        if '/' in path:
            path = path.rsplit('/', 1)[0] + '/'
        else:
            path = '/'
        return base_server + path + url
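Much of what absolute_url implements by hand is also available as urlparse.urljoin, which resolves a relative reference against a base URL and covers the rooted and already-absolute cases from the doctests above; the manual version additionally strips surrounding whitespace first, which is what handles the '\n/foo/bar' doctest. A rough equivalent for the plain cases:

from urlparse import urljoin

print urljoin('http://base/whatever/ooo/fdsh', 'foo')                # 'http://base/whatever/ooo/foo'
print urljoin('http://base/whatever/fdskf', '/foo/bar')              # 'http://base/foo/bar'
print urljoin('http://base/whatever/fdskf', 'http://localhost/foo')  # 'http://localhost/foo'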
Example 5: checkRedir
def checkRedir(self, orig_path):
    # old_url = portal_url + item['_orig_path']
    # XXX: refers to target and not portal
    old_url = self.target + orig_path
    # this downloads the file. We need a way to do this without the download
    _, host, targetpath, _, _, _ = urlparse.urlparse(self.target)
    if "@" in host:
        auth, host = host.split("@")
    else:
        auth = None
    conn = httplib.HTTPConnection(host)
    headers = {}
    if auth:
        auth = "Basic " + string.strip(base64.encodestring(auth))
        headers["Authorization"] = auth
    # /view is a hack as zope seems to send all content on a HEAD request
    conn.request("HEAD", targetpath + orig_path, headers=headers)
    res = conn.getresponse()
    redir = res.status == 301
    if redir and res.getheader("location"):
        _, _, oldpath, _, _, _ = urlparse.urlparse(res.getheader("location"))
        parts = oldpath.split("/")
        if parts[-1] == "view":
            parts = parts[:-1]
        return "/".join(parts)
    if res.status == 200:
        return orig_path
    return None
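Example 5 consumes urlparse's result as a plain 6-tuple, discarding the components it does not need. Note that credentials stay inside the netloc component, which is why the code splits on '@' by hand; the named attributes would do that work directly. A minimal sketch (the URL is illustrative):

import urlparse

parsed = urlparse.urlparse('http://admin:secret@example.com/site')
print parsed.netloc    # 'admin:secret@example.com'
print parsed.hostname  # 'example.com'
print parsed.username  # 'admin'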
Example 6: __init__
def __init__(self, layer, mapfile, fonts=None):
    """ Initialize Mapnik provider with layer and mapfile.

        XML mapfile keyword arg comes from TileStache config,
        and is an absolute path by the time it gets here.
    """
    maphref = urljoin(layer.config.dirpath, mapfile)
    scheme, h, path, q, p, f = urlparse(maphref)
    if scheme in ('file', ''):
        self.mapfile = path
    else:
        self.mapfile = maphref

    self.layer = layer
    self.mapnik = None

    engine = mapnik.FontEngine.instance()

    if fonts:
        fontshref = urljoin(layer.config.dirpath, fonts)
        scheme, h, path, q, p, f = urlparse(fontshref)
        if scheme not in ('file', ''):
            raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
        for font in glob(path.rstrip('/') + '/*.ttf'):
            engine.register_font(str(font))
Example 7: getRepStr
def getRepStr(self):
    urlList1 = [i for i in self.url1.replace('http://', '').split('/') if i]
    urlList2 = [i for i in self.url2.replace('http://', '').split('/') if i]
    # print urlList1
    # print urlList2
    n = 0
    while True:
        if urlList1[:n] == urlList2[:n]:
            n += 1
            if n > 10:
                break
            continue
        break
    urlPart = 'http://' + '/'.join(urlList1[:n - 1])
    if urlparse(urlPart).netloc and ('.' not in urlparse(urlPart).path):
        urlPart += '/'
    urlListLen = len(urlList1[n - 1:])
    if urlListLen < 1:
        return (urlPart, './')
    if urlListLen >= 1:
        return (urlPart, urlListLen * '../', self.url1, self.url2)
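The dot check in Example 7 is a heuristic: a parsed path without a '.' is treated as a directory that should end in a slash, while a dot suggests a file name. For instance:

from urlparse import urlparse

print urlparse('http://example.com/docs').path      # '/docs'     -- no dot, treated as a directory
print urlparse('http://example.com/a/b.html').path  # '/a/b.html' -- dot suggests a file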
Example 8: test_that_checks_redirect_using_incorrect_query_values
def test_that_checks_redirect_using_incorrect_query_values(self, base_url):
    param = {
        'product': 'firefox-31.0',
        'lang': 'kitty_language',
        'os': 'stella'
    }
    response = self._head_request(base_url, params=param)
    assert requests.codes.not_found == response.status_code, \
        self.response_info_failure_message(base_url, param, response)
    parsed_url = urlparse(response.url)
    assert 'http' == parsed_url.scheme, \
        'Failed to redirect to the correct scheme. %s' % \
        self.response_info_failure_message(base_url, param, response)
    assert urlparse(base_url).netloc == parsed_url.netloc, \
        self.response_info_failure_message(base_url, param, response)
    assert urlencode(param) == parsed_url.query, \
        self.response_info_failure_message(base_url, param, response)
    assert 'Unknown' != self.get_x_backend_server(response), \
        'Failed, x-backend-server was not in the response object. %s' % \
        self.response_info_failure_message(base_url, param, response)
Example 9: generateUrls
def generateUrls(url):
    baseulp = urlparse(url)
    host = baseulp.netloc
    paths = getCrawlerPaths(url)
    # pprint(paths)
    urls = []
    rulefile = BASEDIR + '/lib/db/compresed_file.rule'
    for eachpath in paths:
        eachulp = urlparse(eachpath)
        if eachulp.path == '':
            host = eachulp.netloc
            domain = GetFirstLevelDomain(host)
            args = {'host': host, 'com': domain}
        else:
            pos = eachulp.path.rfind('/')
            tmp = eachulp.path[pos + 1:]
            args = {'com': tmp}
        rf = RuleFile(rulefile, args)
        rf._getRules()
        for i in rf.ret:
            urls.append(eachpath + '/' + i)
    ret = list(set(urls))
    ret.sort()
    return ret
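The eachulp.path == '' branch distinguishes a bare host from a deeper URL; note that a lone trailing slash already makes the path non-empty:

from urlparse import urlparse

print repr(urlparse('http://example.com').path)      # ''
print repr(urlparse('http://example.com/').path)     # '/'
print repr(urlparse('http://example.com/a/b').path)  # '/a/b'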
Example 10: get_show
def get_show(self, imdb, tvdb, show, show_alt, year):
    try:
        query = self.search_link
        post = urllib.urlencode({'searchquery': show, 'searchin': '2'})

        result = ''
        links = [self.link_1, self.link_3]
        for base_link in links:
            result = client.source(urlparse.urljoin(base_link, query), post=post, headers=self.headers)
            if 'widget search-page' in str(result):
                break

        result = client.parseDOM(result, "div", attrs={"class": "widget search-page"})[0]
        result = client.parseDOM(result, "td")

        shows = [cleantitle.tv(show), cleantitle.tv(show_alt)]
        years = ['(%s)' % str(year), '(%s)' % str(int(year) + 1), '(%s)' % str(int(year) - 1)]
        result = [(client.parseDOM(i, "a", ret="href")[-1], client.parseDOM(i, "a")[-1]) for i in result]
        result = [i for i in result if any(x == cleantitle.tv(i[1]) for x in shows)]
        result = [i[0] for i in result if any(x in i[1] for x in years)][0]

        url = client.replaceHTMLCodes(result)
        try:
            url = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
        except:
            pass
        url = urlparse.urlparse(url).path
        url = url.encode('utf-8')
        return url
    except:
        return
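The nested try in Example 10 unwraps redirect-style links of the form something?u=<real-target>: parse_qs turns the query string into a dict of value lists, and ['u'][0] takes the first value. A small illustration (the URL is made up):

import urlparse

url = 'http://example.com/out.php?u=/watch/123&t=abc'
u = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
print u  # '/watch/123'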
Example 11: sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url is None:
            return sources

        url = urlparse.urljoin(self.base_link, url)

        r = proxy.request(url, 'tv shows')

        links = client.parseDOM(r, 'a', ret='href', attrs={'target': '.+?'})
        links = [x for y, x in enumerate(links) if x not in links[:y]]

        for i in links:
            try:
                url = i
                url = proxy.parse(url)
                url = urlparse.parse_qs(urlparse.urlparse(url).query)['r'][0]
                url = url.decode('base64')
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                if host not in hostDict:
                    raise Exception()
                host = host.encode('utf-8')

                sources.append({'source': host, 'quality': 'SD', 'language': 'en', 'url': url, 'direct': False, 'debridonly': False})
            except:
                pass

        return sources
    except:
        return sources
Example 12: mangle_url
def mangle_url(self, url):
    self.check_connection()

    try:
        endpoint_url = urlparse.urlparse(url)
    except Exception as e:
        script_unknown("you must provide an endpoint_url in the form"
                       + "<scheme>://<url>/ (%s)\n" % e)
    scheme = endpoint_url.scheme
    if scheme is None:
        script_unknown("you must provide an endpoint_url in the form"
                       + "<scheme>://<url>/\n")

    catalog_url = None
    try:
        catalog_url = urlparse.urlparse(
            self.nova_client.client.management_url)
    except Exception as e:
        script_unknown("unknown error parsing the catalog url : %s\n" % e)

    port = endpoint_url.port
    if port is None:
        if catalog_url.port is None:
            port = 8774
        else:
            port = catalog_url.port

    netloc = "%s:%i" % (endpoint_url.hostname, port)
    url = urlparse.urlunparse([scheme,
                               netloc,
                               catalog_url.path,
                               catalog_url.params,
                               catalog_url.query,
                               catalog_url.fragment])
    self.nova_client.client.set_management_url(url)
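urlparse.urlunparse is the exact inverse used here: it reassembles a 6-sequence of components into a URL string, so a parse/unparse round trip of a well-formed URL returns it unchanged. A minimal sketch:

import urlparse

parts = urlparse.urlparse('http://example.com:8774/v2/project?limit=10')
print urlparse.urlunparse(parts)  # 'http://example.com:8774/v2/project?limit=10'

Example 12 exploits this to splice a new netloc into the catalog URL while keeping its path, params, query and fragment.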
Example 13: get_download_url_ssl
def get_download_url_ssl(self):
    """
    SSL-enabled links should be used for the specific versions, except the
    Windows stub installers.
    """
    # SSL-enabled links won't be used for 26.0
    url = firefox_details.get_download_url("OS X", "pt-BR", "26.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query), [("product", "firefox-26.0"), ("os", "osx"), ("lang", "pt-BR")]
    )

    # SSL-enabled links won't be used for 27.0 Windows builds (but SSL
    # download is enabled by default for stub installers)
    url = firefox_details.get_download_url("Windows", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query), [("product", "firefox-27.0"), ("os", "win"), ("lang", "pt-BR")]
    )

    # SSL-enabled links will be used for 27.0 OS X builds
    url = firefox_details.get_download_url("OS X", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "osx"), ("lang", "pt-BR")]
    )

    # SSL-enabled links will be used for 27.0 Linux builds
    url = firefox_details.get_download_url("Linux", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query), [("product", "firefox-27.0-SSL"), ("os", "linux"), ("lang", "pt-BR")]
    )
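parse_qsl, used in the assertions above, returns the query string as a list of (key, value) pairs in document order, which is what makes a direct list comparison possible (parse_qs would return an unordered dict of lists). A small illustration:

from urlparse import urlparse, parse_qsl

print parse_qsl(urlparse('http://example.com/?product=firefox-27.0&os=osx&lang=pt-BR').query)
# [('product', 'firefox-27.0'), ('os', 'osx'), ('lang', 'pt-BR')]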
Example 14: get_all_href_list
def get_all_href_list(root_my_url, soup, file_encode):
    root_parse = urlparse.urlparse(root_my_url.get_abs_url())
    href_list = []
    if not root_parse.hostname:
        return href_list
    # get tags' href
    tag_list = soup.find_all(['a', 'img', 'link'])
    href_filter = r'#|\n|(mailto:)'
    for tag in tag_list:
        add_my_url = DownloadUrl(None, None, root_my_url.get_abs_path())
        if tag.get('href') and not re.search(href_filter, tag.get('href')):
            add_my_url.url = tag.get('href')
        elif tag.get('src'):
            add_my_url.url = tag.get('src')
        if add_my_url.url:
            temp_parse = urlparse.urlparse(add_my_url.url)
            if temp_parse.hostname:
                add_my_url.host = temp_parse.hostname
            else:
                add_my_url.host = root_parse.hostname
            href_list.append(add_my_url)
    return href_list
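Example 14 leans on the .hostname attribute rather than .netloc: hostname comes back lower-cased, stripped of credentials and port, and is None when the URL has no network location at all, which is exactly what the early return and the relative-link fallback check. A small illustration:

import urlparse

print urlparse.urlparse('http://User@Example.COM:8080/x').hostname  # 'example.com'
print urlparse.urlparse('/relative/path').hostname                  # None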
Example 15: __init__
def __init__(self, uri, consumer, extra_headers=None):
    asyncore.dispatcher_with_send.__init__(self)

    # turn the uri into a valid request
    scheme, host, path, params, query, fragment = urlparse.urlparse(uri)

    # use origin host
    self.host = host

    # get proxy settings, if any
    proxy = self.proxies.get(scheme)
    if proxy:
        scheme, host, x, x, x, x = urlparse.urlparse(proxy)

    assert scheme == "http", "only supports HTTP requests (%s)" % scheme

    if not path:
        path = "/"
    if params:
        path = path + ";" + params
    if query:
        path = path + "?" + query
    if proxy:
        path = scheme + "://" + self.host + path

    self.path = path

    # get port number
    try:
        host, port = host.split(":", 1)
        port = int(port)
    except (TypeError, ValueError):
        port = 80  # default port

    self.consumer = consumer

    self.status = None
    self.header = None

    self.bytes_in = 0
    self.bytes_out = 0

    self.content_type = None
    self.content_length = None
    self.content_encoding = None
    self.transfer_encoding = None

    self.data = ""
    self.chunk_size = None
    self.timestamp = time.time()
    self.extra_headers = extra_headers

    self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        self.connect((host, port))
    except socket.error:
        self.consumer.http(0, self, sys.exc_info())
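The manual host:port split at the end of Example 15 can be avoided on modern Pythons: the parse result's .hostname and .port attributes expose the same information, with .port already converted to an int (or None when the URL names no port, in which case code like the above falls back to 80). A minimal sketch:

import urlparse

parts = urlparse.urlparse('http://example.com:8080/index.html')
print parts.hostname  # 'example.com'
print parts.port      # 8080
print urlparse.urlparse('http://example.com/').port  # None, so the caller picks the default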