This article collects and summarizes typical usage examples of the urllib.parse.urlsplit function in Python. If you have been wondering what urlsplit does, how to call it, or what it looks like in real code, the curated examples below should help.
The following sections present 15 code examples of urlsplit, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
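Before diving in, here is a minimal standard-library refresher: urlsplit breaks a URL into a five-part SplitResult whose scheme, netloc, path, query, and fragment components are available both by index and by name.

from urllib.parse import urlsplit

parts = urlsplit("https://user@example.com:8080/path/page?q=1#top")
print(parts.scheme)    # https
print(parts.netloc)    # user@example.com:8080
print(parts.path)      # /path/page
print(parts.query)     # q=1
print(parts.fragment)  # top
print(parts.hostname, parts.port)  # example.com 8080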
Example 1: _main
def _main():
    base_url = sys.argv[1]
    soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252")
    index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")]
    for index_url in index_urls:
        try:
            resp = urlopen(index_url)
        except HTTPError as err:
            print(err, err.url, file=sys.stderr)
            print("Skipping..", file=sys.stderr)
            continue
        index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1")
        index_path = urlsplit(index_url).path
        index_filepath = os.path.normpath("." + index_path)
        try:
            os.makedirs(os.path.dirname(index_filepath))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        for issue_url in iter_issue_urls(index_soup):
            issue_url = urljoin(index_url, issue_url)
            try:
                resp = urlopen(issue_url)
            except HTTPError as err:
                print(err, err.url, file=sys.stderr)
                print("Skipping..", file=sys.stderr)
                continue
            issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252")
            issue_path = urlsplit(issue_url).path
            issue_filepath = os.path.normpath("." + issue_path)
            with open(issue_filepath, "w") as f:
                print(klupu.clean_soup(issue_soup), file=f)
        with open(index_filepath, "w") as f:
            print(klupu.clean_soup(index_soup), file=f)
Example 2: oauth
def oauth(self, req, credentials=None, params={}):
    # NOTE: While flickr supports HTTPS in its oauth endpoints, flickr
    # thinks that the HTTPS endpoints are being accessed via HTTP, and
    # thus constructs the signature base string accordingly, which will
    # hence not match the signature base string generated by
    # pyoauth1client. We solve this by replacing HTTPS with HTTP when
    # generating the signature base string, and then revert the change
    # after the base string is generated. This way the signature base
    # string will match the one generated by flickr even though we are
    # accessing the endpoints via HTTPS for ADDED SECURITY!!!111one
    x = urlsplit(req.url)
    if x.scheme == "https":
        # Remove the HTTPS scheme
        https = True
        x = x._replace(scheme="http")
        req = req._replace(url=urlunsplit(x))
    else:
        https = False
    y = super().oauth(req, credentials, params)
    if https:
        # Add back the HTTPS scheme
        x = urlsplit(y.url)
        x = x._replace(scheme="https")
        y = y._replace(url=urlunsplit(x))
    return y
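The scheme swap above leans on the fact that urlsplit returns a SplitResult, a namedtuple subclass, so _replace produces a modified copy that urlunsplit can reassemble. A minimal round-trip:

from urllib.parse import urlsplit, urlunsplit

parts = urlsplit("https://example.com/path?q=1")
print(urlunsplit(parts._replace(scheme="http")))  # http://example.com/path?q=1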
Example 3: main
def main(GET):
    global mail, error, error_list
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
    print("\n\nresult--------------------------------\nerror:%d" % error)
    count = 1
    for url in error_list:
        print(url)
    print("\n")
    for url in mail:
        print("[%d]url:%s" % (count, url))
        data = mail[url][0]
        if data:
            tmp = []
            for val in data:
                if val not in tmp:
                    print(val)
                    tmp.append(val)
        else:
            print("None")
        print("")
        count += 1
Example 4: __form_data
def __form_data(text, formid, params, soup=None, form_url=None):
    if type(params) is not dict:
        raise TypeError('Params must be a dict')
    if soup is None:
        soup = BeautifulSoup(text, 'html.parser')
    form = soup.find('form', attrs={'id': formid})
    action = form.attrs.get('action')
    if not urlsplit(action).netloc:
        if form_url is None or not urlsplit(form_url).netloc:
            raise ValueError('kwarg form_url must be specified if form '
                             'action lacks a host')
        action = urljoin(form_url, action)
    inputs = form.find_all('input') + form.find_all('textarea')
    for i in inputs:
        try:
            name = i.attrs['name']
            type_ = i.attrs['type']
            value = params.get(name)
            if type_ == 'submit':
                continue
            elif type_ == 'hidden':
                value = i.attrs['value'] if value is None else value
            elif value is None:
                raise ValueError('kwarg params dictionary is missing a '
                                 'value for a non-hidden field')
        except KeyError:
            pass
        else:
            params[name] = value
    return Session.FormInfo(params=params, post_url=action)
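Example 4's treatment of the form action is a common pattern: if urlsplit reports an empty netloc, the action is relative and urljoin resolves it against the page URL. A small sketch (the URLs are hypothetical):

from urllib.parse import urljoin, urlsplit

action = "/login/submit"
if not urlsplit(action).netloc:  # no host, so the action is relative
    action = urljoin("https://site.example/login", action)
print(action)  # https://site.example/login/submit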
Example 5: clean_url
def clean_url(value):
    """
    Taken from Django's URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid URL is passed.
    Example:
    >>> clean_url("www.google.com")
    'http://www.google.com/'
    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit(value))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit(urlunsplit(url_fields)))
        if not url_fields[2]:
            # The path portion may need to be added before query params.
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value
Example 6: assertRedirects
def assertRedirects(self, response, expected_url, status_code=302,
                    target_status_code=200, host=None):
    """Asserts that a response redirected to a specific URL, and that the
    redirect URL can be loaded.
    Note that assertRedirects won't work for external links since it uses
    TestClient to do a request.
    """
    self.assertEqual(response.status_code, status_code,
                     ("Response didn't redirect as expected: Response code was %d"
                      " (expected %d)" % (response.status_code, status_code)))
    url = response['Location']
    scheme, netloc, path, query, fragment = urlsplit(url)
    e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
    if not (e_scheme or e_netloc):
        expected_url = urlunsplit(('http', host or 'testserver', e_path,
                                   e_query, e_fragment))
    self.assertEqual(url, expected_url,
                     "Response redirected to '%s', expected '%s'" % (url, expected_url))
    # Get the redirection page, using the same client that was used
    # to obtain the original response.
    redirect_response = response.client.get(path, QueryDict(query))
    self.assertEqual(redirect_response.status_code, target_status_code,
                     ("Couldn't retrieve redirection page '%s': response code was %d"
                      " (expected %d)") %
                     (path, redirect_response.status_code, target_status_code))
Example 7: parse_url
def parse_url(link):
    """Say website title information in channel"""
    baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
    path = urlsplit(link).path
    query = '?{uri.query}'.format(uri=urlsplit(link))
    try:
        headers = {'Accept-Encoding': 'utf-8',
                   'User-Agent': 'Mozilla/5.0'}
        response = get(baseurl + path + query, headers=headers)
    except:
        return
    if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]:
        try:
            URL = BeautifulSoup(response.text, "html.parser")
        except:
            return
        if not URL.title:
            return
        if URL.title.string is None:
            return
        if len(URL.title.string) > 250:
            title = URL.title.string[0:250] + '…'
        else:
            title = URL.title.string
        return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")"
    else:
        return
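A readability note on the example above: it splits the same link three times. Calling urlsplit once and reusing the SplitResult is equivalent, and its geturl() method round-trips back to the original URL:

from urllib.parse import urlsplit

parts = urlsplit("https://example.com/page?x=1")
base = "{0.scheme}://{0.netloc}".format(parts)
print(base)            # https://example.com
print(parts.geturl())  # https://example.com/page?x=1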
Example 8: find_pingback_urls
def find_pingback_urls(self, urls):
    """Find the pingback URL for each given URL"""
    pingback_urls = {}
    for url in urls:
        try:
            page = urlopen(url)
            headers = page.info()
            if 'text/' not in headers.get('Content-Type', '').lower():
                continue
            server_url = headers.get('X-Pingback')
            if not server_url:
                server_url = self.find_pingback_href(page.read())
            if server_url:
                server_url_splitted = urlsplit(server_url)
                if not server_url_splitted.netloc:
                    url_splitted = urlsplit(url)
                    server_url = '%s://%s%s' % (url_splitted.scheme,
                                                url_splitted.netloc,
                                                server_url)
                pingback_urls[url] = server_url
        except IOError:
            pass
    return pingback_urls
Example 9: run
def run(self):
    while True:
        # Grab a URL from the queue.
        level, u = self.input_q.get()
        main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u))
        # Fetch URLs.
        if level < MAX_URL_LEVEL:
            html = _get_content(u)
            if not isinstance(html, list):
                soup = bs(html)
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if not href or len(href) < 2:
                        continue
                    # Check if URL is relative
                    elif not urlsplit(href)[0] and not urlsplit(href)[1]:
                        self.output_q.put((level + 1, _url_discard(urljoin(u, href))))
                    elif href.startswith(main):
                        self.output_q.put((level + 1, _url_discard(href)))
                    else:
                        # Place for possible error logs (:
                        pass
        # Signal to the queue that the job is done.
        self.input_q.task_done()
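Example 9 tests positions 0 and 1 of the split result to decide whether a link is relative. A hypothetical helper using the named attributes expresses the same check more readably:

from urllib.parse import urlsplit

def is_relative(url):
    # A URL with neither scheme nor host is relative.
    parts = urlsplit(url)
    return not parts.scheme and not parts.netloc

print(is_relative("/about.html"))           # True
print(is_relative("https://example.com/"))  # False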
Example 10: test_flow
def test_flow(self):
    url = self.sp.make_auth_req()
    status, headers, _ = self.getPage(url)
    assert status == '303 See Other'

    url = self.get_redirect_location(headers)
    req = parse_qs(urlsplit(url).query)
    assert 'SAMLRequest' in req
    assert 'RelayState' in req

    action, body = self.idp.handle_auth_req(req['SAMLRequest'][0],
                                            req['RelayState'][0],
                                            BINDING_HTTP_REDIRECT,
                                            'test1')
    status, headers, body = self.getPage(action, method='POST',
                                         body=urlencode(body))
    assert status == '302 Found'

    url = self.get_redirect_location(headers)
    req = parse_qs(urlsplit(url).query)
    assert 'SAMLResponse' in req
    assert 'RelayState' in req

    resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0],
                                                BINDING_HTTP_REDIRECT)
    identity = resp.ava
    assert identity["displayName"][0] == "Test1"
    assert identity["sn"][0] == "[email protected]"
    assert identity['o'][0] == "Small university"
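The query handling in this test is a common pairing: urlsplit isolates the query string and parse_qs turns it into a dict mapping each parameter to a list of values. A standalone illustration with a made-up URL:

from urllib.parse import parse_qs, urlsplit

url = "http://sp.example/acs?SAMLResponse=abc&RelayState=xyz"
req = parse_qs(urlsplit(url).query)
print(req["RelayState"])  # ['xyz']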
Example 11: _url
def _url(self, hashed_name_func, name, force=False, hashed_files=None):
    """
    Return the non-hashed URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            args = (clean_name,)
            if hashed_files is not None:
                args += (hashed_files,)
            hashed_name = hashed_name_func(*args)
    final_url = super().url(hashed_name)
    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)
    return unquote(final_url)
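The separate '?#' check exists because urlsplit treats everything after '#' as the fragment, leaving the query empty, so the '?' would silently vanish when the URL is rebuilt:

from urllib.parse import urlsplit

print(urlsplit("myfont.eot?#iefix"))
# SplitResult(scheme='', netloc='', path='myfont.eot', query='', fragment='iefix')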
Example 12: https_open
def https_open(self, request):
    """
    Send an HTTPS request, which can be either GET or POST,
    depending on request.has_data().
    Args:
        request: an instance of urllib.request.Request
    """
    full_url = request.get_full_url()
    url_parts = parse.urlsplit(full_url)
    robo = None
    if url_parts.netloc in self.robots:
        robo = self.robots[url_parts.netloc]
    else:
        # Build the robots.txt URL for this host and fetch it.
        host = parse.urlsplit(full_url)[1]
        rurl = parse.urlunparse(("http", host, "/robots.txt", "", ""))
        robo = reppy.cache.RobotsCache()
        robo.fetch(rurl, self.agent_name)
        self.robots[url_parts.netloc] = robo
    # Is the URL allowed for this crawler by robots.txt?
    if robo.allowed(full_url, self.agent_name):
        # Delegate to the parent HTTPS handler. The original line read
        # `request.HTTPHandler.https_open(self, request)`, but the
        # `request` parameter shadows the urllib.request module (and
        # HTTPHandler has no https_open); super() avoids the collision,
        # assuming this class subclasses urllib.request.HTTPSHandler.
        return super().https_open(request)
    else:
        raise RuntimeError('Forbidden by robots.txt')
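Example 12 relies on the third-party reppy library. For comparison, the standard library offers a similar check via urllib.robotparser; this sketch (hypothetical host and agent name) fetches robots.txt and asks whether a URL may be crawled:

from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

url = "https://example.com/some/page"
rp = RobotFileParser(urlunsplit(("https", urlsplit(url).netloc, "/robots.txt", "", "")))
rp.read()  # downloads and parses robots.txt
print(rp.can_fetch("mybot", url))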
Example 13: get_fetcher
def get_fetcher(url=None, *, item=dict()):
    RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'}
    url = item.get("url", url)
    if urlsplit(url).scheme in RTMP_PROTOCOLS:
        return RtmpFetcher(url, live=True)
    auth = comm.get_auth()
    protocol = urlsplit(auth['server']).scheme
    if protocol in RTMP_PROTOCOLS:
        (url, ext) = url.rsplit('.', 1)  # strip the extension (.flv or .mp4)
        url = auth['playpath_prefix'] + url
        if ext == 'mp4':
            url = 'mp4:' + url
        rtmp_url = auth['rtmp_url']
        token = auth.get('token')
        if token:
            # Cannot use urljoin() because
            # the RTMP scheme would have to be added to its whitelist
            rtmp_url += '?auth=' + token
        return RtmpFetcher(rtmp_url, playpath=url)
    else:
        return HdsFetcher(url, auth)
Example 14: zoom_article
def zoom_article(self, ticket_id, article_id):
    art_descr = self.__db.article_description(article_id)
    if art_descr[4] & ART_TEXT:
        return eval(self.__db.article_message(article_id))
    self.echo("Zoom article:", ticket_id, article_id)
    url_beg = urlsplit(self.runtime.get("site"))[:3]
    params = (
        ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"),
        ("TicketID", ticket_id), ("ArticleID", article_id),
        ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"]))
    url = urlunsplit(url_beg + (urlencode(params), ""))
    pg = TicketsPage(self.core)
    page = pg.load(url)
    if page is None:
        return
    mail_header = page.get("mail_header", [])
    if "mail_src" in page:
        url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:])
        self.echo("Get message:", url)
        pg = MessagePage(self.core)
        try:
            mail_text = pg.load(url)
        except LoginError:
            mail_text = pg.login()
    else:
        mail_text = page["message_text"]
    if mail_header:
        mail_text.insert(0, ("\n",))
        for i in reversed(mail_header):
            mail_text.insert(0, ("%s\t%s\n" % i,))
    shrink_tupled_text(mail_text)
    self.__db.article_message(article_id, repr(mail_text))
    return mail_text
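Example 14 exploits the fact that slicing a SplitResult yields a plain tuple, so pieces of one URL can be recombined with fresh query data through urlunsplit. A sketch with a made-up ticket URL:

from urllib.parse import urlencode, urlsplit, urlunsplit

url_beg = urlsplit("https://helpdesk.example/otrs/index.pl")[:3]  # scheme, netloc, path
query = urlencode({"Action": "AgentTicketZoom", "TicketID": 42})
print(urlunsplit(url_beg + (query, "")))
# https://helpdesk.example/otrs/index.pl?Action=AgentTicketZoom&TicketID=42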
Example 15: main
def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
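The netloc comparison used by this last example (and by Example 3) is the standard way to keep a crawler on its starting site. In isolation:

from urllib.parse import urlsplit

start_netloc = urlsplit("https://example.com/index.html").netloc
same_site = lambda url: urlsplit(url).netloc == start_netloc
print(same_site("https://example.com/about"))  # True
print(same_site("https://other.example/"))     # False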