本文整理汇总了Python中w3lib.url.safe_url_string函数的典型用法代码示例。如果您正苦于以下问题:Python safe_url_string函数的具体用法?Python safe_url_string怎么用?Python safe_url_string使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了safe_url_string函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_safe_url_port_number
def test_safe_url_port_number(self):
    """An explicit port number is kept; an empty port (bare trailing colon) is dropped."""
    cases = (
        (u"http://www.example.com:80/résumé?q=résumé",
         "http://www.example.com:80/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
        (u"http://www.example.com:/résumé?q=résumé",
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
    )
    for raw_url, expected in cases:
        self.assertEqual(safe_url_string(raw_url), expected)
示例2: test_safe_url_string_misc
def test_safe_url_string_misc(self):
    """Mixing raw Unicode and percent-escaped sequences normalizes to one form."""
    cases = (
        (u"http://www.example.com/£?unit=%C2%B5",
         "http://www.example.com/%C2%A3?unit=%C2%B5"),
        (u"http://www.example.com/%C2%A3?unit=µ",
         "http://www.example.com/%C2%A3?unit=%C2%B5"),
    )
    for raw_url, expected in cases:
        safeurl = safe_url_string(raw_url)
        # the result must be a native str, fully percent-escaped
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, expected)
示例3: _set_url
def _set_url(self, url):
    """Normalize *url* into ``self._url`` as a safe URL string.

    Accepts a native ``str`` (escaped directly) or a ``unicode`` string
    (escaped using this object's declared encoding).

    Raises:
        TypeError: if *url* is neither ``str`` nor ``unicode``, or is
            ``unicode`` while ``self.encoding`` is ``None``.
    """
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        # The original computed `url if isinstance(url, unicode) else
        # url.decode(self.encoding)` here, but inside this branch `url`
        # is always unicode, so the decode arm was unreachable dead code.
        self._url = safe_url_string(url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
示例4: test_safe_url_string_bytes_input_nonutf8
def test_safe_url_string_bytes_input_nonutf8(self):
    """Byte input that is not valid UTF-8 gets percent-escaped byte-for-byte."""
    cases = (
        # latin1
        (b"http://www.example.com/\xa3?unit=\xb5",
         "http://www.example.com/%A3?unit=%B5"),
        # cp1251: u'Россия'.encode('cp1251') == '\xd0\xee\xf1\xf1\xe8\xff'
        (b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff",
         "http://www.example.com/country/%D0%EE%F1%F1%E8%FF"),
    )
    for raw_url, expected in cases:
        safeurl = safe_url_string(raw_url)
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, expected)
示例5: _set_url
def _set_url(self, url):
    """Normalize *url* into ``self._url``, AJAX-escaping native strings,
    and require that the result carries a scheme.

    Raises:
        TypeError: if *url* is neither ``str`` nor ``unicode``, or is
            ``unicode`` while ``self.encoding`` is ``None``.
        ValueError: if the resulting URL has no ``scheme:`` part.
    """
    if isinstance(url, str):
        self._url = escape_ajax(safe_url_string(url))
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        # The original computed `url if isinstance(url, unicode) else
        # url.decode(self.encoding)` here, but inside this branch `url`
        # is always unicode, so the decode arm was unreachable dead code.
        self._url = safe_url_string(url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
示例6: test_safe_url_idna_encoding_failure
def test_safe_url_idna_encoding_failure(self):
    """When the host cannot be IDNA-encoded, it is left untouched while
    the rest of the URL is still percent-escaped."""
    # missing DNS label
    self.assertEqual(
        safe_url_string(u"http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    # DNS label too long (7 chars * 11 = 77 > the 63-octet label limit)
    long_label = u"example" * 11
    self.assertEqual(
        safe_url_string(
            u"http://www.{label}.com/résumé?q=résumé".format(label=long_label)),
        "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
            label=long_label))
示例7: get_meta_refresh
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return the http-equiv refresh parameters of the given HTML text.

    Returns a tuple ``(interval, url)`` where ``interval`` is a float with
    the delay in seconds (zero if not present in the tag) and ``url`` is a
    string with the absolute url to redirect to.
    If no meta redirect is found, ``(None, None)`` is returned.
    """
    if six.PY2:
        baseurl = unicode_to_str(baseurl, encoding)
    # The original wrapped this call in try/except only to print() the raw
    # text before re-raising UnicodeDecodeError -- leftover debug output,
    # removed; the exception still propagates unchanged.
    text = str_to_unicode(text, encoding)
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if m is None:
        return None, None
    interval = float(m.group('int'))
    # strip surrounding whitespace and quote characters from the URL value
    url = safe_url_string(m.group('url').strip(' "\''), encoding)
    url = moves.urllib.parse.urljoin(baseurl, url)
    return interval, url
示例8: parse_all
def parse_all(self, response):
    """Follow every ``<a href>`` on a 2xx page: submit each absolute URL to
    the RPC endpoint, and recurse into same-site pages that are not
    static/binary resources.
    """
    self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
    # Use floor division: under Python 3 true division 201 / 100 == 2.01,
    # which would wrongly reject every 2xx status except exactly 200.
    if response.status // 100 != 2:
        return
    base_url = get_base_url(response)
    base_site = get_url_site(base_url)
    # Invariant lookup tables hoisted out of the link loop.
    # Static/binary extensions: never submitted nor crawled.
    skip_exts = {"jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov",
                 "png", "bmp", "exe", "pps", "db", "txt", "pptx", "xls",
                 "ppt", "xlsx"}
    # Document extensions: submitted to RPC but not crawled further.
    no_crawl_exts = {"pdf", "doc", "docx", "rtf"}
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        # last path segment, query string stripped
        filename = abs_url.split("?")[0].split("/")[-1]
        ctype = filename.split(".")[-1].lower() if filename else None
        if ctype in skip_exts:
            continue
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        # only recurse into pages hosted on the same site
        if get_url_site(abs_url) != base_site:
            continue
        if ctype in no_crawl_exts:
            continue
        yield scrapy.Request(url=abs_url, callback=self.parse_all)
示例9: get_base_url
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.
    If no base url is found, the given `baseurl` is returned.
    """
    text = to_unicode(text, encoding)
    match = _baseurl_re.search(text)
    if match is None:
        # no <base> declaration: fall back to the caller-supplied base url
        return safe_url_string(baseurl)
    return moves.urllib.parse.urljoin(
        safe_url_string(baseurl),
        safe_url_string(match.group(1), encoding=encoding),
    )
示例10: std_url
def std_url(url, keep_blank_values=True, keep_fragments=False):
    """Return a canonical form of *url*: query parameters sorted, netloc
    lowercased, path percent-escaped (``/`` if empty), and the fragment
    dropped unless *keep_fragments* is true.

    Args:
        keep_blank_values: keep query parameters whose value is empty.
        keep_fragments: preserve the ``#fragment`` part of the URL.
    """
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    # cgi.parse_qsl has been deprecated since Python 2.6; the same function
    # lives in urlparse (already used by this module).
    keyvals = urlparse.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(path) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
示例11: test_safe_url_idna
def test_safe_url_idna(self):
    """Hosts with non-ASCII labels are IDNA-encoded; paths/queries are
    percent-escaped; already-safe URLs are fixed points."""
    # adapted from:
    # https://ssl.icu-project.org/icu-bin/idnbrowser
    # http://unicode.org/faq/idn.html
    # + various others
    websites = (
        (u'http://www.färgbolaget.nu/färgbolaget', 'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
        (u'http://www.räksmörgås.se/?räksmörgås=yes', 'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'),
        (u'http://www.brændendekærlighed.com/brændende/kærlighed', 'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'),
        (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
        (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
        (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),
        # --- real websites ---
        # in practice, this redirect (301) to http://www.buecher.de/?q=b%C3%BCcher
        (u'http://www.bücher.de/?q=bücher', 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),
        # Japanese
        (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
        # Russian
        (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
        (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),
        # Korean
        (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
        (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),
        # Arabic
        (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),
        # Chinese
        (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
        (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
        (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
    )
    for idn_input, safe_result in websites:
        # first pass: the IDN input encodes to the expected safe URL
        self.assertEqual(safe_url_string(idn_input), safe_result)
        # second pass: a safe URL is unchanged when made safe again
        self.assertEqual(safe_url_string(safe_result), safe_result)
示例12: _set_url
def _set_url(self, url):
    """Set ``self._url`` from *url*, escaping it with this request's
    encoding, and require that the result carries a scheme."""
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
    self._url = escape_ajax(safe_url_string(url, self.encoding))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
示例13: test_safe_url_string_bytes_input
def test_safe_url_string_bytes_input(self):
    """Bytes input is decoded (UTF-8 by default, else the given page
    encoding) and the path ends up as UTF-8 percent-escapes."""
    cases = (
        (b"http://www.example.com/", None, "http://www.example.com/"),
        # bytes input is assumed to be UTF-8
        (b"http://www.example.com/\xc2\xb5", None, "http://www.example.com/%C2%B5"),
        # page-encoding encoded bytes still end up as UTF-8 sequences in path
        (b"http://www.example.com/\xb5", 'latin1', "http://www.example.com/%C2%B5"),
        (b"http://www.example.com/\xa3?unit=\xb5", 'latin1',
         "http://www.example.com/%C2%A3?unit=%B5"),
    )
    for raw_url, enc, expected in cases:
        if enc is None:
            safeurl = safe_url_string(raw_url)
        else:
            safeurl = safe_url_string(raw_url, encoding=enc)
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, expected)
示例14: _set_url
def _set_url(self, url):
    """Coerce *url* to a native string, escape it, and require a scheme."""
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got {0!s}:'.format(type(url).__name__))
    native_url = to_native_str(url, self.encoding)
    self._url = escape_ajax(safe_url_string(native_url))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
示例15: parse_zgyszz
def parse_zgyszz(self,response):
    """Crawl the zgyszz journal site: on issue pages extract the download
    link from the onclick handler; on upload URLs just submit to RPC; on
    index pages follow the article-table and pagination links.
    """
    self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
    # Use floor division: under Python 3 true division 201 / 100 == 2.01,
    # which would wrongly reject every 2xx status except exactly 200.
    if response.status // 100 != 2:
        return
    if "qklist/show-" in response.url:
        base_url = get_base_url(response)
        downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
        # the download URL is the first single-quoted argument of the handler
        relative_url = downLink.split("'")[1]
        abs_url = urljoin_rfc(base_url,relative_url)
        yield scrapy.Request(abs_url,callback=self.parse_zgyszz)
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        return
    if '/upload/qklist/' in response.url:
        yield self.baidu_rpc_request({"url":response.url,"src_id":22})
        return
    base_url = response.url
    # The original duplicated an identical extraction loop for the article
    # table and the pagination bar; iterate over both XPaths instead.
    link_xpaths = ("//div[@class='main_box']//table/tr[1]/td/a/@href",
                   "//div[@class='flickr']/a/@href")
    for xpath in link_xpaths:
        for sel in response.xpath(xpath):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            yield scrapy.Request(abs_url,callback=self.parse_zgyszz)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})