本文整理匯總了Python中w3lib.url.canonicalize_url方法的典型用法代碼示例。如果您正苦於以下問題:Python url.canonicalize_url方法的具體用法?Python url.canonicalize_url怎麼用?Python url.canonicalize_url使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊w3lib.url的用法示例。
在下文中一共展示了url.canonicalize_url方法的5個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: _parse_links
# 需要導入模塊: from w3lib import url [as 別名]
# 或者: from w3lib.url import canonicalize_url [as 別名]
def _parse_links(self, response):
    """Yield a follow-up ``Request`` for each same-host link in *response*.

    Anchor ``href`` values are gathered through ``LinkLoader``; links that
    name a host other than the response's host are skipped. Every remaining
    link is made absolute, canonicalized, and requested with priority 5 when
    ``self.item_url`` matches it (0 otherwise).

    :param response: the downloaded page whose links should be followed.
    :returns: a generator of ``Request`` objects handled by ``self.parse``,
        with ``self.error_back`` as the errback.
    """
    loader = LinkLoader(html.html_to_unicode(response))
    # NOTE(review): ``re_patten`` presumably restricts which hrefs the loader
    # keeps -- confirm against LinkLoader.add_xpath.
    loader.add_xpath(xpath='//a/@href',
                     re_patten=r'/subject/[0-9]+/$|/tag/.*')
    links = loader.get()

    base = urlparse.urlparse(response.url)
    for url in links:
        component = urlparse.urlparse(url)
        # A link that carries its own host must point at the same host as
        # the response; links without a host are relative and always kept.
        if component.netloc and component.netloc != base.netloc:
            continue
        # Resolve against the full page URL rather than the bare
        # scheme://host root, so path-relative links ("page2.html") end up
        # next to the current page instead of at the site root. Absolute
        # URLs pass through urljoin unchanged.
        url = urlparse.urljoin(response.url, url)
        url = canonicalize_url(url)
        # Links matching the item pattern are scheduled ahead of the rest.
        priority = 5 if self.item_url.search(url) else 0
        yield Request(url=url, callback=self.parse, errback=self.error_back,
                      priority=priority)
示例2: extract_links
# 需要導入模塊: from w3lib import url [as 別名]
# 或者: from w3lib.url import canonicalize_url [as 別名]
def extract_links(self, response):
    """Return the canonical absolute URLs of the allowed links in *response*.

    The page is parsed with ``self.selector`` (the selector is kept on
    ``self.sel``), duplicate ``href`` values are collapsed, and only those
    accepted by ``self.url_allowed`` are joined with the response URL and
    canonicalized.
    """
    page_url = response.url
    markup = html.html_to_unicode(response)
    self.sel = self.selector(markup, type='html')
    # A set removes duplicate hrefs; the resulting order is therefore
    # unspecified.
    unique_hrefs = set(self.sel.xpath('//a/@href').extract())
    allowed_hrefs = [href for href in unique_hrefs if self.url_allowed(href)]
    return [canonicalize_url(urljoin(page_url, href))
            for href in allowed_hrefs]
示例3: _get_fingerprint
# 需要導入模塊: from w3lib import url [as 別名]
# 或者: from w3lib.url import canonicalize_url [as 別名]
def _get_fingerprint(self, url):
    """Return ``self.fingerprint_function`` applied to the canonical *url*."""
    fingerprint = self.fingerprint_function
    canonical = canonicalize_url(url)
    return fingerprint(canonical)
示例4: _request_fingerprint
# 需要導入模塊: from w3lib import url [as 別名]
# 或者: from w3lib.url import canonicalize_url [as 別名]
def _request_fingerprint(self, request):
    """Return a SHA-1 hex digest identifying *request*.

    The digest covers, in order: the request method, the canonicalized
    request URL, the request body (empty bytes when falsy), and a
    ``login=<value>`` marker built from ``request.meta['logged-in']``.
    """
    chunks = (
        to_bytes(request.method),
        to_bytes(canonicalize_url(request.url)),
        request.body or b'',
        # FIXME - proper field name
        to_bytes('login={}'.format(request.meta.get('logged-in'))),
    )
    digest = hashlib.sha1()
    for chunk in chunks:
        digest.update(chunk)
    return digest.hexdigest()
示例5: canonicalize
# 需要導入模塊: from w3lib import url [as 別名]
# 或者: from w3lib.url import canonicalize_url [as 別名]
def canonicalize(url,
                 remove_parameters=('utm_medium', 'utm_source', 'utm_campaign',
                                    'utm_term', 'utm_content')):
    """Return the canonical form of *url* without the given query parameters.

    The URL is canonicalized (blank values and fragments dropped), the
    parameters named in *remove_parameters* are removed from its query, and
    the result is canonicalized once more.

    Returns ``None`` if any step raises; the failure is logged as a warning.
    """
    canon_options = {'keep_blank_values': False, 'keep_fragments': False}
    try:
        normalized = canonicalize_url(url, **canon_options)
        cleaned = url_query_cleaner(normalized,
                                    parameterlist=remove_parameters,
                                    remove=True)
        return canonicalize_url(cleaned, **canon_options)
    except Exception as exc:
        # Best-effort: callers receive None instead of an exception.
        logger.warning('Fail to canonicalize url %r: %s', url, exc)
        return None