本文整理汇总了Python中w3lib.url.canonicalize_url方法的典型用法代码示例。如果您正苦于以下问题:Python url.canonicalize_url方法的具体用法?Python url.canonicalize_url怎么用?Python url.canonicalize_url使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类w3lib.url的用法示例。
在下文中一共展示了url.canonicalize_url方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _parse_links
# 需要导入模块: from w3lib import url [as 别名]
# 或者: from w3lib.url import canonicalize_url [as 别名]
def _parse_links(self, response):
    """Yield follow-up requests for same-host links found in ``response``.

    Anchor ``href`` values are collected through ``LinkLoader`` using the
    ``//a/@href`` XPath and the regular expression passed as ``re_patten``
    (presumably a filter on the extracted hrefs -- confirm against
    ``LinkLoader.add_xpath``).

    Each link is then:

    * skipped when it names a host different from the response's host;
    * resolved to an absolute URL against ``response.url``;
    * normalised with ``canonicalize_url``;
    * given priority 5 when ``self.item_url`` matches it, otherwise 0.

    :param response: the downloaded page whose links are to be followed.
    :returns: a generator of ``Request`` objects with ``self.parse`` as
        callback and ``self.error_back`` as errback.
    """
    loader = LinkLoader(html.html_to_unicode(response))
    loader.add_xpath(xpath='//a/@href', re_patten=r'/subject/[0-9]+/$|/tag/.*')
    links = loader.get()

    base = urlparse.urlparse(response.url)
    for url in links:
        component = urlparse.urlparse(url)
        # Links without a netloc are relative and therefore same-host;
        # links with an explicit, different netloc are off-site.
        if component.netloc and component.netloc != base.netloc:
            continue

        # Resolve against the page URL, not the bare "scheme://host":
        # joining with the site root mis-resolves document-relative hrefs
        # ("page2" on "/a/b" must become "/a/page2"), and a substring test
        # for the origin can be fooled by an href that merely contains it.
        # urljoin leaves already-absolute URLs untouched.
        url = canonicalize_url(urlparse.urljoin(response.url, url))

        # URLs matching the item pattern are scheduled ahead of the rest.
        priority = 5 if self.item_url.search(url) else 0
        yield Request(url=url, callback=self.parse, errback=self.error_back,
                      priority=priority)
示例2: extract_links
# 需要导入模块: from w3lib import url [as 别名]
# 或者: from w3lib.url import canonicalize_url [as 别名]
def extract_links(self, response):
    """Return the canonical absolute URLs of the allowed links in ``response``.

    The response is converted with ``html.html_to_unicode`` and wrapped in
    ``self.selector``; the resulting selector is stored on ``self.sel``.
    Distinct ``//a/@href`` values are filtered through ``self.url_allowed``,
    joined with ``response.url`` and passed through ``canonicalize_url``.
    """
    page_url = response.url
    markup = html.html_to_unicode(response)
    self.sel = self.selector(markup, type='html')

    # De-duplicate the raw hrefs before filtering.
    unique_hrefs = set(self.sel.xpath('//a/@href').extract())
    permitted = [href for href in unique_hrefs if self.url_allowed(href)]

    return [canonicalize_url(urljoin(page_url, href)) for href in permitted]
示例3: _get_fingerprint
# 需要导入模块: from w3lib import url [as 别名]
# 或者: from w3lib.url import canonicalize_url [as 别名]
def _get_fingerprint(self, url):
    """Return ``self.fingerprint_function`` applied to the canonical ``url``."""
    make_fingerprint = self.fingerprint_function
    canonical = canonicalize_url(url)
    return make_fingerprint(canonical)
示例4: _request_fingerprint
# 需要导入模块: from w3lib import url [as 别名]
# 或者: from w3lib.url import canonicalize_url [as 别名]
def _request_fingerprint(self, request):
    """Return a SHA-1 hex digest identifying ``request``.

    The digest covers, in order: the request method, the canonicalized
    request URL, the request body (empty bytes when falsy) and a
    ``login=<value>`` marker built from ``request.meta['logged-in']``.
    """
    hasher = hashlib.sha1()
    chunks = (
        to_bytes(request.method),
        to_bytes(canonicalize_url(request.url)),
        request.body or b'',
        # FIXME - proper field name
        to_bytes('login={}'.format(request.meta.get('logged-in'))),
    )
    for chunk in chunks:
        hasher.update(chunk)
    return hasher.hexdigest()
示例5: canonicalize
# 需要导入模块: from w3lib import url [as 别名]
# 或者: from w3lib.url import canonicalize_url [as 别名]
def canonicalize(url,
                 remove_parameters=('utm_medium', 'utm_source', 'utm_campaign',
                                    'utm_term', 'utm_content')):
    """Canonicalize ``url`` and strip the query parameters in ``remove_parameters``.

    The URL is canonicalized (blank values and fragments dropped), cleaned
    with ``url_query_cleaner`` in removal mode, then canonicalized again.

    Returns the resulting URL, or ``None`` if any step raises; the failure
    is logged as a warning.
    """
    try:
        normalized = canonicalize_url(
            url, keep_blank_values=False, keep_fragments=False)
        stripped = url_query_cleaner(
            normalized, parameterlist=remove_parameters, remove=True)
        result = canonicalize_url(
            stripped, keep_blank_values=False, keep_fragments=False)
    except Exception as exc:
        logger.warning('Fail to canonicalize url %r: %s', url, exc)
        result = None
    return result