This article collects typical usage examples of Python's scrapy.exceptions.IgnoreRequest. If you are unsure what exceptions.IgnoreRequest is for, or how to use it, the hand-picked code examples below should help. You can also read more about the module it belongs to, scrapy.exceptions.
The following shows 15 code examples of exceptions.IgnoreRequest, sorted by popularity by default.
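Before the numbered examples, here is a minimal sketch (not taken from any of the examples below) of the most common pattern: a downloader middleware that drops unwanted requests by raising IgnoreRequest from process_request. The middleware name and the blocklist are hypothetical.

from scrapy.exceptions import IgnoreRequest

BLOCKED_HOSTS = ('ads.example.com',)  # hypothetical blocklist


class BlocklistMiddleware:
    def process_request(self, request, spider):
        # Raising IgnoreRequest tells the engine to drop the request; if the
        # request has an errback, it is called with the resulting failure.
        if any(host in request.url for host in BLOCKED_HOSTS):
            raise IgnoreRequest('Blocked host: %s' % request.url)
        return None  # let other middlewares and the downloader proceed

Such a middleware would typically be registered in settings.py under DOWNLOADER_MIDDLEWARES (see the sketch after Example 5).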
Example 1: _redirect
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
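The IgnoreRequest raised above is not logged as an error; it is delivered to the request's errback, if one was set. A hedged sketch of a spider observing that (the spider name, URL and callback names are illustrative):

import scrapy
from scrapy.exceptions import IgnoreRequest


class RedirectDemoSpider(scrapy.Spider):
    name = 'redirect_demo'                  # illustrative name
    start_urls = ['https://example.com/']

    def parse(self, response):
        yield scrapy.Request(
            'https://example.com/looping-redirect',  # hypothetical URL
            callback=self.parse_page,
            errback=self.handle_error,
        )

    def parse_page(self, response):
        self.logger.info('Fetched %s', response.url)

    def handle_error(self, failure):
        # failure.check() returns the matching class when the wrapped
        # exception is an IgnoreRequest (e.g. "max redirections reached").
        if failure.check(IgnoreRequest):
            self.logger.info('Request ignored: %s', failure.value)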
Example 2: fetch
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example 3: _log_download_errors
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})
    if spider_failure is not download_failure:
        return spider_failure
Example 4: process_exception
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return
    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})
    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})
    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example 5: process_request
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
    # don't use this middleware while testing whether the site is up
    if hasattr(spider, "test") and spider.test == "yes":
        # logger = logging.getLogger()
        # logger.info("Testing mode, dead domains disabled")
        return None
    if not Domain.is_onion_url(request.url):
        return None
    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None
    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
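A middleware like Example 5 only runs if it is registered. A hedged settings.py sketch; the module path and the priority value are hypothetical:

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DeadDomainMiddleware': 543,
}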
Example 6: process_request
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0
Example 7: process_response
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response
Example 8: process_request
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Only deduplicate detail pages (list pages are ignored); works together with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example 9: process_request
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the URL we were redirected from
        redirect_urls = request.meta['redirect_urls']
        # Clear the expired cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example 10: mustbe_deferred
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
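A hedged, standalone sketch of what mustbe_deferred does with an IgnoreRequest: the exception is not raised synchronously but delivered to the Deferred's errback on a later reactor iteration. The flaky() helper is made up for illustration, and this toy script assumes a default Twisted reactor can be started via task.react:

from twisted.internet import task
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.defer import mustbe_deferred


def flaky():
    # Simulate a callable that decides to skip its work.
    raise IgnoreRequest('skipping')


def main(reactor):
    d = mustbe_deferred(flaky)  # returns a Deferred, does not raise
    d.addErrback(lambda f: print('ignored:', f.value))
    return d


task.react(main)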
Example 11: process_request_2
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
Example 12: _logerror
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example 13: _robots_error
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example 14: media_failed
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )
    raise FileException
Example 15: process_request
# Required import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return
    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return
    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request
    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse
    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
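Example 15 is the request-side half of Scrapy's HttpCacheMiddleware. A hedged settings.py sketch of the options that drive the IgnoreRequest branch above:

HTTPCACHE_ENABLED = True
HTTPCACHE_IGNORE_MISSING = True  # raise IgnoreRequest for requests not already cached

With HTTPCACHE_IGNORE_MISSING enabled, only previously cached responses are served; requests missing from the cache are ignored instead of downloaded.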