

Python exceptions.IgnoreRequest Code Examples

This article collects typical usage examples of scrapy.exceptions.IgnoreRequest in Python. IgnoreRequest is an exception that Scrapy components, such as downloader middlewares, raise to signal that a request should be dropped. If you are wondering what exceptions.IgnoreRequest does and how it is used in practice, the curated code examples below should help. You can also explore further usage examples from its containing module, scrapy.exceptions.


Below are 15 code examples of exceptions.IgnoreRequest, sorted by popularity by default.
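Before the examples, here is a minimal sketch of the common pattern: a downloader middleware raises IgnoreRequest from process_request to drop a request silently. The BlocklistMiddleware class and its BLOCKED_HOSTS list are hypothetical names used for illustration, not taken from any of the projects below.

from scrapy.exceptions import IgnoreRequest


class BlocklistMiddleware:
    """Hypothetical downloader middleware that drops requests to blocked hosts."""

    BLOCKED_HOSTS = ('ads.example.com',)

    def process_request(self, request, spider):
        # Raising IgnoreRequest makes the engine drop this request silently;
        # the request's errback (if any) is called with the resulting failure.
        if any(host in request.url for host in self.BLOCKED_HOSTS):
            raise IgnoreRequest('blocked host in %s' % request.url)
        return None  # let other middlewares and the downloader proceed

Callers that schedule downloads directly can also catch the exception themselves, as Example 2 below does with a try/except around blockingCallFromThread.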

Example 1: _redirect

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
                [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached") 
Developer: wistbean, Project: learn_python3_spider, Lines: 23, Source: redirect.py
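A middleware with a customized _redirect like the one above would typically be enabled in the project settings in place of the built-in redirect middleware. The sketch below assumes a placeholder module path (myproject.middlewares.CustomRedirectMiddleware), which is not the actual class name in learn_python3_spider.

# settings.py (sketch; the custom middleware path is hypothetical)
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in redirect middleware
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    # register the customized one at the built-in's default priority (600)
    'myproject.middlewares.CustomRedirectMiddleware': 600,
}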

Example 2: fetch

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider) 
Developer: wistbean, Project: learn_python3_spider, Lines: 19, Source: shell.py

Example 3: _log_download_errors

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _log_download_errors(self, spider_failure, download_failure, request, spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated thru here)
        """
        if (isinstance(download_failure, Failure) and
                not download_failure.check(IgnoreRequest)):
            if download_failure.frames:
                logger.error('Error downloading %(request)s',
                             {'request': request},
                             exc_info=failure_to_exc_info(download_failure),
                             extra={'spider': spider})
            else:
                errmsg = download_failure.getErrorMessage()
                if errmsg:
                    logger.error('Error downloading %(request)s: %(errmsg)s',
                                 {'request': request, 'errmsg': errmsg},
                                 extra={'spider': spider})

        if spider_failure is not download_failure:
            return spider_failure 
Developer: wistbean, Project: learn_python3_spider, Lines: 22, Source: scraper.py

Example 4: process_exception

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = stop_time - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider) 
Developer: scrapinghub, Project: scrapy-autoextract, Lines: 24, Source: middlewares.py

Example 5: process_request

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):

        # don't use this middleware while testing whether a site is up
        if hasattr(spider, "test") and spider.test == "yes":
            #logger = logging.getLogger()
            #logger.info("Testing mode, dead domains disabled")
            return None

        if not Domain.is_onion_url(request.url):
            return None

        domain = Domain.find_by_url(request.url)
        if not domain or domain.is_up:
            return None

        raise IgnoreRequest('Domain %s is dead, skipping' % domain.host) 
Developer: dirtyfilthy, Project: freshonions-torscraper, Lines: 18, Source: middlewares.py

Example 6: process_request

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
        if 'x-ignore-request' in request.url:
            raise IgnoreRequest()
        elif 'x-error-request' in request.url:
            _ = 1 / 0  # deliberately raise ZeroDivisionError to exercise error handling
Developer: eren, Project: sozlukcrawler, Lines: 7, Source: middleware.py

Example 7: process_response

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_response(self, request, response, spider):
        if 'x-ignore-response' in request.url:
            raise IgnoreRequest()
        elif 'x-error-response' in request.url:
            _ = 1 / 0  # deliberately raise ZeroDivisionError to exercise error handling
        else:
            return response 
Developer: eren, Project: sozlukcrawler, Lines: 9, Source: middleware.py

Example 8: process_request

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
        if not request.url:
            return None
        channel_id = request.meta.get('channel_id', 0)
        # Handle detail pages only (list pages are ignored); works together with the pipeline
        if is_dup_detail(request.url, spider.name, channel_id):
            raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url)) 
Developer: zhanghe06, Project: news_spider, Lines: 9, Source: de_duplication_request.py

Example 9: process_request

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
        # Handle WeChat anti-crawling (anti-spider mechanism #1, Sogou)
        if spider.name in ['weixin'] and 'antispider' in request.url:
            # Get the redirect source URLs
            redirect_urls = request.meta['redirect_urls']

            # Clear the invalidated cookies
            cookies_id = request.meta['cookiejar']
            del_cookies(spider.name, cookies_id)

            # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
            raise IgnoreRequest(
                'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0])) 
Developer: zhanghe06, Project: news_spider, Lines: 15, Source: anti_spider.py

Example 10: mustbe_deferred

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result) 
Developer: wistbean, Project: learn_python3_spider, Lines: 17, Source: defer.py

Example 11: process_request_2

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        if not rp.can_fetch(to_native_str(self._useragent), request.url):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt") 
Developer: wistbean, Project: learn_python3_spider, Lines: 10, Source: robotstxt.py

Example 12: _logerror

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure 
Developer: wistbean, Project: learn_python3_spider, Lines: 9, Source: robotstxt.py

Example 13: _robots_error

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = 'robotstxt/exception_count/{}'.format(failure.type)
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None) 
Developer: wistbean, Project: learn_python3_spider, Lines: 9, Source: robotstxt.py

Example 14: media_failed

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer, 'exception': failure.value},
                extra={'spider': info.spider}
            )

        raise FileException 
Developer: wistbean, Project: learn_python3_spider, Lines: 14, Source: files.py

Example 15: process_request

# Required module import: from scrapy import exceptions [as alias]
# Or: from scrapy.exceptions import IgnoreRequest [as alias]
def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse 
Developer: wistbean, Project: learn_python3_spider, Lines: 29, Source: httpcache.py
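Finally, a sketch of how IgnoreRequest surfaces on the spider side: when a downloader middleware such as the ones above drops a request, the request's errback receives the failure. The spider name, URLs, and method names here are illustrative only.

import scrapy
from scrapy.exceptions import IgnoreRequest


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com/']

    def parse(self, response):
        yield scrapy.Request('https://example.com/next',
                             callback=self.parse,
                             errback=self.on_error)

    def on_error(self, failure):
        # failure wraps the raised exception; IgnoreRequest means the request
        # was deliberately dropped (robots.txt, cache policy, dedup, ...).
        if failure.check(IgnoreRequest):
            self.logger.info('Request ignored: %s', failure.value)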


Note: The scrapy.exceptions.IgnoreRequest examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Refer to each project's license before redistributing or using the code, and do not reproduce without permission.