当前位置: 首页>>代码示例>>Python>>正文


Python exceptions.CloseSpider异常类代码示例

本文整理汇总了Python中scrapy.exceptions.CloseSpider异常类的典型用法代码示例。如果您正苦于以下问题:Python exceptions.CloseSpider的具体用法?Python exceptions.CloseSpider怎么用?Python exceptions.CloseSpider使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该异常类所在模块scrapy.exceptions的用法示例。


在下文中一共展示了exceptions.CloseSpider异常类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: handle_spider_error

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def handle_spider_error(self, _failure, request, response, spider):
        """Handle an exception raised while a spider processed a response.

        A ``CloseSpider`` exception is treated as a shutdown request: the
        engine is asked to close the spider, using the exception's reason or
        ``'cancelled'`` when the reason is empty, and nothing else is done.
        Any other exception is logged as an error, broadcast through the
        ``spider_error`` signal and counted in the crawler stats under
        ``spider_exceptions/<ExceptionClassName>``.
        """
        error = _failure.value

        if isinstance(error, CloseSpider):
            close_reason = error.reason or 'cancelled'
            self.crawler.engine.close_spider(spider, close_reason)
            return

        log_context = {'request': request, 'referer': referer_str(request)}
        logger.error(
            "Spider error processing %(request)s (referer: %(referer)s)",
            log_context,
            exc_info=failure_to_exc_info(_failure),
            extra={'spider': spider},
        )

        self.signals.send_catch_log(
            signal=signals.spider_error,
            failure=_failure,
            response=response,
            spider=spider,
        )

        stat_key = "spider_exceptions/%s" % error.__class__.__name__
        self.crawler.stats.inc_value(stat_key, spider=spider)
开发者ID:wistbean,项目名称:learn_python3_spider,代码行数:22,代码来源:scraper.py

示例2: parse_item

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_item(self, response):
        """Collect in-scope links from a page and report new subdomains.

        Every ``href`` in the response is made absolute by prefixing
        ``self.base_url`` unless it already starts with ``http://`` or
        ``https://``. Links whose host ends with ``'.' + self.domain`` are
        followed with ``self.parse`` as callback; each such host that is not
        yet in ``self.subdomains`` is appended to it and yielded as an item.

        Yields:
            ``InventusSpiderItem`` objects (one per newly seen subdomain) and
            ``Request`` objects for every in-scope link.

        Raises:
            CloseSpider: with reason ``'subdomain limit reached'`` once the
                number of known subdomains is at least
                ``self.subdomain_limit``.
        """
        limit = int(self.subdomain_limit)

        for url in Selector(text=response.body).xpath('//a/@href').extract():
            # A URL is absolute when it carries either scheme. The tuple form
            # avoids the ``not a or b`` precedence trap that previously
            # prefixed base_url onto every https:// link.
            if not url.startswith(('http://', 'https://')):
                url = self.base_url + url
            try:
                parsed_uri = urlparse(url)
            except ValueError:
                # If the URL is invalid we can ignore it.
                continue
            if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
                if parsed_uri.netloc not in self.subdomains:
                    self.subdomains.append(parsed_uri.netloc)
                    # Build a fresh item per subdomain so that an item which
                    # was already yielded is never mutated afterwards.
                    item = InventusSpiderItem()
                    item['subdomain'] = parsed_uri.netloc
                    yield item

                    if len(self.subdomains) > limit:
                        break

                yield Request(url, callback=self.parse)

        if len(self.subdomains) >= limit:
            raise CloseSpider('subdomain limit reached')
开发者ID:nmalcolm,项目名称:Inventus,代码行数:25,代码来源:inventus.py

示例3: process_request

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def process_request(self, request, spider):
        """Attach a randomly chosen proxy to an outgoing request.

        Requests that already carry a ``proxy`` in their meta which was not
        set by this middleware (no ``_rotating_proxy`` flag) are left
        untouched. Otherwise a proxy is drawn from ``self.proxies`` and stored
        in ``request.meta`` together with its download slot and the
        ``_rotating_proxy`` marker.

        When no proxy is available the behaviour depends on
        ``self.stop_if_no_proxies``: either the spider is closed right away,
        or the proxy pool is reset and one more draw is attempted.

        Raises:
            CloseSpider: ``"no_proxies"`` when the pool is empty and
                ``stop_if_no_proxies`` is set; ``"no_proxies_after_reset"``
                when the pool is still empty after a reset.
        """
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning("No proxies available; marking all proxies "
                           "as unchecked")
            self.proxies.reset()
            proxy = self.proxies.get_random()
            if proxy is None:
                logger.error("No proxies available even after a reset.")
                raise CloseSpider("no_proxies_after_reset")

        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True
开发者ID:TeamHG-Memex,项目名称:scrapy-rotating-proxies,代码行数:21,代码来源:middlewares.py

示例4: parse_article

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_article(self,response):
        """Fill in the article-level fields of a news item and yield it.

        The item is taken from ``response.meta["item"]``; a new ``NewsItem``
        is used when the meta carries none. Author, abstract and content are
        read from the page (``None`` when the matching element is absent),
        ``news_no`` is the last URL path segment without its final five
        characters, and ``crawl_date`` is set to ``NOW``.
        """
        item = response.meta.get("item",NewsItem())
        # NOTE: a date-based stop condition (raising
        # CloseSpider('today scrapy end') once an article's age reached
        # self.end_day) used to be sketched here; it is disabled.
        soup = BeautifulSoup(response.body)

        def text_or_none(tag_name, css_class):
            # Text of the first matching element, or None if there is none.
            node = soup.find(tag_name,class_=css_class)
            return node.text if node else None

        author = text_or_none("span", "name")
        abstract = text_or_none("p", "excerpt")
        content = text_or_none("div", "detail")
        news_no = response.url.split("/")[-1][:-5]

        for field, value in (
            ("author", author),
            ("abstract", abstract),
            ("content", content),
            ("crawl_date", NOW),
            ("news_no", news_no),
        ):
            item[field] = value
        yield item
开发者ID:yinzishao,项目名称:NewsScrapy,代码行数:23,代码来源:qdaily_spider.py

示例5: parse_news

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_news(self,response):
        """Extract the article body into the item carried by the response.

        The item comes from ``response.meta["item"]``. The text of the
        ``div.entry-content.group`` element, minus its "related posts" block
        when one is present, is stored under ``content``; ``catalogue`` is
        set to a fixed label. The completed item is then yielded.
        """
        item = response.meta.get("item",None)
        # NOTE: a date-based stop condition (raising
        # CloseSpider('today scrapy end') once the news date reached
        # self.end_day) used to be sketched here; it is disabled. The original
        # comment says it was moved into content crawling to avoid
        # transaction errors.
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # Strip the "related reading" block. Not every page necessarily has
        # one, so only remove it when found instead of failing with
        # AttributeError on None.
        related_posts = news_content_group.find("div",class_="related_posts")
        if related_posts is not None:
            related_posts.replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"最新内容"
        yield item
开发者ID:yinzishao,项目名称:NewsScrapy,代码行数:22,代码来源:luxe_spider.py

示例6: parse

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
        """Look for login/registration forms and keep crawling until both are found.

        Each form that ``formasaurus`` classifies as ``login`` or
        ``registration`` is handed to the matching handler the first time that
        type is seen. When both types have been found the spider is closed
        with reason ``'done'``; otherwise every extracted link is requested,
        with priority 100 if its relative URL or link text contains one of
        ``self.priority_patterns`` and priority 0 if not.
        """
        self.logger.info(response.url)

        html = response.text
        if html:
            for _form, info in formasaurus.extract_forms(html):
                kind = info['form']
                if kind == 'login':
                    if not self.found_login:
                        self.found_login = True
                        self.handle_login_form(response.url)
                elif kind == 'registration':
                    if not self.found_registration:
                        self.found_registration = True
                        self.handle_registration_form(response.url)

        if self.found_registration and self.found_login:
            raise CloseSpider('done')

        for link in self.link_extractor.extract_links(response):
            haystack = ' '.join([relative_url(link.url), link.text]).lower()
            is_interesting = any(
                pattern in haystack for pattern in self.priority_patterns)
            priority = 100 if is_interesting else 0
            yield self.request(link.url, self.parse, priority=priority)
开发者ID:TeamHG-Memex,项目名称:autologin,代码行数:22,代码来源:spiders.py

示例7: open_spider

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def open_spider(self, spider):
        """Prepare database access and bookkeeping when a spider starts.

        Creates a ``postgresSQL`` object on ``spider.postgres`` and connects
        it, zeroes the URL counters on the spider, makes sure the site is
        registered via ``self.checkSite`` and opens a log entry whose id is
        stored in ``spider.log_id``.

        Raises:
            CloseSpider: when ``connect()`` returns ``False`` or when no log
                id could be obtained.
        """
        # Each spider gets its own database connection object.
        spider.postgres = postgresSQL()

        # The explicit comparison with False is intentional here: only a
        # result equal to False counts as a failed connection.
        connected = spider.postgres.connect()
        if connected == False:
            raise CloseSpider(" Database Connection cannot be established!")

        # Reset the per-run URL counters.
        for counter in ('urls_dropped', 'urls_scraped', 'urls_parsed', 'urls_stored'):
            setattr(spider, counter, 0)

        # Register the site in the database if it is not there yet.
        self.checkSite(spider)

        # Open the log entry for this run, tagged with the current process id.
        site_id = spider.custom_settings['site_id']
        spider.log_id = spider.postgres.start_log(site_id, os.getpid())
        if not spider.log_id:
            raise CloseSpider(" Unable to Start Log!")
开发者ID:vipulgupta2048,项目名称:scrape,代码行数:25,代码来源:pipelines.py

示例8: checkSite

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def checkSite(self, spider):
        """Make sure the spider's site has a row in the database.

        The site is looked up by ``custom_settings['site_id']``; when it is
        missing, a row built from the site id, name, URL and the spider name
        is inserted using ``spider.postgres.insert_site_str``.

        Raises:
            CloseSpider: when the database connection check fails, or when
                the lookup/insert raises any exception.
        """
        database = spider.postgres

        # Nothing can be verified without a live connection.
        if not database.checkConnection():
            logger.error(__name__ + " No Database Connection Found!")
            raise CloseSpider(" No Database Connection Found!")

        try:
            settings = spider.custom_settings
            # The site is identified by its site_id; stop if already known.
            if database.siteExists(settings['site_id']):
                return
            site_row = (
                settings['site_id'],
                settings['site_name'],
                settings['site_url'],
                spider.name,
            )
            database.cursor.execute(database.insert_site_str, site_row)
        except Exception as error:
            logger.error(__name__ + " Unable to add site to Database! Msg: " + str(error))
            raise CloseSpider("Unable to add site to Database")

    # Special Methods Below, Read about them before altering 
开发者ID:vipulgupta2048,项目名称:scrape,代码行数:25,代码来源:pipelines.py

示例9: spider_closed

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def spider_closed(self, spider, reason):
        """Finish the run's log entry and release the database connection.

        Gathers the spider's URL counters, passes them with the close reason
        to ``end_log`` for ``spider.log_id`` (logging an error when that call
        reports failure), then closes the connection and logs that the spider
        is closed.

        Raises:
            CloseSpider: when the database connection check fails.
        """
        database = spider.postgres

        # The log cannot be finalized without a working connection.
        if not database.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")

        # Snapshot of the counters kept on the spider during the run.
        counter_names = ("urls_dropped", "urls_scraped", "urls_parsed", "urls_stored")
        url_stats = {name: getattr(spider, name) for name in counter_names}

        log_ended = database.end_log(spider.log_id, url_stats, reason)
        if not log_ended:
            logger.error(__name__ + " Unable to End Log for Spider " + spider.name + " with stats: " + str(url_stats))

        # Release the connection before reporting the shutdown.
        database.connection.close()
        logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED")
开发者ID:vipulgupta2048,项目名称:scrape,代码行数:24,代码来源:pipelines.py

示例10: parse

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
        """Scrape every entry on a topic page, then request the next page.

        Yields one ``Girdi`` item per ``li`` under the page's ``entry-list``,
        followed by a ``Request`` for the next page while the page counters
        embedded in the HTML say that more pages exist.

        Raises:
            CloseSpider: with reason ``no_item_found`` when the page contains
                no entries.
        """
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        entries = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
        if not entries:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for entry in entries:
            entry_id = entry.xpath('./@data-id').extract()[0]
            topic_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
            topic_title = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
            posted_at = entry.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            body = entry.xpath('string(./div)').extract()[0]
            author = entry.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = topic_title
            item['girdi_id'] = entry_id
            item['baslik_id'] = topic_id
            item['datetime'] = datetime.strptime(posted_at, '%d.%m.%Y %H:%M')
            item['text'] = body
            item['nick'] = author

            yield item

        # The site paginates with JavaScript, so the next-page link cannot be
        # read with XPath. The current page number and the page count are
        # present in the HTML, though, so derive the next page's address
        # from those. Hopefully the site does not change this.
        current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
        last_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])

        base_url = response.request.url.split('?p')[0]

        following_page = current_page + 1
        if following_page <= last_page:
            yield Request('%s?p=%s' % (base_url, following_page))
开发者ID:eren,项目名称:sozlukcrawler,代码行数:42,代码来源:eksisozluk.py

示例11: __init__

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def __init__(self, **kwargs):
        """Initialise the spider from its keyword arguments.

        ``baslik`` is required: it is a comma-separated string that is split
        into ``self.urls``. ``self.allowed_domains`` starts out empty.

        Raises:
            CloseSpider: when no ``baslik`` argument was supplied.
        """
        super(GenericSozlukSpider, self).__init__(**kwargs)

        if 'baslik' in kwargs:
            self.urls = kwargs['baslik'].split(',')
            self.allowed_domains = []
        else:
            raise CloseSpider('Baslik should be given to scrape')
开发者ID:eren,项目名称:sozlukcrawler,代码行数:10,代码来源:__init__.py

示例12: parse

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
        """Scrape every entry on a topic page, then request the next page.

        Yields one ``Girdi`` item per ``article`` in the page's
        ``entry-list``, followed by a ``Request`` for the next page while the
        number in the ``rel="last"`` link says that more pages exist. The
        current page number is read from the ``<title>`` and defaults to 1.

        Raises:
            CloseSpider: with reason ``no_item_found`` when the page contains
                no entries.
        """
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        entries = response.xpath('//*[@id="entry-list"]/li/article')
        if not entries:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for entry in entries:
            entry_id = entry.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
            topic_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
            topic_title = response.xpath('//*[@id="title"]/a/text()').extract()[0]
            posted_at = entry.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            body = entry.xpath('string(./div)').extract()[0]
            author = entry.css('a.yazarlink').xpath('text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = topic_title
            item['girdi_id'] = entry_id
            item['baslik_id'] = topic_id
            item['datetime'] = datetime.strptime(posted_at, '%d.%m.%Y %H:%M')
            item['text'] = body
            item['nick'] = author

            yield item

        # Address of the topic without any "/sayfa/<n>" page suffix.
        base_url = response.request.url.split('/sayfa')[0]

        # The title only mentions a page number on pages after the first.
        page_in_title = response.xpath('//title').re(r'sayfa (\d*)')
        if page_in_title:
            current_page = int(page_in_title[0])
        else:
            current_page = 1

        last_link = response.xpath('//a[@rel="last"]')[0]
        last_page = int(last_link.xpath('text()').extract()[0])

        following_page = current_page + 1
        if following_page <= last_page:
            yield Request('%s/sayfa/%s' % (base_url, following_page))
开发者ID:eren,项目名称:sozlukcrawler,代码行数:41,代码来源:itusozluk.py

示例13: parse

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
        """Scrape every entry on a topic page, then request the next page.

        Yields one ``Girdi`` item per ``article`` in the page's
        ``entry-list``, followed by a ``Request`` for the next page while the
        pagination widget says that more pages exist.

        Raises:
            CloseSpider: with reason ``no_item_found`` when the page contains
                no entries.
        """
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
            # Raw string: "\d" in a normal literal is an invalid escape
            # sequence and triggers a warning on modern Python versions.
            baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re(r"'(\d*)'")[0]
            baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
            date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
            nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
        page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
        next_page = current_page + 1

        # To build the next page's link we need the topic address without its
        # page suffix. Addresses look like uludagsozluk.com/k/BASLIK/10, so
        # keep only the part of the path that precedes the page number.
        url_split = urlsplit(response.request.url)
        current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))

        if page_count >= next_page:
            yield Request('%s/%s' % (current_baslik_url, next_page))
开发者ID:eren,项目名称:sozlukcrawler,代码行数:42,代码来源:uludagsozluk.py

示例14: capture_exceptions

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def capture_exceptions(callback):
    """Wrap a Scrapy callback so that its exceptions become output.

    The returned generator function forwards everything ``callback``
    produces. If the callback raises, the exception object is yielded in a
    dict under the ``exception`` key and the spider is then closed by
    raising ``CloseSpider``. The wrapper carries the callback's type
    annotations.
    """
    def parse(*args, **kwargs):
        try:
            results = callback(*args, **kwargs)
            yield from results
        except Exception as error:
            yield {'exception': error}
            raise CloseSpider("Exception in callback detected")

    # Expose the same annotations object as the wrapped callback.
    setattr(parse, '__annotations__', callback.__annotations__)
    return parse
开发者ID:scrapinghub,项目名称:scrapy-poet,代码行数:15,代码来源:utils.py

示例15: __init__

# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
        """Store the pipeline settings and reset its counters.

        Raises:
            CloseSpider: when either setting is ``None``.
        """
        settings_missing = any(
            value is None for value in (IMAGE_STORE, MAXIMUM_IMAGE_NUMBER))
        if settings_missing:
            raise CloseSpider('Pipeline load settings failed')

        self.IMAGE_STORE = IMAGE_STORE
        self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER
        # Number of images downloaded so far.
        self.image_max_counter = 0
        # Directory-name counter; per the original note it goes up by one
        # for every thousand images.
        self.dir_counter = 0
开发者ID:SylvanasSun,项目名称:scrapy-picture-spider,代码行数:11,代码来源:pipelines.py


注:本文中的scrapy.exceptions.CloseSpider方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。