本文整理汇总了 Python 中 scrapy.exceptions.CloseSpider 异常类的典型用法代码示例。如果您正苦于以下问题:Python exceptions.CloseSpider 的具体用法?Python exceptions.CloseSpider 怎么用?Python exceptions.CloseSpider 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该异常类所在模块 scrapy.exceptions 的用法示例。
在下文中一共展示了 exceptions.CloseSpider 异常类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: handle_spider_error
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def handle_spider_error(self, _failure, request, response, spider):
    """React to a failure produced while handling ``response`` for ``spider``.

    When the failure wraps a ``CloseSpider`` exception the engine is asked to
    close the spider, using the exception's ``reason`` (falling back to
    ``'cancelled'`` when it is empty), and nothing else is done.

    Any other failure is logged as an error, sent out through the
    ``spider_error`` signal and counted in the crawler stats under
    ``spider_exceptions/<exception class name>``.
    """
    error = _failure.value

    # A CloseSpider is a shutdown request, not an error to report.
    if isinstance(error, CloseSpider):
        close_reason = error.reason or 'cancelled'
        self.crawler.engine.close_spider(spider, close_reason)
        return

    log_args = {'request': request, 'referer': referer_str(request)}
    logger.error(
        "Spider error processing %(request)s (referer: %(referer)s)",
        log_args,
        exc_info=failure_to_exc_info(_failure),
        extra={'spider': spider},
    )

    self.signals.send_catch_log(
        signal=signals.spider_error,
        failure=_failure,
        response=response,
        spider=spider,
    )

    stat_key = "spider_exceptions/%s" % error.__class__.__name__
    self.crawler.stats.inc_value(stat_key, spider=spider)
示例2: parse_item
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_item(self, response):
    """Collect newly seen subdomains from a page and follow in-scope links.

    Every ``<a href>`` found in ``response.body`` is examined.  Links that do
    not start with ``http://`` or ``https://`` are prefixed with
    ``self.base_url``.  For links whose host ends with ``'.' + self.domain``
    (and that do not contain ``mailto:``):

    * a host not yet in ``self.subdomains`` is recorded and yielded as an
      item with its ``subdomain`` field set;
    * a ``Request`` for the link is yielded with ``self.parse`` as callback.

    Raises:
        CloseSpider: after the links were processed, when the number of known
            subdomains has reached ``self.subdomain_limit``.
    """
    item = InventusSpiderItem()
    for url in Selector(text=response.body).xpath('//a/@href').extract():
        # Fix: the original test was
        #   not url.startswith('http://') or url.startswith('https://')
        # which, because ``not`` binds tighter than ``or``, also prefixed
        # base_url onto absolute https:// links.  Only links with neither
        # scheme prefix are treated as relative.
        if not url.startswith(('http://', 'https://')):
            url = self.base_url + url

        try:
            parsed_uri = urlparse(url)
        except ValueError:
            # If the URL is invalid we can ignore it.
            continue

        if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
            if parsed_uri.netloc not in self.subdomains:
                self.subdomains.append(parsed_uri.netloc)
                item['subdomain'] = parsed_uri.netloc
                yield item

                # Stop scanning this page once the limit has been exceeded.
                if len(self.subdomains) > int(self.subdomain_limit):
                    break

            yield Request(url, callback=self.parse)

    if len(self.subdomains) >= int(self.subdomain_limit):
        raise CloseSpider('subdomain limit reached')
示例3: process_request
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def process_request(self, request, spider):
    """Attach a randomly chosen proxy to ``request``.

    Requests that already carry a ``proxy`` meta key which was not set by
    this method (no ``_rotating_proxy`` flag) are left untouched.

    Otherwise a proxy is taken from ``self.proxies.get_random()``.  When none
    is available, the spider is closed if ``self.stop_if_no_proxies`` is set;
    if not, the proxy pool is reset and a proxy is requested again.

    On success ``request.meta`` receives the keys ``proxy``,
    ``download_slot`` (from ``self.get_proxy_slot``) and ``_rotating_proxy``.

    Raises:
        CloseSpider: with reason ``"no_proxies"`` when no proxy is available
            and ``stop_if_no_proxies`` is set, or ``"no_proxies_after_reset"``
            when the pool yields ``None`` even after a reset.
    """
    if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
        return

    proxy = self.proxies.get_random()
    if not proxy:
        if self.stop_if_no_proxies:
            raise CloseSpider("no_proxies")

        # ``Logger.warn`` is a deprecated alias (removed in Python 3.13);
        # ``warning`` is the supported spelling.
        logger.warning("No proxies available; marking all proxies "
                       "as unchecked")
        self.proxies.reset()
        proxy = self.proxies.get_random()
        if proxy is None:
            logger.error("No proxies available even after a reset.")
            raise CloseSpider("no_proxies_after_reset")

    request.meta['proxy'] = proxy
    request.meta['download_slot'] = self.get_proxy_slot(proxy)
    request.meta['_rotating_proxy'] = True
示例4: parse_article
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_article(self,response):
    """Fill in the article-level fields of a news item and yield it.

    The item is taken from ``response.meta["item"]`` (a new ``NewsItem`` when
    absent).  ``author``, ``abstract`` and ``content`` are the text of the
    first matching ``span.name``, ``p.excerpt`` and ``div.detail`` elements,
    or ``None`` when the element is missing.  ``news_no`` is the last URL path
    segment with its final five characters removed, and ``crawl_date`` is the
    module-level ``NOW`` value.
    """
    item = response.meta.get("item",NewsItem())

    # NOTE: a date-based stop condition (raising CloseSpider('today scrapy
    # end') once an item was ``self.end_day`` days old) used to sit here and
    # is disabled in the original code.

    soup = BeautifulSoup(response.body)

    def text_or_none(tag_name, css_class):
        # Text of the first matching element, or None when there is none.
        node = soup.find(tag_name, class_=css_class)
        return node.text if node else None

    author = text_or_none("span", "name")
    abstract = text_or_none("p", "excerpt")
    content = text_or_none("div", "detail")

    last_segment = response.url.split("/")[-1]
    news_no = last_segment[:-5]

    item["author"] = author
    item["abstract"] = abstract
    item["content"] = content
    item["crawl_date"] = NOW
    item["news_no"] = news_no
    yield item
示例5: parse_news
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse_news(self,response):
    """Extract the article body of a news page into the item and yield it.

    The item comes from ``response.meta["item"]``.  The text of the
    ``div.entry-content.group`` container, with any nested
    ``div.related_posts`` block removed and surrounding whitespace stripped,
    is stored under ``content``; ``catalogue`` is set to a fixed label.

    A missing item or a missing content container still raises, as in the
    original code; only the optional related-posts block is now tolerated
    when absent.
    """
    item = response.meta.get("item",None)

    # NOTE: a date-based stop condition (raising CloseSpider('today scrapy
    # end') once the item was ``self.end_day`` days old) is disabled in the
    # original code; its comment says the end condition was moved into the
    # content crawl to avoid transaction errors.

    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div",class_="entry-content group")

    # Drop the "related reading" block.  ``find`` returns None when the page
    # has no such block; the original called ``replace_with`` on it
    # unconditionally and crashed with AttributeError in that case.
    related_posts = news_content_group.find("div",class_="related_posts")
    if related_posts is not None:
        related_posts.replace_with("")

    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"最新内容"
    yield item
示例6: parse
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
    """Look for login/registration forms on the page, then follow its links.

    Each form reported by ``formasaurus.extract_forms`` is checked: the first
    ``login`` form and the first ``registration`` form ever seen are handed to
    ``handle_login_form`` / ``handle_registration_form`` and remembered via
    ``found_login`` / ``found_registration``.

    Once both kinds have been found the spider is closed with reason
    ``'done'``.  Otherwise a request is yielded for every extracted link;
    links whose relative URL or text contains one of
    ``self.priority_patterns`` get priority 100, the rest priority 0.
    """
    self.logger.info(response.url)

    page_text = response.text
    if page_text:
        for _, form_meta in formasaurus.extract_forms(page_text):
            kind = form_meta['form']
            if kind == 'login':
                if not self.found_login:
                    self.found_login = True
                    self.handle_login_form(response.url)
            elif kind == 'registration':
                if not self.found_registration:
                    self.found_registration = True
                    self.handle_registration_form(response.url)

    if self.found_registration and self.found_login:
        raise CloseSpider('done')

    for link in self.link_extractor.extract_links(response):
        haystack = ' '.join([relative_url(link.url), link.text]).lower()
        is_interesting = any(
            pattern in haystack for pattern in self.priority_patterns)
        priority = 100 if is_interesting else 0
        yield self.request(link.url, self.parse, priority=priority)
示例7: open_spider
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def open_spider(self, spider):
    """Prepare per-spider state when a spider starts.

    Creates a ``postgresSQL`` object on ``spider.postgres`` and connects it,
    zeroes the four URL counters, makes sure the site is registered via
    ``self.checkSite`` and starts a log entry whose id is stored in
    ``spider.log_id``.

    Raises:
        CloseSpider: when ``connect()`` returns a value equal to ``False`` or
            when ``start_log`` returns a falsy log id.
    """
    # Dedicated database connection for this spider.
    spider.postgres = postgresSQL()

    # Deliberately an equality test against False, exactly as the original
    # wrote it: a ``None`` return does not count as a failed connection.
    connected = spider.postgres.connect()
    if connected == False:
        raise CloseSpider(" Database Connection cannot be established!")

    # Reset the statistics counters.
    for counter in ("urls_dropped", "urls_scraped", "urls_parsed", "urls_stored"):
        setattr(spider, counter, 0)

    # Add the site to the database, or verify it is already there.
    self.checkSite(spider)

    # Open the spider's log entry, tagged with the current process id.
    site_id = spider.custom_settings['site_id']
    spider.log_id = spider.postgres.start_log(site_id, os.getpid())
    if not spider.log_id:
        raise CloseSpider(" Unable to Start Log!")
示例8: checkSite
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def checkSite(self, spider):
    """Make sure the spider's site is present in the database.

    Checks the connection held on ``spider.postgres``; then, if
    ``siteExists`` reports that ``custom_settings['site_id']`` is unknown,
    executes the ``insert_site_str`` statement with the site's id, name, URL
    and the spider's name.

    Raises:
        CloseSpider: when there is no database connection, or when checking
            for / inserting the site raises any exception.
    """
    db = spider.postgres

    if not db.checkConnection():
        logger.error(__name__ + " No Database Connection Found!")
        raise CloseSpider(" No Database Connection Found!")

    try:
        settings = spider.custom_settings
        # Look the site up by its site_id; insert it only when missing.
        if not db.siteExists(settings['site_id']):
            site_row = (
                settings['site_id'],
                settings['site_name'],
                settings['site_url'],
                spider.name,
            )
            db.cursor.execute(db.insert_site_str, site_row)
    except Exception as e:
        logger.error(__name__ + " Unable to add site to Database! Msg: " + str(e))
        raise CloseSpider("Unable to add site to Database")
# Special Methods Below, Read about them before altering
示例9: spider_closed
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def spider_closed(self, spider, reason):
    """Finish the spider's log entry and close its database connection.

    Gathers the four URL counters from the spider, passes them together with
    ``reason`` to ``end_log`` for ``spider.log_id`` (logging an error when
    that call returns a falsy value), closes the database connection and logs
    that the spider was closed.

    Raises:
        CloseSpider: when ``checkConnection()`` reports no connection.
    """
    db = spider.postgres

    if not db.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")

    # Snapshot of all counters kept on the spider.
    counter_names = ("urls_dropped", "urls_scraped", "urls_parsed", "urls_stored")
    url_stats = {name: getattr(spider, name) for name in counter_names}

    # Close the log entry; a falsy result is reported but not fatal.
    log_closed = db.end_log(spider.log_id, url_stats, reason)
    if not log_closed:
        logger.error(__name__ + " Unable to End Log for Spider " + spider.name + " with stats: " + str(url_stats))

    db.connection.close()
    logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED")
示例10: parse
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
    """Yield one ``Girdi`` item per entry on the page, then the next page.

    Entries are the ``<li>`` elements of ``#topic > ul#entry-list``.  For each
    one the entry id, topic id, topic title, timestamp, text and author nick
    are extracted.  After the entries, the current page number and the page
    count are read from ``#topic``'s second ``div`` and, when another page
    exists, a ``Request`` for ``<url>?p=<next page>`` is yielded.

    Raises:
        CloseSpider: with reason ``'no_item_found'`` when the page contains
            no entries.
    """
    self.log("PARSING: %s" % response.request.url, level=log.INFO)

    entries = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
    if len(entries) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')

    timestamp_pattern = r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}'
    for entry in entries:
        entry_id = entry.xpath('./@data-id').extract()[0]
        topic_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
        topic_title = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
        posted_at = entry.xpath(
            './footer/div[@class="info"]/a[@class="entry-date permalink"]/text()'
        ).re(timestamp_pattern)[0]
        body = entry.xpath('string(./div)').extract()[0]
        author = entry.xpath(
            './footer/div[@class="info"]/a[@class="entry-author"]/text()'
        ).extract()[0]

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = topic_title
        item['girdi_id'] = entry_id
        item['baslik_id'] = topic_id
        item['datetime'] = datetime.strptime(posted_at, '%d.%m.%Y %H:%M')
        item['text'] = body
        item['nick'] = author
        yield item

    # The site paginates with JavaScript, so the next-page link cannot be
    # taken from the markup via XPath.  The current page number and the last
    # page number are present in the HTML, though, so use them to work out
    # the address of the next page to crawl.  (Original author's note:
    # hopefully the site does not change this.)
    pager = '//*[@id="topic"]/div[2]'
    current_page = int(response.xpath(pager + '/@data-currentpage').extract()[0])
    page_count = int(response.xpath(pager + '/@data-pagecount').extract()[0])
    base_url = response.request.url.split('?p')[0]

    next_page = current_page + 1
    if next_page <= page_count:
        yield Request('%s?p=%s' % (base_url, next_page))
示例11: __init__
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def __init__(self, **kwargs):
    """Initialise the spider from keyword arguments.

    ``baslik`` is required: a comma-separated string that is split into
    ``self.urls``.  ``self.allowed_domains`` starts out empty.

    Raises:
        CloseSpider: when ``baslik`` is not among the keyword arguments.
    """
    super(GenericSozlukSpider, self).__init__(**kwargs)

    has_baslik = 'baslik' in kwargs
    if not has_baslik:
        raise CloseSpider('Baslik should be given to scrape')

    raw_list = kwargs['baslik']
    self.urls = raw_list.split(',')
    self.allowed_domains = []
示例12: parse
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
    """Yield one ``Girdi`` item per entry on the page, then the next page.

    Entries are the ``<article>`` elements under ``#entry-list > li``.  For
    each one the entry id, topic id, topic title, timestamp, text and author
    nick are extracted.  The current page number is read from the page
    ``<title>`` (defaulting to 1), the page count from the ``rel="last"``
    link, and when another page exists a ``Request`` for
    ``<url>/sayfa/<next page>`` is yielded.

    Raises:
        CloseSpider: with reason ``'no_item_found'`` when the page contains
            no entries.
    """
    self.log("PARSING: %s" % response.request.url, level=log.INFO)

    entries = response.xpath('//*[@id="entry-list"]/li/article')
    if len(entries) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')

    timestamp_pattern = r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}'
    for entry in entries:
        # ``data-info`` is comma-separated; its first field is the entry id.
        entry_info = entry.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0]
        entry_id = entry_info.split(',')[0]
        topic_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
        topic_title = response.xpath('//*[@id="title"]/a/text()').extract()[0]
        posted_at = entry.xpath('./footer/div[2]/time/a/text()').re(timestamp_pattern)[0]
        body = entry.xpath('string(./div)').extract()[0]
        author = entry.css('a.yazarlink').xpath('text()').extract()[0]

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = topic_title
        item['girdi_id'] = entry_id
        item['baslik_id'] = topic_id
        item['datetime'] = datetime.strptime(posted_at, '%d.%m.%Y %H:%M')
        item['text'] = body
        item['nick'] = author
        yield item

    base_url = response.request.url.split('/sayfa')[0]

    # The title only carries "sayfa <n>" when a page number is present.
    page_in_title = response.xpath('//title').re(r'sayfa (\d*)')
    if page_in_title:
        current_page = int(page_in_title[0])
    else:
        current_page = 1

    last_link = response.xpath('//a[@rel="last"]')[0]
    page_count = int(last_link.xpath('text()').extract()[0])

    next_page = current_page + 1
    if next_page <= page_count:
        yield Request('%s/sayfa/%s' % (base_url, next_page))
示例13: parse
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def parse(self, response):
    """Yield one ``Girdi`` item per entry on the page, then the next page.

    Entries are the ``<article>`` elements under ``#entry-list > li``.  For
    each one the entry id, topic id, topic title, timestamp, text and the
    lower-cased author nick are extracted.  The current page comes from the
    active pagination item, the page count from the second-to-last
    pagination link, and when another page exists a ``Request`` for
    ``<topic url>/<next page>`` is yielded.

    Raises:
        CloseSpider: with reason ``'no_item_found'`` when the page contains
            no entries.
    """
    self.log("PARSING: %s" % response.request.url, level=log.INFO)

    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')

    for sel in items_to_scrape:
        girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
        # Raw string: the original plain literal "'(\d*)'" contained the
        # invalid escape sequence ``\d`` (a SyntaxWarning on modern Python).
        # The pattern itself is unchanged.
        baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re(r"'(\d*)'")[0]
        baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
        date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
        nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item

    current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
    page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
    next_page = current_page + 1

    # To build the pagination link for the next step we need the topic
    # address without its page suffix.  URLs look like
    # uludagsozluk.com/k/BASLIK/10, so keep only the part of the path that
    # precedes the page number.
    url_split = urlsplit(response.request.url)
    topic_path = '/'.join(url_split.path.split('/')[:3])
    current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, topic_path)

    if page_count >= next_page:
        yield Request('%s/%s' % (current_baslik_url, next_page))
示例14: capture_exceptions
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def capture_exceptions(callback):
    """Wrap a generator-style Scrapy callback so its errors become output.

    The returned generator function forwards everything ``callback`` yields.
    If ``callback`` raises an ``Exception``, the wrapper yields
    ``{'exception': <the exception>}`` and, when resumed, raises
    ``CloseSpider`` so that the spider stops at the first failure.

    The wrapper shares ``callback``'s ``__annotations__``.
    """
    def parse(*args, **kwargs):
        try:
            yield from callback(*args, **kwargs)
        except Exception as error:
            captured = {'exception': error}
            yield captured
            raise CloseSpider("Exception in callback detected")

    # Expose the wrapped callback's type annotations on the wrapper.
    parse.__annotations__ = callback.__annotations__
    return parse
示例15: __init__
# 需要导入模块: from scrapy import exceptions [as 别名]
# 或者: from scrapy.exceptions import CloseSpider [as 别名]
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
    """Store the pipeline settings and reset its counters.

    Raises:
        CloseSpider: when either ``IMAGE_STORE`` or ``MAXIMUM_IMAGE_NUMBER``
            is ``None``.
    """
    required_settings = (IMAGE_STORE, MAXIMUM_IMAGE_NUMBER)
    if any(value is None for value in required_settings):
        raise CloseSpider('Pipeline load settings failed')

    self.IMAGE_STORE = IMAGE_STORE
    self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER

    # Number of images downloaded so far.
    self.image_max_counter = 0
    # Number used in the directory name; per the original author's note it
    # goes up by 1 for every thousand images (the increment is not done in
    # this method).
    self.dir_counter = 0