This article collects typical usage examples of scrapy.crawler.Crawler in Python. If you are wondering what crawler.Crawler is for, how to use it, or simply want to see it in real code, the hand-picked samples below may help. You can also explore further usage examples from the containing module, scrapy.crawler.
Twelve code examples of crawler.Crawler are shown below, sorted by popularity by default.
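Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them build on: wrap a spider class and its settings in a Crawler and hand it to a CrawlerProcess. The spider and URL are placeholders, not taken from any example below.

from scrapy import Spider
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings

class DemoSpider(Spider):
    name = "demo"
    start_urls = ["https://example.com/"]  # placeholder URL

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

settings = Settings({"LOG_ENABLED": False})
crawler = Crawler(DemoSpider, settings)  # bind the spider class to its settings
process = CrawlerProcess(settings)
process.crawl(crawler)                   # crawl() accepts a Crawler instance or a spider class
process.start()                          # blocks until the crawl finishes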
Example 1: __init__
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False,
    })
Example 2: build_crawler
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def build_crawler(self, spider):
    """
    Do some spider-specific settings and return the wrapped crawler.
    :param spider: spider class
    :return: crawler
    """
    # TODO: specify settings
    settings = crawler_runner.settings
    # FIXME !!!
    # conf = {}
    # log_file = crawler_runner.settings.get('LOG_FILE')
    # if log_file:
    #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
    #     conf['LOG_FILE'] = None
    # conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
    #                       ' [spider-{spider}]'
    #                       ' %(message)s'
    #                       ).format(spider=spider.name)
    # settings = updated_crawler_settings(settings, conf)
    # configure_logging(settings)
    return Crawler(spider, settings)
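build_crawler reads its settings from a module-level crawler_runner that the excerpt does not show. A plausible sketch of that missing context (the names here are assumptions, not part of the original code):

# Assumed surrounding context for the excerpt above.
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

crawler_runner = CrawlerRunner(get_project_settings())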
Example 3: start_job
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
        ==========================================================
        WARNING: callback_fn is {}
        ==========================================================
        Since start_job is called with callback_fn, make sure you end the reactor if you want the spider process to
        stop after the callback function is executed. By default callback_fn=None will close the reactor.

        To write a custom callback_fn:

        def callback_fn():
            print ("Write your own callback logic")
            from twisted.internet import reactor
            reactor.stop()
        ==========================================================
        """.format(callback_fn))

    spider = Crawler(spider_cls, Settings(spider_settings))
    spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(spider, **spider_kwargs)
    """
    d = runner.crawl(spider, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
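Reading the unpacking at the top of start_job, the job argument is expected to look roughly like the following. This is only an inference from the excerpt; the spider class, settings, and runner object are hypothetical.

# Hypothetical job structure, inferred from how start_job() unpacks it.
job = {
    "runner": my_runner,  # assumed to expose transform_and_index(callback_fn=...)
    "spider_job": {
        "spider_cls": MySpider,                    # spider class to run
        "spider_settings": {"LOG_ENABLED": True},  # plain dict, wrapped in Settings()
        "spider_kwargs": {"start_urls": ["https://example.com/"]},  # forwarded to crawl()
    },
}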
Example 4: get_crawler
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler
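This factory reads like a test helper from the spidermon test suite. Assuming it is registered as a pytest fixture, a test might consume it along these lines (the test name and extra setting are illustrative):

# Hypothetical test built on the factory above.
def test_spidermon_is_enabled(get_crawler):
    crawler = get_crawler({"LOG_LEVEL": "ERROR"})  # extra settings are merged in
    assert crawler.settings.getbool("SPIDERMON_ENABLED")
    assert "spidermon.contrib.scrapy.extensions.Spidermon" in crawler.settings["EXTENSIONS"]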
Example 5: make_data
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data
Example 6: run
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def run(self, args: List[str], opts: optparse.Values) -> None:
    crawlers = []
    real_create_crawler = self.crawler_process.create_crawler

    def create_crawler(crawler_or_spidercls: Union[Crawler, str]) -> Crawler:
        crawler = real_create_crawler(crawler_or_spidercls)
        crawlers.append(crawler)
        return crawler

    self.crawler_process.create_crawler = create_crawler
    super().run(args, opts)
    if any(crawler.stats.get_value("log_count/ERROR") for crawler in crawlers):
        self.exitcode = 1
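This run() override appears to belong to a custom Scrapy command that turns logged errors into a non-zero exit code, which is useful in CI. As a hedged sketch (the module path is an assumption), such a command is typically wired up through the COMMANDS_MODULE setting:

# settings.py -- hypothetical project configuration
COMMANDS_MODULE = "myproject.commands"

# With the command class saved as myproject/commands/crawl.py, running
#   scrapy crawl somespider
# would then exit with status 1 whenever any crawler logged an ERROR.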
Example 7: make_crawler
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, 'name', None):
        class Spider(spider_cls):
            name = 'test_spider'
        Spider.__name__ = spider_cls.__name__
        Spider.__module__ = spider_cls.__module__
        spider_cls = Spider
    return Crawler(spider_cls, settings)
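make_crawler guarantees the wrapped spider class has a name attribute before instantiating the Crawler. A hypothetical call site (the spider and setting below are placeholders):

# Hypothetical usage of make_crawler() above.
from scrapy import Spider
from scrapy.settings import Settings

class AnonymousSpider(Spider):  # deliberately has no `name` of its own
    pass

crawler = make_crawler(AnonymousSpider, Settings({"LOG_ENABLED": False}))
assert crawler.spidercls.name == "test_spider"  # the fallback name was injected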
Example 8: prepare_callback_replay
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)
    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings
Example 9: run_spider_instance
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
    and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()

    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example 10: __init__
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False
Example 11: list_spiders
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list()
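Note that crawler.spiders only existed on pre-1.0 Scrapy; in current releases the same listing goes through the spider loader. A hedged modern equivalent:

# Rough modern (Scrapy >= 1.0) equivalent of list_spiders() above.
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    settings = get_project_settings()
    return SpiderLoader.from_settings(settings).list()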
Example 12: do_scrape
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response
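Example 12 targets old Scrapy: settings.overrides, FEED_URI, crawler.configure(), crawler.spiders and the scrapy.log module were all removed in later releases. A hedged sketch of how the output override and crawl kickoff look on a current Scrapy, with placeholder spider name and path:

# Rough modern sketch; spider name and output path are placeholders.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("FEEDS", {"scrapes/output.jl": {"format": "jsonlines"}})  # replaces FEED_URI
process = CrawlerProcess(settings)
process.crawl("my_spider")  # spiders can be referenced by name when project settings are loaded
process.start()             # runs the reactor and blocks until the crawl finishes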