This page collects typical usage examples of the Python method hubstorage.HubstorageClient.start_job. If you are unsure what HubstorageClient.start_job does or how to call it, the selected code examples below should help; you can also explore further usage examples of its containing class, hubstorage.HubstorageClient.
Three code examples of HubstorageClient.start_job are shown below, sorted by popularity by default. A minimal usage sketch comes first, followed by the full examples.
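Before the examples, here is a minimal sketch of the basic call pattern, assuming a reachable Hub Storage endpoint and valid credentials. The endpoint URL, API key, project id and spider name are placeholders, not values taken from the examples.

from hubstorage import HubstorageClient

# Placeholders: substitute your own endpoint, API key, project id and spider name.
hc = HubstorageClient(auth='YOUR_APIKEY', endpoint='http://localhost:8003')
project = hc.get_project('1111111')

# Enqueue a pending job for the spider, then claim it the way a runner would.
project.jobq.push('myspider')
job = hc.start_job(projectid='1111111')  # claims the next pending job (now 'running'); None if the queue is empty
if job is not None:
    job.metadata.update(host='localhost', slot=1)  # attach runner metadata, as in Example 3
    job.finished()                                 # mark the job as finished
hc.close()

Examples 1 and 2 call start_job from a Scrapy spider middleware (HcfMiddleware), while Example 3 exercises it from the runner side of a hubstorage system test.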
Example 1: HcfMiddleware
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import start_job [as alias]
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID")
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = settings.getbool("HS_START_JOB_NEW_PANEL", False)

        if not self.hs_start_job_new_panel:
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key):
        value = settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def _start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        if self.hs_start_job_new_panel:
            jobid = self.hsclient.start_job(projectid=self.hs_projectid,
                                            spider=spider.name)
        else:
            jobid = self.oldpanel_project.schedule(spider.name,
                                                   slot=self.hs_consume_from_slot,
                                                   dummy=datetime.now())
        self._msg("New job started: %s" % jobid)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot',
                                            self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # If there are no links in the HCF, use start_requests, but only on the
        # first job (follow-up jobs scheduled by _start_job carry a 'dummy' argument).
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non-GET requests (%s)" % request.url,
                                  log.ERROR)
                    yield request
                else:
                    yield request
            else:
                yield item

#......... (some code omitted here) .........
Example 2: HcfMiddleware
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import start_job [as alias]
class HcfMiddleware(object):

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_baches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # If there are no links in the HCF, use start_requests.
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally; if it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, and it is better not to delete them from the frontier
        # (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
#......... (some code omitted here) .........
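The tail of close_spider is omitted above. Purely as a hypothetical illustration of what its comment describes (this is not the author's actual code), the sketch below mirrors the start_job call pattern already shown in Example 1; the helper name and signature are invented for this sketch.

def maybe_start_followup_job(hsclient, projectid, spidername, reason, start_on_reasons):
    """Hypothetical helper: start a follow-up job for `spidername` when the
    spider closed with one of the configured reasons."""
    if reason in start_on_reasons:
        # Same call pattern as Example 1's _start_job (new-panel branch).
        return hsclient.start_job(projectid=projectid, spider=spidername)
    return None

# For instance, the omitted part of close_spider could delegate to:
#   maybe_start_followup_job(self.hsclient, self.hs_projectid, spider.name,
#                            reason, self.hs_start_job_on_reason)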
Example 3: SystemTest
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import start_job [as alias]
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN messages per log level, plus one for the last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
#......... (some code omitted here) .........