This page collects typical usage examples of the Python method hubstorage.HubstorageClient.get_project. If you are wondering what HubstorageClient.get_project does, how to use it, or what calling code looks like in practice, the curated examples below should help. See also the usage examples for the containing class, hubstorage.HubstorageClient.
The following 15 code examples of HubstorageClient.get_project are ordered by popularity.
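Before the individual examples, here is a minimal sketch of the pattern they all share. It is for orientation only: the API key, project id, and spider name are placeholders, and the comment lists the project attributes exercised by the examples below.

from hubstorage import HubstorageClient

# Placeholders -- substitute a real Scrapinghub API key, project id and spider name.
hc = HubstorageClient(auth='YOUR_API_KEY')
project = hc.get_project('12345')

# The returned project object is the entry point to the storage APIs used in
# the examples below: project.jobq (job queue), project.spiders (spider
# metadata), project.collections (key-value stores) and project.frontier (HCF).
job = project.push_job('myspider')
print(job.key)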
Example 1: test_auth
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def test_auth(self):
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check that no-auth access raises a 401 for every entry point
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
Example 2: fetch_and_save_items
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def fetch_and_save_items():
    hc = HubstorageClient(auth=API_KEY)
    project = hc.get_project(SH_PROJECT)
    for spider in SPIDERS:
        print("\nworking on spider {}".format(spider['spider_name']))
        spider_id = project.ids.spider(spider['spider_name'])
        summary = project.spiders.lastjobsummary(spiderid=spider_id)
        for element in summary:
            print(element['key'])
            job = hc.get_job(element['key'])
            items = job.items.iter_values()
            save_items(items, spider['institution_name'])
Example 3: test_collection_store_and_delete_are_retried
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def test_collection_store_and_delete_are_retried(self):
    # Prepare
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
    callback_post, attempts_count_post = self.make_request_callback(2, [])
    callback_delete, attempts_count_delete = self.make_request_callback(2, [])
    self.mock_api(method=POST, callback=callback_delete, url_match='/.*/deleted')
    # /!\ the default regexp matches all paths, so this mock has to be added last
    self.mock_api(method=POST, callback=callback_post)

    # Act
    project = client.get_project(self.projectid)
    store = project.collections.new_store('foo')
    store.set({'_key': 'bar', 'content': 'value'})
    store.delete('baz')

    # Assert
    self.assertEqual(attempts_count_post[0], 3)
    self.assertEqual(attempts_count_delete[0], 3)
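The test above relies on make_request_callback, a test-suite helper that this page does not show. The sketch below is a hypothetical reconstruction inferred from the call sites, assuming the responses library's request-callback convention (a callback takes the request and returns a (status, headers, body) tuple), which the mock_api/POST names suggest; the real helper may differ.

import json

def make_request_callback(timeout_count, body_on_success):
    """Hypothetical helper (shown as a standalone function): build a callback
    that fails `timeout_count` times with HTTP 504 and then succeeds, plus a
    one-element list used as a mutable attempts counter."""
    attempts = [0]  # the caller reads attempts[0] after the requests ran

    def request_callback(request):
        attempts[0] += 1
        if attempts[0] <= timeout_count:
            # Simulated gateway timeout: the client's retry policy kicks in.
            return (504, {}, 'Timeout')
        return (200, {}, json.dumps(body_on_success))

    return request_callback, attempts

With timeout_count=2, a request fails twice and succeeds on the third attempt, which is exactly what the assertions above check (attempts_count_post[0] == 3).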
Example 4: ClientTest
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class ClientTest(HSTestCase):

    def test_push_job(self):
        c = self.hsclient
        job = c.push_job(self.projectid, self.spidername,
                         state='running',
                         priority=self.project.jobq.PRIO_LOW,
                         foo='baz')
        m = job.metadata
        self.assertEqual(m.get('state'), u'running', c.auth)
        self.assertEqual(m.get('foo'), u'baz')
        self.project.jobq.delete(job)
        m.expire()
        self.assertEqual(m.get('state'), u'deleted')
        self.assertEqual(m.get('foo'), u'baz')

    def test_botgroup(self):
        self.project.settings.update(botgroups=['foo'], created=millitime())
        self.project.settings.save()
        c = self.hsclient
        q1 = c.push_job(self.project.projectid, self.spidername)
        j1 = c.start_job()
        self.assertEqual(j1, None, 'got %s, pushed job was %s' % (j1, q1))
        j2 = c.start_job(botgroup='bar')
        self.assertEqual(j2, None, 'got %s, pushed job was %s' % (j2, q1))
        j3 = c.start_job(botgroup='foo')
        self.assertEqual(j3.key, q1.key)

    def test_debug_queries(self):
        self.hsclient = HubstorageClient(auth=self.auth, endpoint=self.endpoint, debug=True)
        self.assertEqual(self.hsclient.queries, [])
        self.project = self.hsclient.get_project(self.projectid)
        list(self.project.get_jobs(self.spiderid))
        self.assertEqual(len(self.hsclient.queries), 1)
        q = self.hsclient.queries[0]
        self.assertEqual(q['method'], 'GET')
        self.assertTrue(q['time'] > 0)
        self.assertTrue('url' in q)
Example 5: test_delete_on_hubstorage_api_does_not_404
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def test_delete_on_hubstorage_api_does_not_404(self):
    # NOTE: The current Hubstorage API does not raise 404 errors when deleting
    # resources that do not exist, so the retry policy does not have to catch
    # 404 errors when retrying deletes (which simplifies the implementation a lot).
    # This test checks that this assumption holds.
    client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=0)
    project = client.get_project(projectid=self.projectid)

    # Check frontier delete
    project.frontier.delete_slot('frontier_non_existing', 'slot_non_existing')

    # Check metadata delete
    job = client.push_job(self.projectid, self.spidername)
    job.metadata['foo'] = 'bar'  # add then delete a key; this triggers an API delete for item foo
    del job.metadata['foo']
    job.metadata.save()

    # Check collections delete
    store = project.collections.new_store('foo')
    store.set({'_key': 'foo'})
    store.delete('bar')

    self.assertTrue(True, "No error has been triggered by calling delete on resources that do not exist")
Example 6: main
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
import getopt
import sys

from hubstorage import HubstorageClient

def main(argv):
    apikey = ''
    project = ''
    try:
        # "hk:p:" declares -h plus -k and -p (which take values), matching
        # the long options and the branches below
        opts, args = getopt.getopt(argv, "hk:p:", ["apikey=", "project="])
    except getopt.GetoptError:
        print 'alljobs.py -k <API Key> -p <ProjectID>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'alljobs.py -k <API Key> -p <ProjectID>'
            sys.exit()
        elif opt in ("-k", "--apikey"):
            apikey = arg
        elif opt in ("-p", "--project"):
            project = arg
    hc = HubstorageClient(auth=apikey)
    project = hc.get_project(project)
    jobs_metadata = project.jobq.list()
    jobids = [j['key'] for j in jobs_metadata]
    jobidsUtf = [x.encode('UTF8') for x in jobids]
    print jobidsUtf
Example 7: HcfMiddleware
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key, default=None):
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET is supported for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non-GET requests (%s)" % request.url,
                                  log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally, if it
        #......... part of the code omitted here .........
Example 8: HcfMiddleware
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class HcfMiddleware(object):

    def __init__(self, crawler):
        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_baches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):
        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET is supported for now.
                        request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally; if it
        # didn't finish properly, there is no way to know whether all the url
        # batches were processed, so it is better not to delete them from the
        # frontier (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()
        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        #......... part of the code omitted here .........
Example 9: SystemTest
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success("all-good", "all-good")

    def test_succeed_without_close_reason(self):
        self._do_test_success(None, "no_reason")

    def test_scraper_failure(self):
        job = self._do_test_job(IOError("no more resources, ha!"), "failed")
        # MAGICN messages per log level, plus one for the last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats["totals"]["input_values"], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()["totals"]["input_values"], self.MAGICN)
        self.assertEqual(job.logs.stats()["totals"]["input_values"], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()["totals"]["input_values"], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "pending")
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "finished")
        self.assertEqual(job.metadata.get("close_reason"), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            job = self.start_job()
            self.assertFalse(job.metadata.get("stop_requested"))
            job.metadata.update(host="localhost", slot=1)
            self.assertEqual(job.metadata.get("state"), "running")
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason="failed")
                # logging from the runner must append and never remove messages
                # logged by the scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or "no_reason")

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = "GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT".split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({"uuid": idx})
                job.logs.debug("log debug %s" % idx, idx=idx)
                job.logs.info("log info %s" % idx, idx=idx)
                job.logs.warn("log warn %s" % idx, idx=idx)
                job.logs.error("log error %s" % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url="http://test.com/%d" % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)
            if isinstance(close_reason, Exception):
                raise close_reason
            if close_reason:
                job.metadata["close_reason"] = close_reason
            #......... part of the code omitted here .........
Example 10: HubstorageClient
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
# coding=UTF-8
from hubstorage import HubstorageClient

hc = HubstorageClient(auth='bc2aa25cc40f4ed4b03988e8e0b9e89e')
project = hc.get_project('53883')
itemslist = hc.get_job('53883/1/5').items.list()
itemslist_size = len(itemslist)  # number of items in the job

for element in itemslist:
    # strip the bookkeeping fields added by the scraping platform
    del element['_type']
    del element['_cached_page_id']
    del element['_template']
    # print each field name followed by its value
    for key, value in element.iteritems():
        print key
        if isinstance(value, basestring):
            print value
        else:
            print value.pop()
Example 11: test_allows_msgpack
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'hubstorage.collectionsrt.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    collections = hsclient.get_project(2222000).collections
    assert collections._allows_mpack(path) is (msgpack_available and expected_result)
Example 12: __init__
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
def __init__(self, project: str, spider: str):
    hc = HubstorageClient(auth=shub_cfg.get('apikey'))
    key = next(hc.get_project(project).jobq.list(spider=spider)).get('key')
    self.job = hc.get_job(key)
Example 13: SystemTest
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll the jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN messages per log level, plus one for the last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from the runner must append and never remove messages
            # logged by the scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()
        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
                #......... part of the code omitted here .........
Example 14: HCFStates
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class HCFStates(MemoryStates):

    def __init__(self, auth, project_id, colname, cache_size_limit, cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")
        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        nextstart = None  # pagination cursor from the previous DELETE response
        while True:
            params = {'method': 'DELETE',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                      'auth': self._hs_client.auth}
            if nextstart:
                params['prefix'] = nextstart
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            nextstart = None
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"], r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" % (response.content, len(response.content)))
            if not nextstart:
                break

    def frontier_start(self):
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        finished = False
        i = iter(to_fetch)
        while True:
            # build a query of at most 32 keys per request
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(i))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break
            if not prepared_keys:
                break
            prepared_keys.append("meta=_key")
            params = {'method': 'GET',
                      'url': 'https://storage.scrapinghub.com/collections/%d/s/%s' % (self.projectid, self._colname),
                      'params': str('&').join(prepared_keys),
                      'auth': self._hs_client.auth}
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms", (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code, response.content)
                self.logger.info(params)
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        buffer = []
        count = 0
        start = time()
        try:
            #......... part of the code omitted here .........
Example 15: HCFClientWrapper
# Required import: from hubstorage import HubstorageClient [as alias]
# Or: from hubstorage.HubstorageClient import get_project [as alias]
class HCFClientWrapper(object):

    def __init__(self, auth, project_id, frontier, batch_size=0, flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
            else:
                writer = self._hcf._get_writer(self._frontier, slot)
                writer.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        for i in range(self._hcf_retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not read from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error while reading from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        for i in range(self._hcf_retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error("Could not delete ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error("Connection error while deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error("Error deleting ids from {0}/{1} try {2}/{3}".format(
                    self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))

    def delete_slot(self, slot):
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]
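To round off, a hypothetical usage sketch for the wrapper above. The auth token, project id, frontier and slot names are placeholders, and the batch layout (a dict with an id and a list of requests) is an assumption about the hubstorage frontier read format rather than something shown on this page.

wrapper = HCFClientWrapper(auth='YOUR_API_KEY', project_id='12345',
                           frontier='test_frontier')

# Queue a fingerprint into slot '0' and push the buffered batch to the HCF.
wrapper.add_request('0', {'fp': 'http://example.com/'})
wrapper.flush()

# Drain the slot: read batches, process their requests, then acknowledge each
# batch by deleting its id so it is not served again (assumed batch format).
for batch in wrapper.read('0'):
    for request in batch['requests']:
        print(request)
    wrapper.delete('0', [batch['id']])

wrapper.close()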