This article collects typical usage examples of the Python method tornado.queues.Queue.task_done. If you are unsure exactly what Queue.task_done does, how to call it, or what it looks like in practice, the curated examples below should help. You can also read more about the class it belongs to, tornado.queues.Queue.
The following shows 13 code examples of Queue.task_done, ordered by popularity.
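All of the examples share the same underlying pattern: producers put() work items onto the queue, workers get() an item, process it, and call task_done() in a finally block so that join() resolves once every item has been both dequeued and acknowledged. Before reading them, a minimal, self-contained sketch of that pattern may help; the worker count and the sleep are illustrative only, not taken from any example.

from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

q = Queue(maxsize=10)

@gen.coroutine
def worker():
    while True:
        item = yield q.get()       # wait until an item is available
        try:
            yield gen.sleep(0.01)  # stand-in for real asynchronous work
            print('processed', item)
        finally:
            q.task_done()          # acknowledge the item, even on error

@gen.coroutine
def main():
    for _ in range(2):             # two concurrent workers
        IOLoop.current().spawn_callback(worker)
    for i in range(5):
        yield q.put(i)
    yield q.join()                 # resolves once every item is acknowledged

IOLoop.current().run_sync(main)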
Example 1: TornadoQuerierBase
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class TornadoQuerierBase(object):
def __init__(self):
self.tasks = TornadoQueue()
def gen_task(self):
        raise NotImplementedError()
def run_task(self, task):
        raise NotImplementedError()
def prepare(self):
self.running = True
def cleanup(self):
self.running = False
@coroutine
def run_worker(self, worker_id, f):
while self.tasks.qsize() > 0:
task = yield self.tasks.get()
LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
try:
                yield f(task)
except Exception as e:
LOG.warning(str(e))
finally:
self.tasks.task_done()
task = None
LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))
@coroutine
def start(self, num_workers=1):
self.prepare()
# add tasks
tasks = yield self.gen_task()
for task in tasks:
yield self.tasks.put(task)
# start shoot workers
for worker_id in range(num_workers):
LOG.debug('starting worker %d' % worker_id)
self.run_worker(worker_id, self.run_task)
yield self.tasks.join()
self.cleanup()
Example 2: TopicAppllication
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class TopicAppllication(tornado.web.Application):
def __init__(self):
handlers = [
url(r'/', MainHandler)
]
self.queue = Queue(maxsize=10)
super(TopicAppllication, self).__init__(handlers=handlers, debug=True)
@gen.coroutine
def consumer(self):
item = yield self.queue.get()
try:
            print(item)
finally:
self.queue.task_done()
Example 3: CommandQueue
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class CommandQueue():
def __init__(self):
self.queue = Queue()
@gen.coroutine
def process_command(self):
while True:
item = yield self.queue.get()
try:
yield gen.sleep(0.1)
command, view = item
view.write_message({command[0]: command[1]})
finally:
self.queue.task_done()
def put(self, item):
self.queue.put(item)
Example 4: StreamClient
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class StreamClient(object):
MAX_SIZE = 60
    def __init__(self, stream_id):
        self.id = generate_id()
        self.stream_id = stream_id
self.queue = Queue(StreamClient.MAX_SIZE)
@coroutine
def send(self, item):
yield self.queue.put(item)
@coroutine
def fetch(self):
item = yield self.queue.get()
self.queue.task_done()
return item
def empty(self):
return self.queue.qsize() == 0
Example 5: BatchedStream
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class BatchedStream(object):
""" Mostly obsolete, see BatchedSend """
def __init__(self, stream, interval):
self.stream = stream
self.interval = interval / 1000.0
self.last_transmission = default_timer()
self.send_q = Queue()
self.recv_q = Queue()
self._background_send_coroutine = self._background_send()
self._background_recv_coroutine = self._background_recv()
self._broken = None
self.pc = PeriodicCallback(lambda: None, 100)
self.pc.start()
@gen.coroutine
def _background_send(self):
with log_errors():
while True:
msg = yield self.send_q.get()
if msg == "close":
break
msgs = [msg]
now = default_timer()
wait_time = self.last_transmission + self.interval - now
if wait_time > 0:
yield gen.sleep(wait_time)
while not self.send_q.empty():
msgs.append(self.send_q.get_nowait())
try:
yield write(self.stream, msgs)
except StreamClosedError:
self.recv_q.put_nowait("close")
self._broken = True
break
if len(msgs) > 1:
logger.debug("Batched messages: %d", len(msgs))
for _ in msgs:
self.send_q.task_done()
@gen.coroutine
def _background_recv(self):
with log_errors():
while True:
try:
msgs = yield read(self.stream)
except StreamClosedError:
self.recv_q.put_nowait("close")
self.send_q.put_nowait("close")
self._broken = True
break
assert isinstance(msgs, list)
if len(msgs) > 1:
logger.debug("Batched messages: %d", len(msgs))
for msg in msgs:
self.recv_q.put_nowait(msg)
@gen.coroutine
def flush(self):
yield self.send_q.join()
@gen.coroutine
def send(self, msg):
if self._broken:
raise StreamClosedError("Batch Stream is Closed")
else:
self.send_q.put_nowait(msg)
@gen.coroutine
def recv(self):
result = yield self.recv_q.get()
if result == "close":
raise StreamClosedError("Batched Stream is Closed")
else:
raise gen.Return(result)
@gen.coroutine
def close(self):
yield self.flush()
raise gen.Return(self.stream.close())
def closed(self):
return self.stream.closed()
Example 6: BlogBackup
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
# ......... some code omitted here .........
self._generate_save_dir()
def _get_user_cookies(self):
url = target_url + login_page_path
self.driver.get(url)
try:
user_input = self.driver.find_element_by_name('mail')
passwd_input = self.driver.find_element_by_name('password')
submit_btn = self.driver.find_element_by_class_name('pr20')
except NoSuchElementException:
            raise PageHtmlChanged(
                "%s login page structure has changed!" % _domain)
user_input.send_keys(self.username)
passwd_input.send_keys(self.passwd)
submit_btn.click()
try:
WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
except TimeoutException:
raise Exception("Wrong username or password!")
WebDriverWait(self.driver, timeout=10).until(has_page_load)
try_times = 0
while True:
time.sleep(1)
if url != self.driver.current_url:
return self.driver.get_cookies()
try_times += 1
if try_times > 10:
raise Exception("Getting cookie info failed!")
def _get_driver(self):
if self.phantomjs_path:
try:
return webdriver.PhantomJS(
executable_path=self.phantomjs_path,
service_log_path=os.path.devnull)
except WebDriverException:
raise PhantomjsPathError("Phantomjs locate path invalid!")
else:
return webdriver.PhantomJS(service_log_path=os.path.devnull)
def __init__(self, **conf):
self.username = conf['username']
self.passwd = conf['passwd']
self.phantomjs_path = conf.get('phantomjs_path')
self.save_path = conf.get('save_path')
self._q = Queue()
self._parse_save_path()
self.driver = self._get_driver()
self._cookies = self._get_user_cookies()
@gen.coroutine
def run(self):
self.__filter_cookies()
start_url = target_url + blog_path
yield self._fetch_blog_list_page(start_url)
for _ in xrange(cpu_count()):
self._fetch_essay_content()
yield self._q.join()
def __filter_cookies(self):
self._cookies = {k['name']: k['value'] for k in self._cookies if
k['domain'] == _domain}
@gen.coroutine
def _fetch_blog_list_page(self, page_link):
ret = requests.get(page_link, cookies=self._cookies)
d = pq(ret.text)
link_elements = d('.stream-list__item > .summary > h2 > a')
for link in link_elements:
yield self._q.put(d(link).attr('href'))
next_ele = d('.pagination li.next a')
if next_ele:
next_page_url = target_url + next_ele.attr('href')
            yield self._fetch_blog_list_page(next_page_url)
@gen.coroutine
def _fetch_essay_content(self):
while True:
try:
essay_path = yield self._q.get(timeout=1)
essay_url = target_url + essay_path + edit_suffix
ret = requests.get(essay_url, cookies=self._cookies)
d = pq(ret.text)
title = d("#myTitle").val()
content = d("#myEditor").text()
file_name = title + '.md'
real_file_name = os.path.join(self.save_path, file_name)
with open(real_file_name, 'w') as f:
f.writelines(content.encode('utf8'))
except gen.TimeoutError:
raise gen.Return()
finally:
self._q.task_done()
Example 7: ProjectGroomer
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class ProjectGroomer(object):
""" Cleans up expired transactions for a project. """
def __init__(self, project_id, coordinator, zk_client, db_access,
thread_pool):
""" Creates a new ProjectGroomer.
Args:
project_id: A string specifying a project ID.
coordinator: A GroomingCoordinator.
zk_client: A KazooClient.
db_access: A DatastoreProxy.
thread_pool: A ThreadPoolExecutor.
"""
self.project_id = project_id
self._coordinator = coordinator
self._zk_client = zk_client
self._tornado_zk = TornadoKazoo(self._zk_client)
self._db_access = db_access
self._thread_pool = thread_pool
self._project_node = '/appscale/apps/{}'.format(self.project_id)
self._containers = []
self._inactive_containers = set()
self._batch_resolver = BatchResolver(self.project_id, self._db_access)
self._zk_client.ensure_path(self._project_node)
self._zk_client.ChildrenWatch(self._project_node, self._update_containers)
self._txid_manual_offset = 0
self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
self._zk_client.DataWatch(self._offset_node, self._update_offset)
self._stop_event = AsyncEvent()
self._stopped_event = AsyncEvent()
# Keeps track of cleanup results for each round of grooming.
self._txids_cleaned = 0
self._oldest_valid_tx_time = None
self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
for _ in range(MAX_CONCURRENCY):
IOLoop.current().spawn_callback(self._worker)
IOLoop.current().spawn_callback(self.start)
@gen.coroutine
def start(self):
""" Starts the grooming process until the stop event is set. """
logger.info('Grooming {}'.format(self.project_id))
while True:
if self._stop_event.is_set():
break
try:
yield self._groom_project()
except Exception:
# Prevent the grooming loop from stopping if an error is encountered.
logger.exception(
'Unexpected error while grooming {}'.format(self.project_id))
yield gen.sleep(MAX_TX_DURATION)
self._stopped_event.set()
@gen.coroutine
def stop(self):
""" Stops the grooming process. """
logger.info('Stopping grooming process for {}'.format(self.project_id))
self._stop_event.set()
yield self._stopped_event.wait()
@gen.coroutine
def _worker(self):
""" Processes items in the worker queue. """
while True:
tx_path, composite_indexes = yield self._worker_queue.get()
try:
tx_time = yield self._resolve_txid(tx_path, composite_indexes)
if tx_time is None:
self._txids_cleaned += 1
if tx_time is not None and tx_time < self._oldest_valid_tx_time:
self._oldest_valid_tx_time = tx_time
finally:
self._worker_queue.task_done()
def _update_offset(self, new_offset, _):
""" Watches for updates to the manual offset node.
Args:
new_offset: A string specifying the new manual offset.
"""
self._txid_manual_offset = int(new_offset or 0)
def _update_containers(self, nodes):
""" Updates the list of active txid containers.
Args:
nodes: A list of strings specifying ZooKeeper nodes.
"""
counters = [int(node[len(CONTAINER_PREFIX):] or 1)
        # ......... some code omitted here .........
Example 8: __init__
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class Model:
def __init__(self, config_file):
self.lock = locks.Lock()
self.classification_queue = Queue()
print('loading config %s' % config_file, file=log.v5)
# Load and setup config
try:
self.config = Config.Config()
self.config.load_file(config_file)
self.pause_after_first_seq = self.config.float('pause_after_first_seq', 0.2)
self.batch_size = self.config.int('batch_size', 5000)
self.max_seqs = self.config.int('max_seqs', -1)
except Exception:
print('Error: loading config %s failed' % config_file, file=log.v1)
raise
try:
self.devices = self._init_devices()
except Exception:
print('Error: Loading devices for config %s failed' % config_file, file=log.v1)
raise
print('Starting engine for config %s' % config_file, file=log.v5)
self.engine = Engine.Engine(self.devices)
try:
self.engine.init_network_from_config(config=self.config)
except Exception:
print('Error: Loading network for config %s failed' % config_file, file=log.v1)
raise
IOLoop.current().spawn_callback(self.classify_in_background)
self.last_used = datetime.datetime.now()
def _init_devices(self):
"""
        Initializes the required devices for a config. Same as the function initDevices in
        rnn.py.
:param config:
:return: A list with the devices used.
"""
oldDeviceConfig = ",".join(self.config.list('device', ['default']))
if "device" in TheanoFlags:
# This is important because Theano likely already has initialized that device.
            self.config.set("device", TheanoFlags["device"])
print("Devices: Use %s via THEANO_FLAGS instead of %s." % (TheanoFlags["device"], oldDeviceConfig), file=log.v4)
devArgs = get_devices_init_args(self.config)
assert len(devArgs) > 0
devices = [Device(**kwargs) for kwargs in devArgs]
for device in devices:
while not device.initialized:
time.sleep(0.25)
if devices[0].blocking:
print("Devices: Used in blocking / single proc mode.", file=log.v4)
else:
print("Devices: Used in multiprocessing mode.", file=log.v4)
return devices
@tornado.gen.coroutine
def classify_in_background(self):
while True:
requests = []
# fetch first request
r = yield self.classification_queue.get()
requests.append(r)
# grab all other waiting requests
try:
while True:
requests.append(self.classification_queue.get_nowait())
except QueueEmpty:
pass
output_dim = {}
# Do dataset creation and classification.
dataset = StaticDataset(data=[r.data for r in requests], output_dim=output_dim)
dataset.init_seq_order()
batches = dataset.generate_batches(recurrent_net=self.engine.network.recurrent,
batch_size=self.batch_size, max_seqs=self.max_seqs)
with (yield self.lock.acquire()):
ctt = ForwardTaskThread(self.engine.network, self.devices, dataset, batches)
yield ctt.join()
try:
for i in range(dataset.num_seqs):
requests[i].future.set_result(ctt.result[i])
self.classification_queue.task_done()
except Exception as e:
print('exception', e)
raise
@tornado.gen.coroutine
def classify(self, data):
self.last_used = datetime.datetime.now()
request = ClassificationRequest(data)
yield self.classification_queue.put(request)
yield request.future
# ......... some code omitted here .........
Example 9: SubscribeListener
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class SubscribeListener(SubscribeCallback):
def __init__(self):
self.connected = False
self.connected_event = Event()
self.disconnected_event = Event()
self.presence_queue = Queue()
self.message_queue = Queue()
self.error_queue = Queue()
def status(self, pubnub, status):
if utils.is_subscribed_event(status) and not self.connected_event.is_set():
self.connected_event.set()
elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set():
self.disconnected_event.set()
elif status.is_error():
self.error_queue.put_nowait(status.error_data.exception)
def message(self, pubnub, message):
self.message_queue.put(message)
def presence(self, pubnub, presence):
self.presence_queue.put(presence)
@tornado.gen.coroutine
def _wait_for(self, coro):
error = self.error_queue.get()
wi = tornado.gen.WaitIterator(coro, error)
while not wi.done():
result = yield wi.next()
if wi.current_future == coro:
raise gen.Return(result)
elif wi.current_future == error:
raise result
else:
raise Exception("Unexpected future resolved: %s" % str(wi.current_future))
@tornado.gen.coroutine
def wait_for_connect(self):
if not self.connected_event.is_set():
yield self._wait_for(self.connected_event.wait())
else:
raise Exception("instance is already connected")
@tornado.gen.coroutine
def wait_for_disconnect(self):
if not self.disconnected_event.is_set():
yield self._wait_for(self.disconnected_event.wait())
else:
raise Exception("instance is already disconnected")
@tornado.gen.coroutine
def wait_for_message_on(self, *channel_names):
channel_names = list(channel_names)
while True:
try: # NOQA
env = yield self._wait_for(self.message_queue.get())
if env.channel in channel_names:
raise tornado.gen.Return(env)
else:
continue
finally:
self.message_queue.task_done()
@tornado.gen.coroutine
def wait_for_presence_on(self, *channel_names):
channel_names = list(channel_names)
while True:
try:
try:
env = yield self._wait_for(self.presence_queue.get())
except: # NOQA E722 pylint: disable=W0702
break
if env.channel in channel_names:
raise tornado.gen.Return(env)
else:
continue
finally:
self.presence_queue.task_done()
Example 10: BlogBackup
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class BlogBackup(object):
_default_dir_name = "seg_blog_backup"
def _generate_save_dir(self):
cur_dir = os.path.dirname(__file__)
self.save_path = os.path.join(cur_dir, self._default_dir_name)
if not os.path.isdir(self.save_path):
os.mkdir(self.save_path)
def _parse_save_path(self):
if self.save_path:
if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
return
else:
                raise BlogSavePathError("'%s' does not exist or is not a directory!" % self.save_path)
else:
self._generate_save_dir()
@staticmethod
def parse_token_from_html(content):
overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
overall_res = overall_pat.search(content)
if overall_res:
overall_content = overall_res.group()
# remove /* */ type annotation
filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
ret = "".join(str_list)
if filter_list:
for m, n in filter_list:
ret = ret[: int(m)] + ret[int(n) :]
if len(ret) == 32:
return ret
raise PageHtmlChanged("website login token has changed")
def _get_user_cookies(self):
s = requests.Session()
s.headers.update(headers)
rep = s.get(target_url)
post_url = "%s%s?_=%s" % (target_url, login_api_path, self.parse_token_from_html(rep.text))
data = {"mail": self.username, "password": self.passwd}
s.post(post_url, data=data)
return s.cookies
def __init__(self, **conf):
self.username = conf["username"]
self.passwd = conf["passwd"]
self.save_path = conf.get("save_path")
self._q = Queue()
self._cookies = self._get_user_cookies()
self._parse_save_path()
@gen.coroutine
def run(self):
start_url = target_url + blog_path
yield self._fetch_blog_list_page(start_url)
for _ in xrange(cpu_count()):
self._fetch_essay_content()
yield self._q.join()
@gen.coroutine
def _fetch_blog_list_page(self, page_link):
ret = requests.get(page_link, cookies=self._cookies)
d = pq(ret.text)
link_elements = d(".stream-list__item > .summary > h2 > a")
for link in link_elements:
yield self._q.put(d(link).attr("href"))
next_ele = d(".pagination li.next a")
if next_ele:
next_page_url = target_url + next_ele.attr("href")
            yield self._fetch_blog_list_page(next_page_url)
@gen.coroutine
def _fetch_essay_content(self):
while True:
try:
essay_path = yield self._q.get(timeout=1)
essay_url = target_url + essay_path + edit_suffix
ret = requests.get(essay_url, cookies=self._cookies)
d = pq(ret.text)
title = d("#myTitle").val()
content = d("#myEditor").text()
real_file_name = os.path.join(self.save_path, title + ".md")
            logger.info("backing up essay: %s" % title)
with open(real_file_name, "w") as f:
f.writelines(content.encode("utf8"))
except gen.TimeoutError:
raise gen.Return()
finally:
self._q.task_done()
Example 11: Scraper
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class Scraper():
def __init__(
self,
destinations=None,
transform=None,
headers={},
max_clients=50,
maxsize=50,
connect_timeout=1200,
request_timeout=600,):
"""Instantiate a tornado async http client to do multiple concurrent requests"""
if None in [destinations, transform]:
            sys.stderr.write('You must pass both a collection of URLs and a transform function')
raise SystemExit
self.max_clients = max_clients
self.maxsize = maxsize
self.connect_timeout = connect_timeout
self.request_timeout = request_timeout
AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
self.http_client = AsyncHTTPClient()
self.queue = Queue(maxsize=50)
self.destinations = destinations
self.transform = transform
self.headers = headers
self.read(self.destinations)
self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout, self.http_client)
self.loop = ioloop.IOLoop.current()
self.join_future = self.queue.join()
def done(future):
self.loop.stop()
self.join_future.add_done_callback(done)
self.loop.start()
@gen.coroutine
def read(self, destinations):
for url in destinations:
yield self.queue.put(url)
@gen.coroutine
def get(self, transform, headers, connect_timeout, request_timeout, http_client):
while True:
url = yield self.queue.get()
try:
request = HTTPRequest(url,
connect_timeout=connect_timeout,
request_timeout=request_timeout,
method="GET",
headers = headers
)
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))
                self.queue.task_done()  # skip this URL but still acknowledge it so join() can resolve
                continue
            future = self.http_client.fetch(request)
def done_callback(future):
body = future.result().body
url = future.result().effective_url
transform(body, url=url)
self.queue.task_done()
try:
future.add_done_callback(done_callback)
except Exception as e:
sys.stderr.write(str(e))
                self.queue.put(url)
Example 12: Client
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class Client(object):
def __init__(self, server, name, stream):
self.server = server
self.name = name
self.rooms = {}
self.stream = stream
self.inqueue = Queue(maxsize=QUEUE_SIZE)
self.outqueue = Queue(maxsize=QUEUE_SIZE)
@coroutine
def forwarding(self):
while True:
msg = yield self.outqueue.get()
if msg.command == COMMAND_QUIT:
for _, room in self.rooms.items():
yield room.inqueue.put(msg)
elif msg.command == COMMAND_JOIN:
room_name = msg.receiver
room = self.server.get_room(room_name)
self.rooms[room_name] = room
yield room.inqueue.put(msg)
else:
room = self.rooms[msg.receiver]
yield room.inqueue.put(msg)
self.outqueue.task_done()
@coroutine
def response(self):
global SPEED
while True:
msg = yield self.inqueue.get()
if msg.command == COMMAND_QUIT:
self.stream.close()
return
else:
response = ("%s %s:%s\n" % (datetime.datetime.now(),
msg.sender.name,
msg.content.decode()))\
.encode('utf-8')
try:
SPEED += 1
yield self.stream.write(response)
except Exception as e:
logging.debug(str(e))
self.stream.close()
@coroutine
def receive(self):
while True:
try:
line = yield self.stream.read_until(b'\n')
except Exception as e:
logging.debug(str(e))
msg = Message(self, '', COMMAND_QUIT, 'CONNECTION ERROR')
yield self.outqueue.put(msg)
return
data = line.strip().split(b' ')
if len(data) != 2:
continue
room_name, content = data[0], data[1]
if room_name in self.rooms:
msg = Message(self, room_name, COMMAND_NORMAL, content)
else:
msg = Message(self, room_name, COMMAND_JOIN, content)
yield self.outqueue.put(msg)
Example 13: BaseSpider
# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import task_done [as alias]
class BaseSpider(object):
url_parser = None
def __init__(self, engine, concurrent=3):
self.engine = engine
self.http = httpclient.AsyncHTTPClient()
self.queue = Queue()
self.concurrency = concurrent
@property
def hostname(self):
return self.url_parser.hostname
@property
def url_root(self):
return self.url_parser.url_root
@property
def base_url(self):
return self.url_parser.base_url
@gen.coroutine
def __worker(self):
"""Consumes the queue."""
while True:
yield self.fetch_url()
@gen.coroutine
def crawl(self, description, location):
"""Starts crawling the specified URL."""
url = self.url_parser(description, location)
self.queue.put(url)
self.engine.notify_started(self)
for _ in range(self.concurrency):
self.__worker()
yield self.queue.join()
self.engine.notify_finished(self)
@gen.coroutine
def fetch_url(self):
"""Retrieves a URL from the queue and returns the parsed data."""
url = yield self.queue.get()
logger.info('fetching %s' % url)
try:
response = yield self.http.fetch(url)
soup = BeautifulSoup(response.body)
logger.info('got response %s' % url)
urls = yield self.fetch_links(response, soup)
for new_url in urls:
logger.debug('Added %s to queue' % new_url)
yield self.queue.put(new_url)
data = yield self.parse_response(response, soup)
logger.info('Parsed response for %s' % url)
except (httpclient.HTTPError, ValueError):
message = 'HTTP Error: (%s)' % url
self.engine.write_message(message, self.engine.STATUS_ERROR)
else:
self.engine.write_data(data)
finally:
self.queue.task_done()
@gen.coroutine
def fetch_links(self, response, soup):
"""Fetch URLs to be added to the queue."""
raise gen.Return([])
def parse_response(self, response, soup):
"""Extract information from the response, return should be a
list of dict's.
Sample dict:
{
'title': 'Job Title',
'company': 'Company Name',
'location': 'City/State/Country',
'tags': ['tag1', 'tag2', 'tag3'],
'category': 'Software Developer',
'origin': 'Name of the origin website',
'url': 'Link to the complete job description',
}
"""
raise NotImplementedError
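A final caveat that every example above relies on implicitly: task_done() must be called exactly once per dequeued item. Tornado raises ValueError when it is called more times than there were items placed in the queue, and join() only resolves once the count of unfinished tasks drops back to zero. A minimal sketch of that behavior:

from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

@gen.coroutine
def demo():
    q = Queue()
    q.put_nowait('job')   # one unfinished task; join() would block here
    q.task_done()         # balances the put_nowait
    yield q.join()        # resolves immediately now
    try:
        q.task_done()     # one call too many
    except ValueError as e:
        print('as documented:', e)

IOLoop.current().run_sync(demo)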