This article collects typical usage examples of the Python method tornado.queues.Queue.join. If you have been wondering what Queue.join does, how to call it, or what real-world uses of it look like, the curated examples below should help. You can also explore the class that provides the method, tornado.queues.Queue, for more context.
A total of 13 code examples of Queue.join are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
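Before diving into the examples, here is a minimal, self-contained sketch of the pattern most of them follow: producer coroutines put items onto a tornado.queues.Queue, a fixed pool of worker coroutines consumes them and calls task_done(), and the caller waits on Queue.join() until every item has been processed. This sketch is illustrative only and is not taken from any project below; names such as NUM_WORKERS are arbitrary.

from tornado import gen, ioloop
from tornado.queues import Queue

NUM_WORKERS = 3
q = Queue(maxsize=10)

@gen.coroutine
def producer():
    for item in range(20):
        yield q.put(item)              # blocks when the queue is full

@gen.coroutine
def worker():
    while True:
        item = yield q.get()
        try:
            print("processing", item)  # real work would happen here
        finally:
            q.task_done()              # join() resolves once every get() is matched by task_done()

@gen.coroutine
def main():
    for _ in range(NUM_WORKERS):
        ioloop.IOLoop.current().spawn_callback(worker)
    yield producer()                   # wait until everything is enqueued
    yield q.join()                     # wait until every item has been processed

if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(main)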
Example 1: run
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
def run(args):
    if not args.test:
        ip_iter = _create_ip_iterator()
    else:
        ip_iter = _get_test_ips()

    good_ips = []
    job_queue = Queue(maxsize=200)
    start = time.time()
    counter = Counter()

    @gen.coroutine
    def job_producer():
        for ip in ip_iter:
            yield job_queue.put(ip)
            #print("Put {}".format(ip))

    @gen.coroutine
    def worker(id):
        while True:
            ip = yield job_queue.get()
            try:
                good = yield test_ip(ip)
                counter['all'] += 1
                if args.progress:
                    if counter['all'] % 10000 == 0:
                        print("Tested {} ips.".format(counter['all']))
                if good:
                    print("Found good ip: {}".format(ip))
                    counter['good'] += 1
                    if not args.test:
                        yield record_good_ip(ip)
                    else:
                        good_ips.append(ip)
            finally:
                job_queue.task_done()

    for i in range(CONCURRENCY):
        worker(i)

    _disable_logging()
    try:
        yield job_producer()
        yield job_queue.join()
    finally:
        print("\n\nTested: {} ips\nFound {} good ips\nQps: {}".format(
            counter['all'],
            counter['good'],
            counter['all'] / (time.time() - start)
        ))
        if args.test and args.remove:
            with open(GOOD_IP_FILE + '_removed', 'w') as f:
                f.write('|'.join(good_ips))
Example 2: TornadoQuerierBase
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class TornadoQuerierBase(object):

    def __init__(self):
        self.tasks = TornadoQueue()

    def gen_task(self):
        raise NotImplementedError()

    def run_task(self, task):
        raise NotImplementedError()

    def prepare(self):
        self.running = True

    def cleanup(self):
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
                task = None
        LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))

    @coroutine
    def start(self, num_workers=1):
        self.prepare()

        # add tasks
        tasks = yield self.gen_task()
        for task in tasks:
            yield self.tasks.put(task)

        # start shoot workers
        for worker_id in range(num_workers):
            LOG.debug('starting worker %d' % worker_id)
            self.run_worker(worker_id, self.run_task)

        yield self.tasks.join()
        self.cleanup()
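A minimal sketch of how a subclass of TornadoQuerierBase above might be driven, assuming the class (and the names it relies on, such as TornadoQueue, coroutine, and LOG) is importable from the module above; MyQuerier, the task strings, and the print call are hypothetical stand-ins for real work:

from tornado import gen
from tornado.ioloop import IOLoop

class MyQuerier(TornadoQuerierBase):             # hypothetical subclass
    @gen.coroutine
    def gen_task(self):
        raise gen.Return(['task-1', 'task-2'])   # tasks to enqueue

    @gen.coroutine
    def run_task(self, task):
        print('running', task)                   # real work would go here

# start() enqueues the tasks, spawns the workers, and waits on tasks.join()
IOLoop.current().run_sync(lambda: MyQuerier().start(num_workers=2))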
Example 3: get_file_list
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
def get_file_list(account, **kwargs):
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = set()

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = working.__len__()
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode('utf-8'))
            for file in response_data:
                # be sure we're a valid file type and less than our maximum response size limit
                extension = file['path'].lower().split('.')[-1]
                if extension in VALID_FILETYPES and int(file['bytes']) < RESPONSE_SIZE_LIMIT * 1000000:
                    data.add((file['path'].lstrip('/'), file['path'], ))
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        file_type = '.'.join([file_type])
        url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(file_type)
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))

    app_log.info("Finished list retrieval. Found {} items.".format(data.__len__()))
    return sorted([{"title": title, "value": path} for title, path in data], key=lambda f: f['title'])
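One detail worth noting in the example above: Queue.join is called with a timeout, and the future it returns raises a timeout error (gen.TimeoutError in the Tornado versions these examples target) if the deadline passes before every task_done() call has arrived; Example 13 below catches it, while this example lets it propagate. A small, hedged sketch of the defensive pattern follows; wait_for_queue and the log message are illustrative, not part of the example above.

from datetime import timedelta
from tornado import gen
from tornado.log import app_log

@gen.coroutine
def wait_for_queue(queue, seconds=30):
    try:
        yield queue.join(timeout=timedelta(seconds=seconds))
    except gen.TimeoutError:
        # Not every get() was matched by task_done() in time; decide whether
        # to keep waiting, cancel the workers, or return a partial result.
        app_log.warning("queue.join timed out after %s seconds", seconds)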
Example 4: main
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
def main():
    cocurrency = 10
    queue = Queue()
    queue.put("http://www.jianshu.com")
    workers = []
    for _ in range(cocurrency):
        workers.append(Worker(app, queue))
    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()
    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
Example 5: BatchedStream
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """

    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.0
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None
        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == "close":
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())
                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self._broken = True
                    break
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self.send_q.put_nowait("close")
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError("Batch Stream is Closed")
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == "close":
            raise StreamClosedError("Batched Stream is Closed")
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
Example 6: SQSDrain
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class SQSDrain(object):
    """Implementation of IDrain that writes to an AWS SQS queue.
    """

    def __init__(self, logger, loop, sqs_client,
                 metric_prefix='emitter'):
        self.emitter = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.output_error = Event()
        self.state = RUNNING
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self._send_queue = Queue()
        self._should_flush_queue = Event()
        self._flush_handle = None
        self.loop.spawn_callback(self._onSend)

    @gen.coroutine
    def _flush_send_batch(self, batch_size):
        send_batch = [
            self._send_queue.get_nowait()
            for pos in range(min(batch_size, self.emitter.max_messages))
        ]
        try:
            response = yield self.emitter.send_message_batch(*send_batch)
        except SQSError as err:
            self.logger.exception('Error encountered flushing data to SQS: %s',
                                  err)
            self.output_error.set()
            for msg in send_batch:
                self._send_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.output_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to send: %s', req.Id)
                    self._send_queue.put_nowait(req)

    @gen.coroutine
    def _onSend(self):
        respawn = True
        while respawn:
            qsize = self._send_queue.qsize()
            # This will keep flushing until clear,
            # including items that show up in between flushes
            while qsize > 0:
                try:
                    yield self._flush_send_batch(qsize)
                except Exception as err:
                    self.logger.exception(err)
                    self.output_error.set()
                qsize = self._send_queue.qsize()
            # We've cleared the backlog, remove any possible future flush
            if self._flush_handle:
                self.loop.remove_timeout(self._flush_handle)
                self._flush_handle = None
            self._should_flush_queue.clear()
            yield self._should_flush_queue.wait()

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        yield self._send_queue.join(timeout)

    def emit_nowait(self, msg):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
            raise QueueFull()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        self.logger.debug("Drain emitting")
        self._send_queue.put_nowait(msg)

    @gen.coroutine
    def emit(self, msg, timeout=None):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        yield self._send_queue.put(msg, timeout)
Example 7: BlogBackup
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class BlogBackup(object):
    _default_dir_name = 'seg_blog_backup'

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and \
                    os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError(
                    "'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    def _get_user_cookies(self):
        url = target_url + login_page_path
        self.driver.get(url)
        try:
            user_input = self.driver.find_element_by_name('mail')
            passwd_input = self.driver.find_element_by_name('password')
            submit_btn = self.driver.find_element_by_class_name('pr20')
        except NoSuchElementException:
            raise PageHtmlChanged(
                "%s login page structure have changed!" % _domain)
        user_input.send_keys(self.username)
        passwd_input.send_keys(self.passwd)
        submit_btn.click()
        try:
            WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
        except TimeoutException:
            raise Exception("Wrong username or password!")
        WebDriverWait(self.driver, timeout=10).until(has_page_load)
        try_times = 0
        while True:
            time.sleep(1)
            if url != self.driver.current_url:
                return self.driver.get_cookies()
            try_times += 1
            if try_times > 10:
                raise Exception("Getting cookie info failed!")

    def _get_driver(self):
        if self.phantomjs_path:
            try:
                return webdriver.PhantomJS(
                    executable_path=self.phantomjs_path,
                    service_log_path=os.path.devnull)
            except WebDriverException:
                raise PhantomjsPathError("Phantomjs locate path invalid!")
        else:
            return webdriver.PhantomJS(service_log_path=os.path.devnull)

    def __init__(self, **conf):
        self.username = conf['username']
        self.passwd = conf['passwd']
        self.phantomjs_path = conf.get('phantomjs_path')
        self.save_path = conf.get('save_path')
        self._q = Queue()
        self._parse_save_path()
        self.driver = self._get_driver()
        self._cookies = self._get_user_cookies()

    @gen.coroutine
    def run(self):
        self.__filter_cookies()
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()
        yield self._q.join()

    def __filter_cookies(self):
        self._cookies = {k['name']: k['value'] for k in self._cookies if
                         k['domain'] == _domain}

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d('.stream-list__item > .summary > h2 > a')
        for link in link_elements:
            yield self._q.put(d(link).attr('href'))
        next_ele = d('.pagination li.next a')
        if next_ele:
            next_page_url = target_url + next_ele.attr('href')
            self._fetch_blog_list_page(next_page_url)
#......... part of the code omitted here .........
Example 8: ProjectGroomer
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class ProjectGroomer(object):
    """ Cleans up expired transactions for a project. """

    def __init__(self, project_id, coordinator, zk_client, db_access,
                 thread_pool):
        """ Creates a new ProjectGroomer.

        Args:
          project_id: A string specifying a project ID.
          coordinator: A GroomingCoordinator.
          zk_client: A KazooClient.
          db_access: A DatastoreProxy.
          thread_pool: A ThreadPoolExecutor.
        """
        self.project_id = project_id
        self._coordinator = coordinator
        self._zk_client = zk_client
        self._tornado_zk = TornadoKazoo(self._zk_client)
        self._db_access = db_access
        self._thread_pool = thread_pool
        self._project_node = '/appscale/apps/{}'.format(self.project_id)
        self._containers = []
        self._inactive_containers = set()
        self._batch_resolver = BatchResolver(self.project_id, self._db_access)

        self._zk_client.ensure_path(self._project_node)
        self._zk_client.ChildrenWatch(self._project_node, self._update_containers)

        self._txid_manual_offset = 0
        self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
        self._zk_client.DataWatch(self._offset_node, self._update_offset)

        self._stop_event = AsyncEvent()
        self._stopped_event = AsyncEvent()

        # Keeps track of cleanup results for each round of grooming.
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = None

        self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
        for _ in range(MAX_CONCURRENCY):
            IOLoop.current().spawn_callback(self._worker)

        IOLoop.current().spawn_callback(self.start)

    @gen.coroutine
    def start(self):
        """ Starts the grooming process until the stop event is set. """
        logger.info('Grooming {}'.format(self.project_id))
        while True:
            if self._stop_event.is_set():
                break

            try:
                yield self._groom_project()
            except Exception:
                # Prevent the grooming loop from stopping if an error is encountered.
                logger.exception(
                    'Unexpected error while grooming {}'.format(self.project_id))

            yield gen.sleep(MAX_TX_DURATION)

        self._stopped_event.set()

    @gen.coroutine
    def stop(self):
        """ Stops the grooming process. """
        logger.info('Stopping grooming process for {}'.format(self.project_id))
        self._stop_event.set()
        yield self._stopped_event.wait()

    @gen.coroutine
    def _worker(self):
        """ Processes items in the worker queue. """
        while True:
            tx_path, composite_indexes = yield self._worker_queue.get()
            try:
                tx_time = yield self._resolve_txid(tx_path, composite_indexes)
                if tx_time is None:
                    self._txids_cleaned += 1

                if tx_time is not None and tx_time < self._oldest_valid_tx_time:
                    self._oldest_valid_tx_time = tx_time
            finally:
                self._worker_queue.task_done()

    def _update_offset(self, new_offset, _):
        """ Watches for updates to the manual offset node.

        Args:
          new_offset: A string specifying the new manual offset.
        """
        self._txid_manual_offset = int(new_offset or 0)

    def _update_containers(self, nodes):
        """ Updates the list of active txid containers.

        Args:
          nodes: A list of strings specifying ZooKeeper nodes.
        """
        counters = [int(node[len(CONTAINER_PREFIX):] or 1)
#......... part of the code omitted here .........
Example 9: main
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
def main():
    # Start consumer without waiting
    # Tornado framework used for async IO
    # http://www.tornadoweb.org/en/stable/index.html
    q = Queue()

    @gen.coroutine
    def consumer():
        item = yield q.get()
        try:
            code = False
            try:
                response = yield httpclient.AsyncHTTPClient().fetch(item)
                codes = ['200', '301', '302']
                code = any(s in response.headers['Status'] for s in codes)
                rcode = response.code
                if DEBUG:
                    fname = re.match(r'http://([\w+|.]+)/', item).group(1)
                    fname = os.path.join(DEBUG_DIR, fname.replace(".", "_"))
                    with open(fname, 'w') as f:
                        for k, v in response.headers.get_all():
                            f.write(k + ' ' + v + '\n')
                        f.write('\n')
                        f.write(response.body)
                        f.close()
            except Exception as e:
                code = False
                rcode = str(e)
            print('%s,%s,%s,"%s"' %
                  (datetime.now(), item, code, rcode))
            # Append to DOMAINS found URL
            if code:
                DOMAINS[RESULT[item]].append(item)
        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield consumer()

    @gen.coroutine
    def producer():
        if DEBUG and not os.path.exists(DEBUG_DIR):
            print('Creating debug out dir: %s' % DEBUG_DIR)
            os.makedirs(DEBUG_DIR)
        # Open and process file if supplied
        if len(sys.argv) >= 2:
            with open(sys.argv[1]) as f:
                for line in f:
                    DOMAINS[line.strip()] = []
        else:
            print("Domains list file wasn't provided")
            print("Usage: %s <domains.txt> [ report.txt ]" % sys.argv[0])
            sys.exit(2)
        # Generate processing list
        for d in DOMAINS.keys():
            for url in generate_url_list(d):
                q.put(url)

    yield producer()  # Wait for producer to put all tasks.
    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join()  # Wait for consumer to finish all tasks.

    # Out results
    if len(sys.argv) >= 3:
        f = open(sys.argv[2], 'w')
    else:
        f = sys.stdout
    for key, val in DOMAINS.items():
        if DOMAINS[key]:
            DOMAINS[key] = '"' + " ".join(val) + '"'
        else:
            DOMAINS[key] = 'No'
    out = "\n".join([",".join([key, str(val)]) for key, val in DOMAINS.items()]) + '\n'
    f.write(out)
Example 10: BlogBackup
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class BlogBackup(object):
    _default_dir_name = "seg_blog_backup"

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    @staticmethod
    def parse_token_from_html(content):
        overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
        overall_res = overall_pat.search(content)
        if overall_res:
            overall_content = overall_res.group()
            # remove /* */ type annotation
            filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
            str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
            filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
            ret = "".join(str_list)
            if filter_list:
                for m, n in filter_list:
                    ret = ret[: int(m)] + ret[int(n) :]
            if len(ret) == 32:
                return ret
        raise PageHtmlChanged("website login token has changed")

    def _get_user_cookies(self):
        s = requests.Session()
        s.headers.update(headers)
        rep = s.get(target_url)
        post_url = "%s%s?_=%s" % (target_url, login_api_path, self.parse_token_from_html(rep.text))
        data = {"mail": self.username, "password": self.passwd}
        s.post(post_url, data=data)
        return s.cookies

    def __init__(self, **conf):
        self.username = conf["username"]
        self.passwd = conf["passwd"]
        self.save_path = conf.get("save_path")
        self._q = Queue()
        self._cookies = self._get_user_cookies()
        self._parse_save_path()

    @gen.coroutine
    def run(self):
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()
        yield self._q.join()

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d(".stream-list__item > .summary > h2 > a")
        for link in link_elements:
            yield self._q.put(d(link).attr("href"))
        next_ele = d(".pagination li.next a")
        if next_ele:
            next_page_url = target_url + next_ele.attr("href")
            self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                real_file_name = os.path.join(self.save_path, title + ".md")
                logger.info("is backup essay: %s" % title)
                with open(real_file_name, "w") as f:
                    f.writelines(content.encode("utf8"))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
Example 11: Scraper
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class Scraper():

    def __init__(
            self,
            destinations=None,
            transform=None,
            headers={},
            max_clients=50,
            maxsize=50,
            connect_timeout=1200,
            request_timeout=600,):
        """Instantiate a tornado async http client to do multiple concurrent requests"""

        if None in [destinations, transform]:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout

        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=50)
        self.destinations = destinations
        self.transform = transform
        self.headers = headers

        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        while True:
            url = yield self.queue.get()
            try:
                request = HTTPRequest(url,
                                      connect_timeout=connect_timeout,
                                      request_timeout=request_timeout,
                                      method="GET",
                                      headers=headers
                                      )
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))

            future = self.http_client.fetch(request)

            def done_callback(future):
                body = future.result().body
                url = future.result().effective_url
                transform(body, url=url)
                self.queue.task_done()

            try:
                future.add_done_callback(done_callback)
            except Exception as e:
                sys.stderr.write(str(e))
                self.queue.put(url)  # re-queue the URL if the callback could not be attached
Example 12: BaseSpider
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
class BaseSpider(object):
    url_parser = None

    def __init__(self, engine, concurrent=3):
        self.engine = engine
        self.http = httpclient.AsyncHTTPClient()
        self.queue = Queue()
        self.concurrency = concurrent

    @property
    def hostname(self):
        return self.url_parser.hostname

    @property
    def url_root(self):
        return self.url_parser.url_root

    @property
    def base_url(self):
        return self.url_parser.base_url

    @gen.coroutine
    def __worker(self):
        """Consumes the queue."""
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def crawl(self, description, location):
        """Starts crawling the specified URL."""
        url = self.url_parser(description, location)
        self.queue.put(url)
        self.engine.notify_started(self)
        for _ in range(self.concurrency):
            self.__worker()
        yield self.queue.join()
        self.engine.notify_finished(self)

    @gen.coroutine
    def fetch_url(self):
        """Retrieves a URL from the queue and returns the parsed data."""
        url = yield self.queue.get()
        logger.info('fetching %s' % url)
        try:
            response = yield self.http.fetch(url)
            soup = BeautifulSoup(response.body)
            logger.info('got response %s' % url)
            urls = yield self.fetch_links(response, soup)
            for new_url in urls:
                logger.debug('Added %s to queue' % new_url)
                yield self.queue.put(new_url)
            data = yield self.parse_response(response, soup)
            logger.info('Parsed response for %s' % url)
        except (httpclient.HTTPError, ValueError):
            message = 'HTTP Error: (%s)' % url
            self.engine.write_message(message, self.engine.STATUS_ERROR)
        else:
            self.engine.write_data(data)
        finally:
            self.queue.task_done()

    @gen.coroutine
    def fetch_links(self, response, soup):
        """Fetch URLs to be added to the queue."""
        raise gen.Return([])

    def parse_response(self, response, soup):
        """Extract information from the response, return should be a
        list of dict's.

        Sample dict:
        {
            'title': 'Job Title',
            'company': 'Company Name',
            'location': 'City/State/Country',
            'tags': ['tag1', 'tag2', 'tag3'],
            'category': 'Software Developer',
            'origin': 'Name of the origin website',
            'url': 'Link to the complete job description',
        }
        """
        raise NotImplementedError
Example 13: get_data
# Required import: from tornado.queues import Queue [as alias]
# Alternatively: from tornado.queues.Queue import join [as alias]
#......... part of the code omitted here .........
    source_filter = GitHubRepositoryDateFilter(source_filter)

    if source_filter.repository is None:
        raise ValueError('required parameter projects missing')

    default_headers = {"Content-Type": "application/json", "Accept": "application/vnd.github.v3+json"}

    # first we grab our list of commits
    uri = "https://api.github.com/repos/{}/commits".format(source_filter.repository)
    qs = source_filter.get_qs()
    if qs != '':
        uri = uri + '?' + qs
    app_log.info("Starting retrieval of commit list for account {}".format(account._id))
    if limit is not None and limit <= 100:
        # we can handle our limit right here
        uri += "?per_page={}".format(limit)
    elif limit is None:
        uri += "?per_page=100"  # maximum number per page for GitHub API
    taken = 0

    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()

    while uri is not None:
        app_log.info(
            "({}) Retrieving next page, received {} commits thus far".format(account._id, taken))
        req = account.get_request(uri, headers=default_headers)
        response = yield client.fetch(req)

        page_data = json.loads(response.body.decode('utf-8'))
        taken += page_data.__len__()
        for item in page_data:
            queue.put(item.get('url', None))

        if limit is None or taken < limit:
            # parse the Link header from GitHub (https://developer.github.com/v3/#pagination)
            links = parse_link_header(response.headers.get('Link', ''))
            uri = links.get('next', None)
        else:
            break

        if queue.qsize() > 500:
            raise HTTPError(413, 'too many commits')

    app_log.info("({}) Commit list retrieved, fetching info for {} commits".format(account._id, taken))

    # open our list
    cls.write('[')

    # our worker to actually fetch the info
    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = working.__len__()
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            response_data = json.loads(response.body.decode('utf-8'))
            obj = {
                'date': response_data['commit']['author']['date'],
                'author': response_data['commit']['author']['name'],
                'added_files': [file for file in response_data['files'] if file['status'] == 'added'].__len__(),
                'deleted_files': [file for file in response_data['files'] if file['status'] == 'deleted'].__len__(),
                'modified_files': [file for file in response_data['files'] if file['status'] == 'modified'].__len__(),
                'additions': response_data['stats']['additions'],
                'deletions': response_data['stats']['deletions']
            }
            if done.__len__() > 0:
                cls.write(',')
            cls.write(json.dumps(obj))
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            fetch_url()

    # start our concurrency worker
    worker()

    try:
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    except gen.TimeoutError:
        app_log.warning("Request exceeds maximum time, cutting response short")
    finally:
        # close our list
        cls.write(']')
    app_log.info("Finished retrieving commits for {}".format(account._id))