

Python Queue.join Method Code Examples

This article collects typical usage examples of the Python method tornado.queues.Queue.join. If you are wondering what exactly Queue.join does, how to call it, or how it is used in practice, the curated examples below should help. You can also browse further usage examples of the class it belongs to, tornado.queues.Queue.


A total of 13 code examples of Queue.join are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
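Before diving into the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them share: producers put() items onto a tornado.queues.Queue, workers get() items and call task_done(), and the caller yields queue.join() to wait until every item has been processed. The queue size, worker count, and work items are illustrative assumptions.

from tornado import gen, ioloop
from tornado.queues import Queue

q = Queue(maxsize=10)   # bounded queue: put() waits while the queue is full
NUM_WORKERS = 3         # illustrative value

@gen.coroutine
def producer():
    for item in range(20):   # illustrative work items
        yield q.put(item)

@gen.coroutine
def worker():
    while True:
        item = yield q.get()
        try:
            print("processed {}".format(item))  # real work would go here
        finally:
            q.task_done()   # join() unblocks only after every task_done()

@gen.coroutine
def main():
    for _ in range(NUM_WORKERS):
        ioloop.IOLoop.current().spawn_callback(worker)
    yield producer()   # wait until every item has been enqueued
    yield q.join()     # wait until every item has been processed

if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(main)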

Example 1: run

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
@gen.coroutine
def run(args):
    if not args.test:
        ip_iter = _create_ip_iterator()
    else:
        ip_iter = _get_test_ips()
        good_ips = []

    job_queue = Queue(maxsize=200)

    start = time.time()
    counter = Counter()

    @gen.coroutine
    def job_producer():
        for ip in ip_iter:
            yield job_queue.put(ip)
            #print("Put {}".format(ip))

    @gen.coroutine
    def worker(id):
        while True:
            ip = yield job_queue.get()
            try:
                good = yield test_ip(ip)
                counter['all'] += 1
                if args.progress:
                    if counter['all'] % 10000 == 0:
                        print("Tested {} ips.".format(counter['all']))
                if good:
                    print("Found good ip: {}".format(ip))
                    counter['good'] += 1
                    if not args.test:
                        yield record_good_ip(ip)
                    else:
                        good_ips.append(ip)
            finally:
                job_queue.task_done()

    for i in range(CONCURRENCY):
        worker(i)

    _disable_logging()

    try:
        yield job_producer()
        yield job_queue.join()
    finally:
        print("\n\nTested: {} ips\nFound {} good ips\nQps: {}".format(
            counter['all'],
            counter['good'],
            counter['all'] / (time.time() - start)
        ))

    if args.test and args.remove:
        with open(GOOD_IP_FILE + '_removed', 'w') as f:
            f.write('|'.join(good_ips))
Author: kawing-chiu, Project: search_google_ip, Lines: 58, Source: search_google_ip.py

Example 2: TornadoQuerierBase

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class TornadoQuerierBase(object):

    def __init__(self):
        self.tasks = TornadoQueue()

    def gen_task(self):
        raise NotImplementedError()

    def run_task(self, task):
        raise NotImplementedError()

    def prepare(self):
        self.running = True

    def cleanup(self):
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s' % (worker_id, task))
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
                task = None
        LOG.debug('worker[%d]: all tasks done %s' % (worker_id, self.tasks))

    @coroutine
    def start(self, num_workers=1):

        self.prepare()

        # add tasks
        tasks = yield self.gen_task()
        for task in tasks:
            yield self.tasks.put(task)

        # start shoot workers
        for worker_id in range(num_workers):
            LOG.debug('starting worker %d' % worker_id)
            self.run_worker(worker_id, self.run_task)

        yield self.tasks.join()
        self.cleanup()
Author: jianingy, Project: watchgang, Lines: 51, Source: libwatcher.py

Example 3: get_file_list

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
    def get_file_list(account, **kwargs):
        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()
        data = set()

        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))
                response_data = json.loads(response.body.decode('utf-8'))

                for file in response_data:
                    # be sure we're a valid file type and less than our maximum response size limit
                    extension = file['path'].lower().split('.')[-1]
                    if extension in VALID_FILETYPES and int(file['bytes']) < RESPONSE_SIZE_LIMIT * 1000000:
                        data.add((file['path'].lstrip('/'), file['path'], ))
                app_log.info("Page {} completed".format(page_no))
            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        app_log.info("Gathering filelist for account {}".format(account._id))
        for file_type in VALID_FILETYPES:
            file_type = '.'.join([file_type])
            url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(file_type)
            queue.put(url)
        # start our concurrency worker
        worker()
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        app_log.info("Finished list retrieval. Found {} items.".format(data.__len__()))
        return sorted([{"title": title, "value": path} for title, path in data], key=lambda f: f['title'])
Author: vizydrop, Project: apps, Lines: 51, Source: files.py

Example 4: main

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
@gen.coroutine
def main():
    concurrency = 10

    queue = Queue()
    queue.put("http://www.jianshu.com")

    workers = []
    for _ in range(concurrency):
        workers.append(Worker(app, queue))

    for worker in workers:
        Log4Spider.debugLog("worker begin:",worker)
        worker.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
Author: happyAnger6, Project: anger6Spider, Lines: 19, Source: run_worker_test.py
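Example 4 waits with yield queue.join(timeout=timedelta(seconds=300)) but does not handle the case where the deadline expires; as Example 13 further below shows, join() with a timeout raises gen.TimeoutError when the queue is not drained in time. The following is a minimal sketch of that handling; the helper name wait_for_drain and the 300-second deadline are illustrative, not part of the example above.

from datetime import timedelta
from tornado import gen

@gen.coroutine
def wait_for_drain(queue):
    try:
        # Resolves once task_done() has been called for every enqueued item;
        # with a timeout it raises gen.TimeoutError instead of waiting forever.
        yield queue.join(timeout=timedelta(seconds=300))
    except gen.TimeoutError:
        print("queue not drained within 300 seconds, continuing anyway")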

Example 5: BatchedStream

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class BatchedStream(object):
    """ Mostly obsolete, see BatchedSend """

    def __init__(self, stream, interval):
        self.stream = stream
        self.interval = interval / 1000.0
        self.last_transmission = default_timer()
        self.send_q = Queue()
        self.recv_q = Queue()
        self._background_send_coroutine = self._background_send()
        self._background_recv_coroutine = self._background_recv()
        self._broken = None

        self.pc = PeriodicCallback(lambda: None, 100)
        self.pc.start()

    @gen.coroutine
    def _background_send(self):
        with log_errors():
            while True:
                msg = yield self.send_q.get()
                if msg == "close":
                    break
                msgs = [msg]
                now = default_timer()
                wait_time = self.last_transmission + self.interval - now
                if wait_time > 0:
                    yield gen.sleep(wait_time)
                while not self.send_q.empty():
                    msgs.append(self.send_q.get_nowait())

                try:
                    yield write(self.stream, msgs)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self._broken = True
                    break

                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for _ in msgs:
                    self.send_q.task_done()

    @gen.coroutine
    def _background_recv(self):
        with log_errors():
            while True:
                try:
                    msgs = yield read(self.stream)
                except StreamClosedError:
                    self.recv_q.put_nowait("close")
                    self.send_q.put_nowait("close")
                    self._broken = True
                    break
                assert isinstance(msgs, list)
                if len(msgs) > 1:
                    logger.debug("Batched messages: %d", len(msgs))
                for msg in msgs:
                    self.recv_q.put_nowait(msg)

    @gen.coroutine
    def flush(self):
        yield self.send_q.join()

    @gen.coroutine
    def send(self, msg):
        if self._broken:
            raise StreamClosedError("Batch Stream is Closed")
        else:
            self.send_q.put_nowait(msg)

    @gen.coroutine
    def recv(self):
        result = yield self.recv_q.get()
        if result == "close":
            raise StreamClosedError("Batched Stream is Closed")
        else:
            raise gen.Return(result)

    @gen.coroutine
    def close(self):
        yield self.flush()
        raise gen.Return(self.stream.close())

    def closed(self):
        return self.stream.closed()
Author: broxtronix, Project: distributed, Lines: 88, Source: batched.py

Example 6: SQSDrain

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class SQSDrain(object):
    """Implementation of IDrain that writes to an AWS SQS queue.
    """

    def __init__(self, logger, loop, sqs_client,
                 metric_prefix='emitter'):
        self.emitter = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.output_error = Event()
        self.state = RUNNING
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self._send_queue = Queue()
        self._should_flush_queue = Event()
        self._flush_handle = None
        self.loop.spawn_callback(self._onSend)

    @gen.coroutine
    def _flush_send_batch(self, batch_size):
        send_batch = [
            self._send_queue.get_nowait()
            for pos in range(min(batch_size, self.emitter.max_messages))
        ]
        try:
            response = yield self.emitter.send_message_batch(*send_batch)
        except SQSError as err:
            self.logger.exception('Error encountered flushing data to SQS: %s',
                                  err)
            self.output_error.set()
            for msg in send_batch:
                self._send_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.output_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to send: %s', req.Id)
                    self._send_queue.put_nowait(req)

    @gen.coroutine
    def _onSend(self):
        respawn = True
        while respawn:
            qsize = self._send_queue.qsize()
            # This will keep flushing until clear,
            # including items that show up in between flushes
            while qsize > 0:
                try:
                    yield self._flush_send_batch(qsize)
                except Exception as err:
                    self.logger.exception(err)
                    self.output_error.set()
                qsize = self._send_queue.qsize()
            # We've cleared the backlog, remove any possible future flush
            if self._flush_handle:
                self.loop.remove_timeout(self._flush_handle)
                self._flush_handle = None
            self._should_flush_queue.clear()
            yield self._should_flush_queue.wait()

    @gen.coroutine
    def close(self, timeout=None):
        self.state = CLOSING
        yield self._send_queue.join(timeout)

    def emit_nowait(self, msg):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
            raise QueueFull()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        self.logger.debug("Drain emitting")
        self._send_queue.put_nowait(msg)

    @gen.coroutine
    def emit(self, msg, timeout=None):
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._flush_handle = self.loop.add_timeout(
                MAX_TIMEOUT,
                lambda: self._should_flush_queue.set(),
            )
        yield self._send_queue.put(msg, timeout)
Author: CrowdStrike, Project: cs.eyrie, Lines: 94, Source: drain.py

Example 7: BlogBackup

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class BlogBackup(object):
    _default_dir_name = 'seg_blog_backup'

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and \
                    os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError(
                    "'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    def _get_user_cookies(self):
        url = target_url + login_page_path
        self.driver.get(url)
        try:
            user_input = self.driver.find_element_by_name('mail')
            passwd_input = self.driver.find_element_by_name('password')
            submit_btn = self.driver.find_element_by_class_name('pr20')
        except NoSuchElementException:
            raise PageHtmlChanged(
                "%s login page structure have changed!" % _domain)

        user_input.send_keys(self.username)
        passwd_input.send_keys(self.passwd)
        submit_btn.click()
        try:
            WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
        except TimeoutException:
            raise Exception("Wrong username or password!")

        WebDriverWait(self.driver, timeout=10).until(has_page_load)
        try_times = 0
        while True:
            time.sleep(1)
            if url != self.driver.current_url:
                return self.driver.get_cookies()

            try_times += 1
            if try_times > 10:
                raise Exception("Getting cookie info failed!")

    def _get_driver(self):
        if self.phantomjs_path:
            try:
                return webdriver.PhantomJS(
                    executable_path=self.phantomjs_path,
                    service_log_path=os.path.devnull)
            except WebDriverException:
                raise PhantomjsPathError("Phantomjs locate path invalid!")
        else:
            return webdriver.PhantomJS(service_log_path=os.path.devnull)

    def __init__(self, **conf):
        self.username = conf['username']
        self.passwd = conf['passwd']
        self.phantomjs_path = conf.get('phantomjs_path')
        self.save_path = conf.get('save_path')
        self._q = Queue()

        self._parse_save_path()
        self.driver = self._get_driver()
        self._cookies = self._get_user_cookies()

    @gen.coroutine
    def run(self):
        self.__filter_cookies()

        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    def __filter_cookies(self):
        self._cookies = {k['name']: k['value'] for k in self._cookies if
                         k['domain'] == _domain}

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d('.stream-list__item > .summary > h2 > a')
        for link in link_elements:
            yield self._q.put(d(link).attr('href'))

        next_ele = d('.pagination li.next a')
        if next_ele:
            next_page_url = target_url + next_ele.attr('href')
            self._fetch_blog_list_page(next_page_url)

#......... (the rest of the code is omitted here) .........
Author: quietin, Project: seg_backup_script, Lines: 103, Source: backup_with_phantomjs.py

Example 8: ProjectGroomer

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class ProjectGroomer(object):
  """ Cleans up expired transactions for a project. """
  def __init__(self, project_id, coordinator, zk_client, db_access,
               thread_pool):
    """ Creates a new ProjectGroomer.

    Args:
      project_id: A string specifying a project ID.
      coordinator: A GroomingCoordinator.
      zk_client: A KazooClient.
      db_access: A DatastoreProxy.
      thread_pool: A ThreadPoolExecutor.
    """
    self.project_id = project_id

    self._coordinator = coordinator
    self._zk_client = zk_client
    self._tornado_zk = TornadoKazoo(self._zk_client)
    self._db_access = db_access
    self._thread_pool = thread_pool
    self._project_node = '/appscale/apps/{}'.format(self.project_id)
    self._containers = []
    self._inactive_containers = set()
    self._batch_resolver = BatchResolver(self.project_id, self._db_access)

    self._zk_client.ensure_path(self._project_node)
    self._zk_client.ChildrenWatch(self._project_node, self._update_containers)

    self._txid_manual_offset = 0
    self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
    self._zk_client.DataWatch(self._offset_node, self._update_offset)

    self._stop_event = AsyncEvent()
    self._stopped_event = AsyncEvent()

    # Keeps track of cleanup results for each round of grooming.
    self._txids_cleaned = 0
    self._oldest_valid_tx_time = None

    self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
    for _ in range(MAX_CONCURRENCY):
      IOLoop.current().spawn_callback(self._worker)

    IOLoop.current().spawn_callback(self.start)

  @gen.coroutine
  def start(self):
    """ Starts the grooming process until the stop event is set. """
    logger.info('Grooming {}'.format(self.project_id))
    while True:
      if self._stop_event.is_set():
        break

      try:
        yield self._groom_project()
      except Exception:
        # Prevent the grooming loop from stopping if an error is encountered.
        logger.exception(
          'Unexpected error while grooming {}'.format(self.project_id))
        yield gen.sleep(MAX_TX_DURATION)

    self._stopped_event.set()

  @gen.coroutine
  def stop(self):
    """ Stops the grooming process. """
    logger.info('Stopping grooming process for {}'.format(self.project_id))
    self._stop_event.set()
    yield self._stopped_event.wait()

  @gen.coroutine
  def _worker(self):
    """ Processes items in the worker queue. """
    while True:
      tx_path, composite_indexes = yield self._worker_queue.get()
      try:
        tx_time = yield self._resolve_txid(tx_path, composite_indexes)
        if tx_time is None:
          self._txids_cleaned += 1

        if tx_time is not None and tx_time < self._oldest_valid_tx_time:
          self._oldest_valid_tx_time = tx_time
      finally:
        self._worker_queue.task_done()

  def _update_offset(self, new_offset, _):
    """ Watches for updates to the manual offset node.

    Args:
      new_offset: A string specifying the new manual offset.
    """
    self._txid_manual_offset = int(new_offset or 0)

  def _update_containers(self, nodes):
    """ Updates the list of active txid containers.

    Args:
      nodes: A list of strings specifying ZooKeeper nodes.
    """
    counters = [int(node[len(CONTAINER_PREFIX):] or 1)
#......... (the rest of the code is omitted here) .........
Author: cdonati, Project: appscale, Lines: 103, Source: transaction_groomer.py

Example 9: main

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
@gen.coroutine
def main():
    # Start consumer without waiting
    # Tornado framework used for async IO
    # http://www.tornadoweb.org/en/stable/index.html
    q = Queue()

    @gen.coroutine
    def consumer():
        item = yield q.get()
        try:
            code = False
            try:
                response = yield httpclient.AsyncHTTPClient().fetch(item)
                codes = ['200', '301', '302']
                code = any(s in response.headers['Status'] for s in codes)
                rcode = response.code
                if DEBUG:
                    fname = re.match(r'http://([\w+|.]+)/',item).group(1)
                    fname = os.path.join(DEBUG_DIR,fname.replace(".","_"))
                    with open(fname, 'w') as f:
                        for k,v in response.headers.get_all():
                            f.write(k+' '+v+'\n')
                        f.write('\n')
                        f.write(response.body)
                    f.close()
            except Exception as e:
                code = False
                rcode = str(e)
            
            print('%s,%s,%s,"%s"' % 
                            (datetime.now(), item, code, rcode))
            # Append the found URL to DOMAINS
            if code:
                DOMAINS[RESULT[item]].append(item)            
        
        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield consumer()

    @gen.coroutine
    def producer():
        if DEBUG and not os.path.exists(DEBUG_DIR):
            print('Creating debug out dir: %s' % DEBUG_DIR)
            os.makedirs(DEBUG_DIR)
  
        # Open and process file if supplied
        if len(sys.argv) >= 2:
            with open(sys.argv[1]) as f:
                for line in f:    
                    DOMAINS[line.strip()]= []
        else:
            print("Domains list file wasn't provided")
            print("Usage: %s <domains.txt> [ report.txt ]" % sys.argv[0])
            sys.exit(2)
        # Generate processing list 
        for d in DOMAINS.keys():
            for url in generate_url_list(d):
                q.put(url)

    yield producer()  # Wait for the producer to put all tasks.
    # Start workers, then wait for the work queue to be empty. 
    for _ in range(concurrency):
        worker()
    
    yield q.join() # Wait for consumer to finish all tasks.
    
    # Out results
    if len(sys.argv) >= 3:
        f = open(sys.argv[2],'w')
    else:
        f = sys.stdout

    for key, val in DOMAINS.items():
        if DOMAINS[key]:
            DOMAINS[key] = '"'+" ".join(val)+'"'
        else:
            DOMAINS[key] = 'No'
    out = "\n".join([",".join([key, str(val)]) for key, val in DOMAINS.items()]) + '\n'
    
    f.write(out)
Author: andjelx, Project: pycode, Lines: 86, Source: site_check.py

Example 10: BlogBackup

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class BlogBackup(object):
    _default_dir_name = "seg_blog_backup"

    def _generate_save_dir(self):
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    @staticmethod
    def parse_token_from_html(content):
        overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
        overall_res = overall_pat.search(content)
        if overall_res:
            overall_content = overall_res.group()
            # remove /* */ type annotation
            filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
            str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
            filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
            ret = "".join(str_list)

            if filter_list:
                for m, n in filter_list:
                    ret = ret[: int(m)] + ret[int(n) :]
            if len(ret) == 32:
                return ret

        raise PageHtmlChanged("website login token has changed")

    def _get_user_cookies(self):
        s = requests.Session()
        s.headers.update(headers)
        rep = s.get(target_url)
        post_url = "%s%s?_=%s" % (target_url, login_api_path, self.parse_token_from_html(rep.text))
        data = {"mail": self.username, "password": self.passwd}
        s.post(post_url, data=data)
        return s.cookies

    def __init__(self, **conf):
        self.username = conf["username"]
        self.passwd = conf["passwd"]
        self.save_path = conf.get("save_path")
        self._q = Queue()
        self._cookies = self._get_user_cookies()
        self._parse_save_path()

    @gen.coroutine
    def run(self):
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d(".stream-list__item > .summary > h2 > a")
        for link in link_elements:
            yield self._q.put(d(link).attr("href"))

        next_ele = d(".pagination li.next a")
        if next_ele:
            next_page_url = target_url + next_ele.attr("href")
            self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        while True:
            try:
                essay_path = yield self._q.get(timeout=1)
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                real_file_name = os.path.join(self.save_path, title + ".md")
                logger.info("is backup essay: %s" % title)
                with open(real_file_name, "w") as f:
                    f.writelines(content.encode("utf8"))
            except gen.TimeoutError:
                raise gen.Return()
            finally:
                self._q.task_done()
Author: quietin, Project: seg_backup_script, Lines: 97, Source: backup_simple.py

Example 11: Scraper

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class Scraper():

    def __init__(
                self,
                destinations=None,
                transform=None,
                headers={},
                max_clients=50,
                maxsize=50,
                connect_timeout=1200,
                request_timeout=600,):

        """Instantiate a tornado async http client to do multiple concurrent requests"""

        if None in [destinations, transform]:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout

        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)

        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=50)
        self.destinations = destinations
        self.transform = transform
        self.headers = headers
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout, self.http_client)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        while True:
            url = yield self.queue.get()
            try:
                request = HTTPRequest(url,
                                    connect_timeout=connect_timeout,
                                    request_timeout=request_timeout,
                                    method="GET",
                                    headers = headers
                )
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))

            future = self.http_client.fetch(request)

            def done_callback(future):
                body = future.result().body
                url = future.result().effective_url
                transform(body, url=url)
                self.queue.task_done()

            try:
                future.add_done_callback(done_callback)
            except Exception as e:
                sys.stderr.write(str(e))
                self.queue.put(url)
Author: andres-de-castro, Project: scraping, Lines: 75, Source: scraper.py

Example 12: BaseSpider

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]
class BaseSpider(object):
    url_parser = None

    def __init__(self, engine, concurrent=3):
        self.engine = engine
        self.http = httpclient.AsyncHTTPClient()
        self.queue = Queue()
        self.concurrency = concurrent

    @property
    def hostname(self):
        return self.url_parser.hostname

    @property
    def url_root(self):
        return self.url_parser.url_root

    @property
    def base_url(self):
        return self.url_parser.base_url

    @gen.coroutine
    def __worker(self):
        """Consumes the queue."""
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def crawl(self, description, location):
        """Starts crawling the specified URL."""
        url = self.url_parser(description, location)
        self.queue.put(url)
        self.engine.notify_started(self)
        for _ in range(self.concurrency):
            self.__worker()
        yield self.queue.join()
        self.engine.notify_finished(self)

    @gen.coroutine
    def fetch_url(self):
        """Retrieves a URL from the queue and returns the parsed data."""
        url = yield self.queue.get()
        logger.info('fetching %s' % url)
        try:
            response = yield self.http.fetch(url)
            soup = BeautifulSoup(response.body)
            logger.info('got response %s' % url)

            urls = yield self.fetch_links(response, soup)
            for new_url in urls:
                logger.debug('Added %s to queue' % new_url)
                yield self.queue.put(new_url)

            data = yield self.parse_response(response, soup)
            logger.info('Parsed response for %s' % url)
        except (httpclient.HTTPError, ValueError):
            message = 'HTTP Error: (%s)' % url
            self.engine.write_message(message, self.engine.STATUS_ERROR)
        else:
            self.engine.write_data(data)
        finally:
            self.queue.task_done()

    @gen.coroutine
    def fetch_links(self, response, soup):
        """Fetch URLs to be added to the queue."""
        raise gen.Return([])

    def parse_response(self, response, soup):
        """Extract information from the response, return should be a 
        list of dict's.
        
        Sample dict:
        {
            'title': 'Job Title',
            'company': 'Company Name',
            'location': 'City/State/Country',
            'tags': ['tag1', 'tag2', 'tag3'],
            'category': 'Software Developer',
            'origin': 'Name of the origin website',
            'url': 'Link to the complete job description',
        }
        """
        raise NotImplementedError
Author: winstonf88, Project: pyjobs, Lines: 86, Source: spider.py

Example 13: get_data

# Required import: from tornado.queues import Queue [as alias]
# Or: from tornado.queues.Queue import join [as alias]

#......... (the beginning of the code is omitted here) .........

        source_filter = GitHubRepositoryDateFilter(source_filter)

        if source_filter.repository is None:
            raise ValueError('required parameter projects missing')

        default_headers = {"Content-Type": "application/json", "Accept": "application/vnd.github.v3+json"}

        # first we grab our list of commits
        uri = "https://api.github.com/repos/{}/commits".format(source_filter.repository)
        qs = source_filter.get_qs()
        if qs != '':
            uri = uri + '?' + qs
        app_log.info("Starting retrieval of commit list for account {}".format(account._id))
        if limit is not None and limit <= 100:
            # we can handle our limit right here
            uri += "{}per_page={}".format('&' if '?' in uri else '?', limit)
        elif limit is None:
            # maximum number per page for the GitHub API
            uri += "{}per_page=100".format('&' if '?' in uri else '?')
        taken = 0

        queue = Queue()
        sem = BoundedSemaphore(FETCH_CONCURRENCY)
        done, working = set(), set()

        while uri is not None:
            app_log.info(
                "({}) Retrieving next page, received {} commits thus far".format(account._id, taken))
            req = account.get_request(uri, headers=default_headers)
            response = yield client.fetch(req)

            page_data = json.loads(response.body.decode('utf-8'))
            taken += page_data.__len__()
            for item in page_data:
                queue.put(item.get('url', None))

            if limit is None or taken < limit:
                # parse the Link header from GitHub (https://developer.github.com/v3/#pagination)
                links = parse_link_header(response.headers.get('Link', ''))
                uri = links.get('next', None)
            else:
                break

            if queue.qsize() > 500:
                raise HTTPError(413, 'too many commits')
        app_log.info("({}) Commit list retrieved, fetching info for {} commits".format(account._id, taken))

        # open our list
        cls.write('[')

        # our worker to actually fetch the info
        @gen.coroutine
        def fetch_url():
            current_url = yield queue.get()
            try:
                if current_url in working:
                    return
                page_no = working.__len__()
                app_log.info("Fetching page {}".format(page_no))
                working.add(current_url)
                req = account.get_request(current_url)
                client = AsyncHTTPClient()
                response = yield client.fetch(req)
                response_data = json.loads(response.body.decode('utf-8'))
                obj = {
                    'date': response_data['commit']['author']['date'],
                    'author': response_data['commit']['author']['name'],
                    'added_files': [file for file in response_data['files'] if file['status'] == 'added'].__len__(),
                    'deleted_files': [file for file in response_data['files'] if file['status'] == 'deleted'].__len__(),
                    'modified_files': [file for file in response_data['files'] if file['status'] == 'modified'].__len__(),
                    'additions': response_data['stats']['additions'],
                    'deletions': response_data['stats']['deletions']
                }
                if done.__len__() > 0:
                    cls.write(',')
                cls.write(json.dumps(obj))
                done.add(current_url)
                app_log.info("Page {} downloaded".format(page_no))

            finally:
                queue.task_done()
                sem.release()

        @gen.coroutine
        def worker():
            while True:
                yield sem.acquire()
                fetch_url()

        # start our concurrency worker
        worker()
        try:
            # wait until we're done
            yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
        except gen.TimeoutError:
            app_log.warning("Request exceeds maximum time, cutting response short")
        finally:
            # close our list
            cls.write(']')
        app_log.info("Finished retrieving commits for {}".format(account._id))
Author: vizydrop, Project: apps, Lines: 104, Source: commits.py


Note: The tornado.queues.Queue.join method examples in this article were compiled by 纯净天空 from open source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open source projects contributed by various developers; copyright of the source code remains with the original authors. Please consult the corresponding project's License before distributing or using the code, and do not repost without permission.