

Python Queue.join Method Code Examples

This article collects typical usage examples of the asyncio.Queue.join method in Python. If you are wondering how to call Queue.join, how it behaves, or what real-world code using it looks like, the selected examples below may help. You can also explore further usage examples of the asyncio.Queue class that the method belongs to.


Five code examples of the Queue.join method are shown below, sorted by popularity by default.
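
Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the contract behind asyncio.Queue.join(): every item taken with get() must eventually be acknowledged with task_done(), and join() suspends until the count of unfinished items drops to zero. The item values and the print call are purely illustrative.

import asyncio

async def main():
    q = asyncio.Queue()

    async def consumer():
        while True:
            item = await q.get()
            print("processed", item)
            q.task_done()          # every get() needs a matching task_done()

    for i in range(5):
        q.put_nowait(i)

    task = asyncio.create_task(consumer())
    await q.join()                 # suspends until all 5 items are marked done
    task.cancel()                  # the consumer loops forever, so cancel it

asyncio.run(main())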

Example 1: workerTask

# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]

import os
import subprocess
from queue import Queue
from threading import Thread

# Note: the workers below are plain threads and q.join() is called synchronously,
# so the Queue used here is the thread-safe queue.Queue (asyncio.Queue.join() is a
# coroutine and would have to be awaited). processImage, convertPdfs and pdfList
# are defined elsewhere in the source file.
def workerTask(q):
	while not q.empty():
		processImage(q.get_nowait()[0])
		q.task_done()
		
if not os.path.exists("__working"):
	os.mkdir("__working")
convertPdfs(pdfList)
q = Queue(maxsize=0)
num_threads = 4

# Put the files to process in the queue.
for fileName in os.listdir("__working"):
	if fileName.endswith(".pbm"):
		q.put_nowait(("__working/" + fileName,))

threads = []
for i in range(num_threads):
	worker = Thread(target=workerTask, args=(q,))
	worker.start()
	threads.append(worker)

# Block until every queued file has been marked done with task_done().
q.join()
for thread in threads:
	thread.join()

subprocess.run("rm -r __working", shell=True)

Developer: astrocatalogs, Project: cne-external, Lines of code: 31, Source file: tableOCR_old.py
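
For contrast with Example 1, which drives plain threads with a blocking join(), the following hedged sketch shows roughly how the same drain-a-directory pattern could be written around asyncio.Queue. Here process_image is a stand-in for the example's processImage, and the "__working" directory and ".pbm" extension are simply carried over from above.

import asyncio
import os

async def process_image(path):
    # Stand-in for the real processImage(); pretend the work is asynchronous.
    await asyncio.sleep(0.01)

async def worker(q):
    while True:
        path = await q.get()
        try:
            await process_image(path)
        finally:
            q.task_done()

async def main(work_dir="__working"):
    if not os.path.isdir(work_dir):
        return  # nothing to do

    q = asyncio.Queue()
    for name in os.listdir(work_dir):
        if name.endswith(".pbm"):
            q.put_nowait(os.path.join(work_dir, name))

    workers = [asyncio.create_task(worker(q)) for _ in range(4)]
    await q.join()                 # must be awaited, unlike queue.Queue.join()
    for w in workers:
        w.cancel()

asyncio.run(main())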

Example 2: __init__

# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue. put_nowait() is used because
        # __init__ is not a coroutine and Queue.put() would have to be awaited.
        self.q.put_nowait((root_url, self.max_redirect))
        
    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this url.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()

Developer: Chaogebruce, Project: Webcrawler, Lines of code: 64, Source file: app_asyncio.py
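
Example 2 is written in the pre-Python 3.5 @asyncio.coroutine / yield from style. For readers on current Python, here is a stripped-down, illustrative sketch of the same crawl()/work()/q.join() lifecycle in async/await syntax; the fetch() body is a placeholder, not the project's real download-and-parse code.

import asyncio

class Crawler:
    """Skeleton of the crawl()/work() lifecycle from Example 2, in async/await syntax."""

    def __init__(self, root_url, max_redirect, max_tasks=10):
        self.max_tasks = max_tasks
        self.q = asyncio.Queue()
        self.q.put_nowait((root_url, max_redirect))

    async def crawl(self):
        """Run workers until every queued URL has been processed."""
        workers = [asyncio.create_task(self.work()) for _ in range(self.max_tasks)]
        await self.q.join()        # resumes once task_done() has matched every get()
        for w in workers:
            w.cancel()

    async def work(self):
        while True:
            url, max_redirect = await self.q.get()
            try:
                await self.fetch(url, max_redirect)
            finally:
                self.q.task_done()  # always mark the item done, even if fetch() fails

    async def fetch(self, url, max_redirect):
        # Placeholder for the real download-and-parse step shown in Example 2.
        await asyncio.sleep(0.01)

asyncio.run(Crawler("https://example.com", max_redirect=10).crawl())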

Example 3: __init__

# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]

# ......... part of the code is omitted here .........

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r', len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error
            tries += 1  # count the failed attempt; without this the loop never ends
        else:
            return
            
Developer: penglee87, Project: lpython, Lines of code: 69, Source file: crawl_01.py

Example 4: __init__

# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]

# ......... part of the code is omitted here .........
    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    LOGGER.info("response.url:%s,type:%s",
                                response.url, type(response.url))
                    LOGGER.info("parse_links url:%s,type:%s",
                                url, type(url))
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))  # count only links not seen before

        return stat, links

    async def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)

                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)

                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r',
                            tries, url, client_error)  # the snippet is truncated here

Developer: ramsayleung, Project: betacat, Lines of code: 70, Source file: crawling.py

Example 5: Crawler

# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]

# ......... part of the code is omitted here .........
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select Valid links, testing against regexp and root_domains
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers["location"]
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url, next_url=next_url, status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info("redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1)
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error("redirect limit reached for %r from %r", next_url, url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(
                        self.session.get(url, allow_redirects=False), 10, loop=self.loop
                    )
                if tries > 1:
                    LOGGER.debug("try %r for %r success", tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)

Developer: koolkt, Project: python_crawler, Lines of code: 70, Source file: crawling.py


Note: the asyncio.Queue.join examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before redistributing or using the code; do not reproduce this article without permission.