This article collects typical usage examples of the asyncio.Queue.join method in Python. If you have been wondering what exactly Queue.join does, how to call it, or what it looks like in real code, the hand-picked examples below may help. You can also read further about the class the method belongs to, asyncio.Queue.
The following presents 5 code examples of the Queue.join method, sorted by popularity by default.
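Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; Python 3.7+ assumed) of the usual pattern: workers consume items and call task_done() once per get(), while the coordinating coroutine awaits Queue.join() until every queued item has been accounted for.

import asyncio

async def worker(name, queue):
    while True:
        item = await queue.get()
        try:
            await asyncio.sleep(0.1)          # stand-in for real work on `item`
            print(f"{name} processed {item}")
        finally:
            queue.task_done()                 # one task_done() per get(), so join() can finish

async def main():
    queue = asyncio.Queue()
    for i in range(10):
        queue.put_nowait(i)
    workers = [asyncio.create_task(worker(f"w{n}", queue)) for n in range(3)]
    await queue.join()                        # blocks until every item is marked done
    for w in workers:
        w.cancel()

asyncio.run(main())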
Example 1: workerTask
# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
def workerTask(q):
    while not q.empty():
        processImage(q.get_nowait()[0])
        q.task_done()

if not os.path.exists("__working"):
    os.mkdir("__working")
convertPdfs(pdfList)
q = Queue(maxsize=0)
num_threads = 4
# put files in queue
for fileName in os.listdir("__working"):
    if fileName.endswith(".pbm"):
        q.put_nowait(("__working/" + fileName,))
threads = []
for i in range(num_threads):
    worker = Thread(target=workerTask, args=(q,))
    worker.start()
    threads.append(worker)
q.join()
for thread in threads:
    thread.join()
subprocess.run("rm -r __working", shell=True)
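Worth noting: Example 1 drives the queue from plain threads and calls q.join() synchronously, which matches queue.Queue semantics; on an asyncio.Queue, join() is a coroutine and must be awaited inside the event loop. A minimal sketch of the same drain-and-join pattern written for asyncio (process_image and the file paths are hypothetical stand-ins, not part of the project above) could look like this:

import asyncio

async def process_image(path):
    await asyncio.sleep(0)        # stand-in for the real image-processing work on `path`

async def worker_task(q):
    # Drain the queue cooperatively instead of in a thread.
    while not q.empty():
        path = q.get_nowait()[0]
        try:
            await process_image(path)
        finally:
            q.task_done()

async def main(paths):
    q = asyncio.Queue()
    for p in paths:
        q.put_nowait((p,))
    workers = [asyncio.create_task(worker_task(q)) for _ in range(4)]
    await q.join()                # join() is a coroutine on asyncio.Queue
    await asyncio.gather(*workers)

asyncio.run(main(["__working/page-0.pbm", "__working/page-1.pbm"]))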
Example 2: __init__
# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()
        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)
        # Put (URL, max_redirect) in the queue.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]
        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()
            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)
        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return
                    # Remember we have seen this URL.
                    self.seen_urls.add(next_url)
                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return the connection to the pool.
            yield from response.release()
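The excerpt shows the class but not the code that starts it. A minimal driver sketch, assuming the module-level loop the constructor references and a hypothetical is_redirect helper (both outside the excerpt), might be:

import asyncio
import aiohttp

loop = asyncio.get_event_loop()    # Crawler.__init__ expects a module-level `loop`

def is_redirect(response):
    # Hypothetical helper assumed by Crawler.fetch().
    return response.status in (300, 301, 302, 303, 307, 308)

crawler = Crawler('http://example.com/', max_redirect=10)
loop.run_until_complete(crawler.crawl())            # returns once self.q.join() completes
loop.run_until_complete(crawler.session.close())    # ClientSession.close() is a coroutine in aiohttp 3.x
loop.close()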
Example 3: __init__
# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
# ......... part of the code is omitted here .........
        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}
            if content_type:
                content_type, pdict = cgi.parse_header(content_type)
            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls))
        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(
                    url, allow_redirects=False)  #1
                break  #2
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error
        else:
            return
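The excerpt above is cut off inside the retry loop. As a general illustration only (not the project's actual continuation), the while/else retry idiom it is built on usually increments the attempt counter in the loop body and uses the loop's else clause for the all-attempts-failed path; do_request and MAX_TRIES below are hypothetical names:

import logging

LOGGER = logging.getLogger(__name__)
MAX_TRIES = 4

def fetch_with_retries(do_request, url):
    """Illustrative while/else retry loop; do_request is a hypothetical callable."""
    tries = 0
    exception = None
    while tries < MAX_TRIES:
        try:
            response = do_request(url)
            break                       # success: leave the loop, skipping `else`
        except OSError as error:
            LOGGER.info('try %r for %r raised %r', tries, url, error)
            exception = error
        tries += 1
    else:
        # Only runs if the loop exhausted all tries without a `break`.
        LOGGER.error('%r failed after %r tries', url, MAX_TRIES)
        return None
    return response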
Example 4: __init__
# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
# ......... part of the code is omitted here .........
    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    async def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = await response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}
            if content_type:
                content_type, pdict = cgi.parse_header(content_type)
            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',
                                      text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    LOGGER.info("response.url:%s,type:%s",
                                response.url, type(response.url))
                    LOGGER.info("parse_links url:%s,type:%s",
                                url, type(url))
                    normalized = urllib.parse.urljoin(str(response.url), url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(
            url=response.url,
            next_url=None,
            status=response.status,
            exception=None,
            size=len(body),
            content_type=content_type,
            encoding=encoding,
            num_urls=len(links),
            num_new_urls=len(links) - len(self.seen_urls))
        return stat, links

    async def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)
                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r',
Example 5: Crawler
# Required import: from asyncio import Queue [as alias]
# Or: from asyncio.Queue import join [as alias]
# ......... part of the code is omitted here .........
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select valid links, testing against regexp and root_domains
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]
        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers["location"]
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url, next_url=next_url, status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info("redirect to %r from %r max_redir: %i",
                        next_url, url, max_redirect - 1)
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error("redirect limit reached for %r from %r", next_url, url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(
                        self.session.get(url, allow_redirects=False), 10, loop=self.loop
                    )
                if tries > 1:
                    LOGGER.debug("try %r for %r success", tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)
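Example 5 combines a semaphore (to cap concurrent connections) with the work queue, and the excerpt stops inside the retry loop. A self-contained sketch of how such a semaphore-bounded worker typically pairs task_done() with Queue.join(), using hypothetical names and modern async/await syntax rather than the project's yield from style, could be:

import asyncio

async def fetch_one(url, sem):
    # Hypothetical fetch: the semaphore caps how many downloads run at once.
    async with sem:
        await asyncio.sleep(0.1)   # stand-in for the real HTTP request
        return f"<html for {url}>"

async def worker(queue, sem):
    while True:
        url, max_redirect = await queue.get()
        try:
            await fetch_one(url, sem)
        finally:
            queue.task_done()      # always account for the item, even on errors

async def crawl(urls, max_tasks=10, max_connections=5):
    queue = asyncio.Queue()
    sem = asyncio.Semaphore(max_connections)
    for url in urls:
        queue.put_nowait((url, 10))
    workers = [asyncio.create_task(worker(queue, sem)) for _ in range(max_tasks)]
    await queue.join()             # resolves once every URL has been processed
    for w in workers:
        w.cancel()
    await asyncio.gather(*workers, return_exceptions=True)

asyncio.run(crawl(["http://example.com/", "http://example.org/"]))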