This article collects typical usage examples of the asyncio.Queue.empty method in Python. If you are wondering exactly how Queue.empty works, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the containing class, asyncio.Queue.
Two code examples of the Queue.empty method are shown below, ordered by popularity.
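For orientation, here is a minimal, self-contained sketch of Queue.empty() on its own before the project-level examples. It is not taken from the examples below; the drain() and main() names are purely illustrative. Queue.empty() returns True when the queue currently holds no items, which makes it useful for non-blocking drain loops:
import asyncio

async def drain(queue):
    """Collect every item currently in the queue without blocking."""
    items = []
    # empty() reports whether an item is ready right now; since we never
    # await between the check and get_nowait(), no other task can slip in.
    while not queue.empty():
        items.append(queue.get_nowait())
    return items

async def main():
    queue = asyncio.Queue()
    for url in ('http://example.com/', 'http://example.com/about'):
        await queue.put(url)
    print(queue.empty())        # False: two items are queued
    print(await drain(queue))   # ['http://example.com/', 'http://example.com/about']
    print(queue.empty())        # True: everything has been drained

asyncio.run(main())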
Example 1: Cloner
# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import empty [as alias]
# The snippet below additionally uses: asyncio, os, re, sys, aiohttp, yarl and bs4.BeautifulSoup
class Cloner(object):
    def __init__(self, root):
        self.visited_urls = []
        self.root = self.add_scheme(root)
        if len(self.root.host) < 4:
            sys.exit('invalid target {}'.format(self.root.host))
        self.target_path = '/opt/snare/pages/{}'.format(self.root.host)
        if not os.path.exists(self.target_path):
            os.mkdir(self.target_path)
        self.new_urls = Queue()
    @staticmethod
    def add_scheme(url):
        if yarl.URL(url).scheme:
            new_url = yarl.URL(url)
        else:
            new_url = yarl.URL('http://' + url)
        return new_url
    @asyncio.coroutine
    def process_link(self, url, check_host=False):
        url = yarl.URL(url)
        if check_host:
            if (url.host != self.root.host or url.fragment
                    or url in self.visited_urls):
                return None
        if not url.is_absolute():
            url = self.root.join(url)
        yield from self.new_urls.put(url)
        return url.relative().human_repr()
    @asyncio.coroutine
    def replace_links(self, data):
        soup = BeautifulSoup(data, 'html.parser')
        # find all relative links
        for link in soup.findAll(href=True):
            res = yield from self.process_link(link['href'], check_host=True)
            if res is not None:
                link['href'] = res
        # find all images and scripts
        for elem in soup.findAll(src=True):
            res = yield from self.process_link(elem['src'])
            if res is not None:
                elem['src'] = res
        # find all action elements
        for act_link in soup.findAll(action=True):
            res = yield from self.process_link(act_link['action'])
            if res is not None:
                act_link['action'] = res
        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            redir['value'] = yarl.URL(redir['value']).relative().human_repr()
        return soup
    @asyncio.coroutine
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])
            print('path: ', file_path, 'name: ', file_name)
            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)
            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile(r'.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
#......... remainder of the code omitted here .........
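Note that Example 1 is written in the legacy generator-based coroutine style (@asyncio.coroutine with yield from), which was deprecated in Python 3.8 and removed in 3.11. Below is a rough sketch, not taken from the project, of how the same empty()-guarded drain loop in get_body() looks with modern async/await; the fetching and saving steps are deliberately reduced to a placeholder.
import asyncio

async def get_body(new_urls, visited_urls):
    # Keep pulling URLs until the queue reports that nothing is left.
    while not new_urls.empty():
        current_url = await new_urls.get()
        if current_url in visited_urls:
            continue
        visited_urls.append(current_url)
        # ... fetch current_url and write it to disk here (omitted) ...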
Example 2: Cloner
# Required import: from asyncio import Queue [as alias]
# Alternatively: from asyncio.Queue import empty [as alias]
# The snippet below additionally uses: asyncio, hashlib, json, os, re, aiohttp, cssutils and yarl
#......... earlier part of the code omitted here .........
        for elem in soup.findAll(src=True):
            res = await self.process_link(elem['src'], level)
            if res is not None:
                elem['src'] = res
        # find all action elements
        for act_link in soup.findAll(action=True):
            res = await self.process_link(act_link['action'], level)
            if res is not None:
                act_link['action'] = res
        # prevent redirects
        for redir in soup.findAll(True, attrs={'name': re.compile('redirect.*')}):
            if redir['value'] != "":
                redir['value'] = yarl.URL(redir['value']).relative().human_repr()
        return soup
    def _make_filename(self, url):
        host = url.host
        if url.is_absolute():
            file_name = url.relative().human_repr()
        else:
            file_name = url.human_repr()
        if not file_name.startswith('/'):
            file_name = "/" + file_name
        if file_name == '/' or file_name == "":
            if host == self.root.host or (self.moved_root is not None and self.moved_root.host == host):
                file_name = '/index.html'
            else:
                file_name = host
        m = hashlib.md5()
        m.update(file_name.encode('utf-8'))
        hash_name = m.hexdigest()
        return file_name, hash_name
    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}
            data = None
            content_type = None
            try:
                response = await session.get(current_url, headers={'Accept': 'text/html'}, timeout=10.0)
                content_type = response.content_type
                data = await response.read()
            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data, validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))
    async def get_root_host(self):
        try:
            async with aiohttp.ClientSession() as session:
                resp = await session.get(self.root)
                if resp.host != self.root.host:
                    self.moved_root = resp.url
                resp.close()
        except aiohttp.ClientError as err:
            self.logger.error("Can't connect to target host: %s", err)
            exit(-1)
    async def run(self):
        session = aiohttp.ClientSession()
        try:
            await self.new_urls.put((self.root, 0))
            await self.new_urls.put((self.error_page, 0))
            await self.get_body(session)
        except KeyboardInterrupt:
            raise
        finally:
            with open(os.path.join(self.target_path, 'meta.json'), 'w') as mj:
                json.dump(self.meta, mj)
            await session.close()
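One caveat worth noting: both examples drain the queue from a single consumer, so testing new_urls.empty() at the top of the loop is enough, because any URL discovered while processing a page is enqueued before the next empty() check. With several concurrent workers, empty() can return True while another worker is still fetching a page that will enqueue more URLs. A common pattern in that case, sketched below under that assumption rather than taken from the Cloner code, is Queue.join() paired with task_done():
import asyncio

async def worker(queue):
    while True:
        url = await queue.get()
        try:
            # fetch `url` here and queue.put_nowait(...) any newly discovered links
            pass
        finally:
            queue.task_done()   # tell join() this item has been fully processed

async def crawl(start_urls):
    queue = asyncio.Queue()
    for url in start_urls:
        queue.put_nowait(url)
    workers = [asyncio.create_task(worker(queue)) for _ in range(4)]
    await queue.join()          # resumes once every queued item is task_done()
    for w in workers:
        w.cancel()              # workers are idling in get(); cancel them to exit

asyncio.run(crawl(['http://example.com/']))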