This article collects typical usage examples of the ThreadPool._del method from the Python threadpool module. If you have been wondering what exactly ThreadPool._del does, or how to use it, the curated code example below may help. You can also read further about the class this method belongs to, threadpool.ThreadPool.
One code example of the ThreadPool._del method is shown below; examples are sorted by popularity by default.
Example 1: __init__
# Required import: from threadpool import ThreadPool [as alias]
# Or: from threadpool.ThreadPool import _del [as alias]
class SpiderControl:
    '''
    Spider controller class.
    '''
    def __init__(self):
        '''
        Initialization.
        self.url     -- root URL
        self.deep    -- crawl depth
        self.db      -- database access object
        self._thread -- thread pool
        '''
        logger.info('init control class')
        self.url = conf['url']
        self.deep = conf['deep']
        self.db = operate['db']
        self._thread = ThreadPool(conf['thread'], self.get_html)
    def run(self):
        '''
        Main control method.
        :return: None
        '''
        logger.info("start spider, and the spider deep is " + str(self.deep))
        self.url_group = []
        self.r_group = []
        self.recursion_deep()
        logger.info("The spider page total number is : " + str(len(self.url_group)))
        self._thread._del()  # tear down the thread pool once crawling is done
        logger.info("Spider OVER!!")
    def recursion_deep(self):
        '''
        Crawl recursively according to the depth value.
        operate['db'].deep -- current depth
        self.deep          -- target crawl depth
        :return: None
        '''
        if operate['db'].deep == 0:
            logger.info("spidering deep == 0 page")
            r = self.get_html(self.url)
            try:
                html = r['html']
            except (TypeError, KeyError):  # r is None or has no 'html' key
                print "url input error!"
                logger.error("url error(%s)" % (self.url))
                return
            operate['db'].insert(html, self.url)
            self.r_group.append(r)
            operate['db'].deep += 1
            self.recursion_deep()
        elif operate['db'].deep > self.deep:
            logger.info('spider deep over!')
            return
        else:
            logger.info("spidering deep = %s" % operate['db'].deep)
            tmp = []
            url_group = []
            # Extract URLs from the pages fetched at the previous depth
            for x in self.r_group:
                html = x['html']
                url_group.extend(self.find_url(html))
                logger.debug("from %s page find %s url" % (x['url'], len(url_group)))
            # If no URL was matched on any page, stop and return
            if url_group == []:
                return
            # Feed the extracted URLs into the thread pool
            result_list = self._thread.my_map(url_group)
            for y in xrange(len(result_list)):
                if result_list[y]['type'] == 'html':
                    tmp.append(result_list[y])
                else:
                    logger.debug("delete the not html page (%s)" % url_group[y])
            self.r_group = tmp
            operate['db'].deep += 1
            self.recursion_deep()
    def find_url(self, html):
        '''
        Use BeautifulSoup to find the URLs in a web page.
        :param html: the HTML page
        :return: a list of the URLs found in the page
        Note: for now only URLs in the href attribute of <a> tags are considered.
        '''
        url_group = []
        logger.debug("start find url in a html")
        try:
            bs = BeautifulSoup(html, 'lxml')
        except Exception, e:
            logger.error("bs4(html) fail!\nthe error info is : " + str(e))
            return url_group  # return the empty list so callers can extend() safely
        comp = re.compile(r"^https?://[/\w\.-]*/?[\w&\+%=-]*")
#......... the rest of the code is omitted here .........
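Note that the ThreadPool used in this example is a project-local class, not the ThreadPool from the threadpool package on PyPI (whose constructor takes only a worker count and which exposes neither my_map nor _del). What follows is a minimal sketch, for illustration only, of a compatible class; every name in it is an assumption inferred from how the example above calls it: the constructor takes a worker count plus a task function, my_map runs that function over a list of arguments and returns the results in input order, and _del shuts the workers down.

import threading
import Queue  # renamed to 'queue' in Python 3

class ThreadPool:
    '''
    Illustrative stand-in for the project-local ThreadPool; the interface
    is an assumption based on how SpiderControl uses it above.
    '''
    def __init__(self, thread_num, func):
        self.func = func
        self.tasks = Queue.Queue()
        self.threads = []
        for _ in xrange(thread_num):
            t = threading.Thread(target=self._worker)
            t.setDaemon(True)
            t.start()
            self.threads.append(t)

    def _worker(self):
        while True:
            item = self.tasks.get()
            if item is None:  # poison pill: this worker should exit
                self.tasks.task_done()
                break
            index, arg, results = item
            results[index] = self.func(arg)
            self.tasks.task_done()

    def my_map(self, args):
        # Apply self.func to every element of args on the pool;
        # results come back in the same order as the inputs.
        results = [None] * len(args)
        for i, arg in enumerate(args):
            self.tasks.put((i, arg, results))
        self.tasks.join()
        return results

    def _del(self):
        # Shut the pool down: one poison pill per worker, then join them.
        for _ in self.threads:
            self.tasks.put(None)
        for t in self.threads:
            t.join()

Under this reading, _del is only safe to call once every my_map call has returned, which is exactly the order run() follows: recursion_deep() finishes all crawling first, and _del() is the final teardown step.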