This article collects typical usage examples of the Python method threadPool.ThreadPool.startThreads. If you are wondering exactly what ThreadPool.startThreads does and how to use it, the curated code examples below may help. You can also look further into usage examples of the containing class, threadPool.ThreadPool.
The sections below present 15 code examples of ThreadPool.startThreads, ordered by popularity.
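Before the individual examples, note the lifecycle they all share: create a pool, call startThreads() before queueing work, feed tasks in with putTask(), poll getTaskLeft() until the queue drains, then call stopThreads(). The sketch below is illustrative only; it assumes the threadPool module used throughout these examples is importable and exposes exactly the methods the examples themselves call (ThreadPool(n), startThreads, putTask, getTaskLeft, stopThreads), and the fetch function is a hypothetical placeholder.

import time
from threadPool import ThreadPool  # the module all the examples below assume

def fetch(url):
    # hypothetical placeholder task; the real examples download and parse the url here
    print 'handling %s' % url

def run_pool(urls, thread_num=5):
    pool = ThreadPool(thread_num)   # pool with thread_num worker threads
    pool.startThreads()             # start the workers before queueing tasks
    for url in urls:
        pool.putTask(fetch, url)    # enqueue one task per url
    while pool.getTaskLeft() > 0:   # wait until every queued task has finished
        time.sleep(1)
    pool.stopThreads()              # shut the workers down

if __name__ == '__main__':
    run_pool(['http://example.com/%d' % i for i in range(10)])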
Example 1: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, threadnum, pathname, limit):
        '''limit is the number of images to fetch; pathname is the directory to save them in'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile', 'r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd() + '/' + pathname
        self._getUrl(limit)

    '''Create the target directory under the current directory'''
    def _makePath(self, pathname):
        if not os.path.isdir(os.getcwd() + '/' + pathname):
            os.mkdir(os.getcwd() + '/' + pathname)
        else:
            pass

    '''Read URLs from the file into the deque'''
    def _getUrl(self, num):
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()

    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue != deque([]):
            self.threadPool.putTask(self._handleTask, self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    '''Task handler'''
    def _handleTask(self, url):
        self._download(url)

    '''Download an image and name it with an increasing number'''
    def _download(self, url):
        retry = 2
        try:
            r = requests.get(url)
            with open(self.savaPath + '/' + str(self.count) + '.jpg', 'wb') as jpg:
                jpg.write(r.content)
            self.count += 1
            print url
        except Exception, e:
            if retry > 0:
                retry = retry - 1
                self._download(url)
Example 2: main
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
def main():
    threadPool = ThreadPool(5)
    threadPool.startThreads()
    f = codecs.open('tables/TopicInfo-all.txt', 'r', 'utf-8')  # read the file as unicode
    count = 0
    for line in f:
        line = line.strip()
        seg_list = line.split('[=]')
        if seg_list[1] == 'ustv':
            threadPool.putTask(task_handler, seg_list[0], seg_list)
            count += 1
    f.close()
    while threadPool.getTaskLeft() > 0:
        time.sleep(10)
        print 'Waiting to finish. Task left: %d' % threadPool.getTaskLeft()
    log.info('Number of topics in ustv: %d' % count)
Example 3: saveProxies
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')
    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
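A note on Example 3: besides startThreads()/putTask(), it drains one result per task through getTaskResult() while getTaskLeft() is non-zero, and stopThreads() is only reached if that loop completes normally. Assuming nothing beyond the ThreadPool interface already used above, the same pattern can be wrapped in try/finally so the workers are always stopped; this is a hedged sketch, not code from any of the listed projects.

from threadPool import ThreadPool  # assumed to expose the same methods used in Example 3

def run_and_collect(handler, items, thread_num=30):
    pool = ThreadPool(thread_num)
    pool.startThreads()
    try:
        for item in items:
            pool.putTask(handler, item)
        results = []
        while pool.getTaskLeft():
            # getTaskResult() is used exactly as in Example 3; the shape of the
            # returned value depends on the threadPool implementation
            results.append(pool.getTaskResult())
        return results
    finally:
        pool.stopThreads()  # workers are stopped even if result handling raises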
Example 4: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args=Strategy()):
        self.url = args.url
        self.max_depth = args.max_depth          # maximum crawl depth
        self.max_count = args.max_count          # maximum number of pages to crawl
        self.concurrency = args.concurrency      # number of threads
        self.timeout = args.timeout              # timeout
        self.cookies = args.cookies              # cookies
        self.ssl_verify = args.ssl_verify        # ssl
        self.same_host = args.same_host          # only follow links on the same host
        self.same_domain = args.same_domain      # only follow links in the same domain
        self.currentDepth = 1                    # initial crawl depth, starting from 1
        self.keyword = args.keyword              # keyword, decoded with the console's default encoding
        self.threadPool = ThreadPool(args.concurrency)  # thread pool with the given number of threads
        self.visitedHrefs = set()                # visited links
        self.unvisitedHrefs = deque()            # links waiting to be visited
        self.unvisitedHrefs.append(args.url)     # add the first link to visit
        self.isCrawling = False                  # whether the crawler has started working
        self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
        print self.file
        print 'args.url=\t', args.url
        #################
        # this line is problematic
        self.database = Database(args.dbFile)    # database
        # print 'hehe'
        self.lock = Lock()

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            pass
        if True:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth <= self.max_depth and len(self.visitedHrefs) <= self.max_count:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                counter = 0
                while self.threadPool.getTaskLeft() and counter < 600:
                    # print '>>taskleft:\t', self.threadPool.getTaskLeft()
                    # print self.threadPool.taskQueue.qsize()
                    # print self.threadPool.resultQueue.qsize()
                    # print self.threadPool.running
                    time.sleep(1)
                    counter += 1
                # self.threadPool.taskJoin()
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # self.database.close()

    def saveAllHrefsToFile(self, nonehtml=True):
        try:
            cf = CrawlerFile(url=self.url)
            contentlist = []
            hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs]
            for href in hrefs:
                if href.endswith('.html') and nonehtml:
                    continue
                contentlist.append(href)
            cf.saveSection('Hrefs', contentlist, coverfile=True)
            # fp = open(self.file, 'w')
            # fp.write('[Hrefs]' + os.linesep)
            # hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs]
            # rethrefs = []
            # print 'Totally ', len(hrefs), ' hrefs'
            # for href in hrefs:
            #     if href.endswith('.html'):
            #         continue
            #     rethrefs.append(href)
            #     fp.write(href + os.linesep)
            #     print href
            # print 'Totally ', len(rethrefs), ' aviable hrefs'
            # fp.close()
        except:
            pass

    def _getCrawlerPaths(self, url):
        ''' '''
        try:
            paths = []
# ... (the rest of this example is omitted) ...
Example 5: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args):
        self.depth = args.depth
        self.currentDepth = 1
        self.database = database(args.dbFile)
        self.threadPool = ThreadPool(args.threadNum)
        self.visitUrls = set()
        self.unvisitedUrls = deque()
        self.unvisitedUrls.append(args.url)
        self.isCrawling = False
        self.maxWebPages = args.maxWebPages

    def requestPage(self, url, retry=2):
        try:
            h = self.customerHeader(url)
            content = requests.get(url, headers=h, timeout=10)
            self.handleEncoding(content)
            if content.status_code == requests.codes.ok:
                if 'html' in content.headers['Content-Type']:
                    return content.text
            log.warning('Page not available. Status code:%d URL:%s\n' % (content.status_code, url))
        except Exception:
            if retry > 0:
                return self.requestPage(url, retry-1)
            else:
                log.debug('request Fail URL:%s' % url)
        return None

    def extractUrls(self, content, url):
        allUrls = self.getAllHrefs(url, content)
        for href in allUrls:
            if self.isHttpProtocol(href):
                if href not in self.visitUrls and href not in self.unvisitedUrls:
                    self.unvisitedUrls.append(href)

    def saveResult(self, content, url):
        self.database.saveWeb(url, content)

    def taskHandler(self, url):
        content = self.requestPage(url)
        self.saveResult(content, url)
        self.extractUrls(content, url)

    def assignTask(self):
        while self.unvisitedUrls:
            url = self.unvisitedUrls.popleft()
            self.threadPool.putTask(self.taskHandler, url)
            self.visitUrls.add(url)

    def start(self):
        print '\n Start Crawling\n'
        if self.database.con:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                self.assignTask()
                while self.threadPool.getAllTaskCount():
                    time.sleep(4)
                print 'Depth %d Finish' % self.currentDepth
                print 'Totally crawled %d links' % len(self.visitUrls)
                log.info('Depth %d Finish. Totally crawled %d links' % (self.currentDepth, len(self.visitUrls)))
                if len(self.visitUrls) > self.maxWebPages:
                    break
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def customerHeader(self, url):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
            'Referer': url,
        }
        return headers

    def getAllHrefs(self, url, content):
        hrefs = []
        s = bs(content)
        res = s.findAll('a', href=True)
        for r in res:
            href = r.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def isHttpProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
# ... (the rest of this example is omitted) ...
Example 6: CommentCrawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class CommentCrawler(object):
    def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path):
        """
        `group_id`          id of the group being crawled
        `topic_id_list`     list of topic ids to crawl
        `thread_num`        number of threads to start
        `topic_info_path`   file that stores topic information
        `comment_info_path` file that stores comment information
        """
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)
        # since each topic is saved to its own file, several save threads can run at once
        self.save_thread = ThreadPool(10)

        self.topic_info_path = topic_info_path
        self.comment_info_path = comment_info_path
        self.base_path = base_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # topic ids that failed to be crawled
        self.failed = set()

        # extract topic comments for each group in turn
        self.group_id = group_id
        self.topic_id_list = topic_id_list  # topics waiting to be crawled

        # results: topic ID ==> Topic object
        self.topic_dict = dict()
        # next comment page to process: topic ID ==> 1, 2, 3...
        self.next_page = dict()
        # topic ids that are already finished
        self.finished = set()

        self.is_crawling = False

        # maximum number of comments to crawl per topic
        # self.MAX_COMMETS_NUM = 5000
        self.MAX_COMMETS_NUM = float('inf')
        # number of comments per page
        self.COMMENTS_PER_PAGE = 100

    def start(self):
        print '\nStart Crawling comment list for group: ' + self.group_id + '...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()

        # open the output files
        self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8')
        self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8')

        self.topic_id_list = list(set(self.topic_id_list))  # remove duplicate topic ids
        print "Total topics in group %s: %d." % (self.group_id, len(self.topic_id_list))

        # queue the initial tasks
        for topic_id in self.topic_id_list:
            url = "http://www.douban.com/group/topic/" + topic_id + "/"
            self.thread_pool.putTask(self._taskHandler, url)
            # the next comment page looks like: http://www.douban.com/group/topic/35082953/?start=100
            self.next_page[topic_id] = 1

        # instead of crawling depth by depth, keep pushing tasks into the thread pool
        while True:
            # keep the number of pending tasks at roughly twice the number of threads
            print "Check threadPool queue..."
            while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2:
                # get the next link that needs to be visited
                url = self._getFutureVisit()
                if url is not None:
                    self.thread_pool.putTask(self._taskHandler, url)
                else:  # there is no next link
                    break
            # poll the thread pool queue every couple of seconds
            time.sleep(2)
            # check whether all topics are finished
            if len(self.finished) == len(self.topic_id_list):
                break
            elif len(self.finished) > len(self.topic_id_list):
                assert(False)
            print 'Total topics: %d, Finished topic: %d' % (len(self.topic_id_list), len(self.finished))
            remain = set(self.topic_id_list) - self.finished
            if len(remain) < 5:
                print 'Unfinished: ', remain

        # wait until every task in the thread pool is done
        print "Totally visited: ", len(self.visited_href)
        # pdb.set_trace()
        while self.thread_pool.getTaskLeft() > 0:
            print "Task left in threadPool: ", self.thread_pool.getTaskLeft()
            print "Task queue size: ", self.thread_pool.taskQueue.qsize()
            print "Running tasks: ", self.thread_pool.running
            time.sleep(2)
# ... (the rest of this example is omitted) ...
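Example 6 drops the depth-by-depth approach and instead keeps roughly twice as many queued tasks as worker threads (getTaskLeft() < threadNum * 2) while a generator of future visits produces new work. Stripped of the crawling details, and relying only on the pool attributes the example itself uses (threadNum, getTaskLeft, putTask), the backpressure loop looks roughly like the sketch below; next_item stands in for CommentCrawler._getFutureVisit and is an assumption of this sketch.

import time

def feed_with_backpressure(pool, handler, next_item, poll_seconds=2):
    # `next_item` is a callable that returns the next work item, or None when exhausted.
    exhausted = False
    while not (exhausted and pool.getTaskLeft() == 0):
        # top the queue up to about twice the number of worker threads, as in Example 6
        while not exhausted and pool.getTaskLeft() < pool.threadNum * 2:
            item = next_item()
            if item is None:
                exhausted = True
            else:
                pool.putTask(handler, item)
        time.sleep(poll_seconds)  # poll the queue every couple of seconds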
Example 7: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args, startURLs):
        # maximum crawl depth
        self.depth = args.depth
        # initial crawl depth, starting from 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        # self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # store group ids to file, using UTF-8
        self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # group ids already visited
        self.visitedGroups = set()
        # group ids waiting to be visited
        self.unvisitedGroups = deque()
        # information about all groups
        self.groupInfo = []
        self.lock = Lock()  # thread lock
        # whether the crawler has started working
        self.isCrawling = False
        # add the group home pages that have not been visited yet
        for url in startURLs:
            match_obj = REGroup.match(url)
            print "Add start urls:", url
            assert(match_obj != None)
            self.unvisitedGroups.append(match_obj.group(1))
        # maximum number of visits allowed per minute
        self.MAX_VISITS_PER_MINUTE = 10
        # number of pages already visited in the current period
        self.currentPeriodVisits = 0
        # treat one minute as one visit period and record when the current period started
        self.periodStart = time.time()  # initialize with the current time

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            self.periodStart = time.time()  # the current period starts now
            # crawl pages depth by depth
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                while self.threadPool.getTaskLeft() > 0:
                    print "Task left: ", self.threadPool.getTaskLeft()
                    time.sleep(3)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedGroups))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedGroups)))
                self.currentDepth += 1
            self.stop()
            assert(self.threadPool.getTaskLeft() == 0)
            print "Main Crawling procedure finished!"

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # save group ids to file
        for group_id in self.visitedGroups:
            self.groupfile.write(group_id + "\n")
        self.groupfile.close()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedGroups holds the links already handed to the taskQueue; some may still be in progress.
        # So the real number of visited links is len(visitedGroups) minus the number of pending tasks.
        if len(self.visitedGroups) == 0:
            return 0
        else:
            return len(self.visitedGroups) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        """Take a thread and assign it a task, i.e. fetch a page, while enforcing the visit rate limit."""
        # check whether the number of pages visited in this period has reached the maximum
        if self.currentPeriodVisits > self.MAX_VISITS_PER_MINUTE - 1:
            # wait until all pages currently being processed are done
            while self.threadPool.getTaskLeft() > 0:
                print "Waiting period ends..."
                time.sleep(1)
            timeNow = time.time()
            seconds = timeNow - self.periodStart
            if seconds < 60:  # if less than a minute has passed, sleep
                time.sleep(int(seconds + 3))
            self.periodStart = time.time()  # reset the period start time
            self.currentPeriodVisits = 0
        # pop an entry from the unvisited list and assign it to a thread
# ... (the rest of this example is omitted) ...
Example 8: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
# ... (the beginning of this example, including the class definition, is omitted) ...
        return 0

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the page source with BeautifulSoup and collect the valid links'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # encode to utf8 to guard against non-ASCII (e.g. Chinese) links
            href = a.get('href').encode('utf8')
            if not href.strip().startswith('http'):  # strip surrounding whitespace first
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        '''Only handle http and https links'''
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            if not(self.url in href):
                return False
            if '.jpg' in href:
                return False
            return True
        return False

    def _isHrefRepeated(self, href):
        '''Drop links that have already been seen'''
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _addUnvisitedHrefs(self, my_web):
        '''Add links that have not been visited yet'''
        url, pageSource = my_web.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def getAlreadyVisitedNum(self):
        '''Number of pages that have already been visited'''
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _taskHandler(self, url):
        '''Functions starting with _ are put on the queue for the worker threads'''
        my_web = WebPage(url)
        # print my_web.fetch()
        if my_web.fetch():
            # print 'has visited %s' % url
            self._saveTaskResults(my_web)
            self._addUnvisitedHrefs(my_web)

    def _assignCurrentDepthTasks(self):
        '''Assign tasks; this does not block'''
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # hand the task to the task queue
            self.threadPool.putTask(self._taskHandler, url)
            self.visitedHrefs.add(url)

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def start(self):
        print '\nstart crawling', self.url
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth < self.depth + 1:
            # assign tasks (this does not block)
            self._assignCurrentDepthTasks()
            # wait for this depth level to finish
            # print 'sssssss'
            # self.threadPool.taskJoin()
            while self.threadPool.getTaskLeft():
                # print self.threadPool.taskQueue.qsize()
                time.sleep(8)
            # print 'eeeeee'
            print 'depth %d finished. totally visited %d links.\n' % (self.currentDepth, len(self.visitedHrefs))
            log.info('depth %d finished. totally visited %d links.\n' % (self.currentDepth, len(self.visitedHrefs)))
            self.currentDepth += 1
        self.stop()

    def selfTesting(self):
        url = 'http://www.baidu.com'
        print '\nVisiting www.baidu.com using directly'
        my_web = WebPage(url)
        pageSource = my_web.fetch()
        # test the network connection
        if pageSource == None:
            print 'please check your network'
        elif not self.isDatabaseAvaliable():
            print 'please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            self._saveTaskResults(my_web)
            print 'save data successfully'
            print 'seems all is ok'
Example 9: PostIDCrawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class PostIDCrawler(object):
    def __init__(self, start_url, thread_num, post_list_path, max_post_num=1000):
        """
        `group_id`        id of the group to crawl
        `thread_num`      number of crawling threads
        `post_list_path`  path of the file that stores the list of post ids
        """
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)
        # thread that saves the topics
        # NOTE: only one save thread is allowed here, because they all write to the same file
        self.save_thread = ThreadPool(1)

        # group related information
        self.post_list_path = post_list_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # group discussion pages waiting to be visited
        self.unvisited_href = deque()
        # links that failed to load
        self.failed_href = set()

        self.start_url = start_url

        # crawling ends in one of two cases: 1) the maximum number of topics has been reached;
        # 2) every topic has been crawled
        # only the thread ids are stored
        self.post_list = list()

        self.is_crawling = False

        # maximum number of topics to crawl per group
        self.MAX_POST_NUM = max_post_num
        # self.MAX_POST_NUM = float('inf')
        # maximum number of topics shown per page; it seems a page does not always show 25 topics
        # self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart crawling post id list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()

        # open the output file
        self.post_list_file = codecs.open(self.post_list_path, 'w', 'utf-8')

        print "Add start url:", self.start_url
        self.unvisited_href.append(self.start_url)
        # Assign tasks: the thread pool downloads all pages concurrently (this does not block)
        self._assignInitTask()
        # Wait until the thread pool has finished all tasks before moving on to the next group.
        # self.thread_pool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
        while self.thread_pool.getTaskLeft() > 0:
            # print "Task left: ", self.thread_pool.getTaskLeft()
            # check whether enough thread ids have been collected
            if len(self.post_list) > self.MAX_POST_NUM:
                print 'Maximum number of posts reached; stopping the crawl.'
                break
            else:
                print 'Posts crawled so far:', len(self.post_list)
            time.sleep(3)

        # store the results and wait for the save thread to finish
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)

        log.info("Thread ID list crawling done.")

        self.stop()
        # there may still be pending tasks, but enough posts have been crawled by now
        # assert(self.thread_pool.getTaskLeft() == 0)

        # close the file
        self.post_list_file.close()
        print "Main Crawling procedure finished!"

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()

    def _assignInitTask(self):
        """Take a thread and assign it a task, i.e. fetch a page."""
        while len(self.unvisited_href) > 0:
            # pop a task from the unvisited list and assign it to a thread
            url = self.unvisited_href.popleft()
            self.thread_pool.putTask(self._taskHandler, url)
            # record the page as visited
            self.visited_href.add(url)

    def _taskHandler(self, url):
        """Fetch the page at the given url, applying the visit controls."""
        print "Visiting : " + url
        webPage = WebPage(url)
# ... (the rest of this example is omitted) ...
Example 10: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # initial crawl depth, starting from 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # add the first link to visit
        self.unvisitedHrefs.append(args.url)
        # whether the crawler has started working
        self.isCrawling = False

    def start(self):
        print ('\nStart Crawling\n')
        if not self._isDatabaseAvaliable():
            print ('Error: Unable to open database file.\n')
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print ('Depth %d Finish. Totally visited %d links. \n') % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds the links already handed to the taskQueue; some may still be in progress.
        # So the real number of visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # push the task onto the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid fetching it twice
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # fetching the page source and saving it are both blocking operations, so they run in worker threads
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # a case-insensitive regex search is (probably) faster than lower() followed by find
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())

    def _addUnvisitedHrefs(self, webPage):
        '''Add unvisited links: put valid urls into the unvisitedHrefs deque'''
        # filter the links: 1. keep only http/https pages; 2. make sure each link is visited only once
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the html source and return a list of all links on the page'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # links must be encoded as utf8: Chinese links such as http://aa.com/文件.pdf are not
            # url-encoded automatically by bs4, which would otherwise cause an encoding exception
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
# ... (the rest of this example is omitted) ...
Example 11: TopicCrawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class TopicCrawler(object):
    def __init__(self, group_id, thread_num, group_info_path, topic_list_path, max_topics_num=1000):
        """
        `group_id`         id of the group to crawl
        `thread_num`       number of crawling threads
        `group_info_path`  path of the file that stores the group's own information
        `topic_list_path`  path of the file that stores the list of topic ids
        """
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)
        # thread that saves the topics
        self.save_thread = ThreadPool(1)
        # thread that writes to the database
        # self.DBThread = ThreadPool(1)

        # group related information
        self.group_info_path = group_info_path
        self.topic_list_path = topic_list_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # group discussion pages waiting to be visited
        self.unvisited_href = deque()
        # links that failed to load
        self.failed_href = set()

        self.lock = Lock()  # thread lock

        self.group_id = group_id
        self.group_info = None  # models.Group

        # crawling ends in one of two cases: 1) the maximum number of topics has been reached;
        # 2) every topic has been crawled
        # only the topic ids are stored
        self.topic_list = list()

        self.is_crawling = False

        # self.database = Database("DoubanGroup.db")

        # maximum number of topics to crawl per group
        self.MAX_TOPICS_NUM = max_topics_num
        # self.MAX_TOPICS_NUM = float('inf')
        # maximum number of topics shown per page; it seems a page does not always show 25 topics
        # self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart Crawling topic list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()

        # open the output files
        self.group_info_file = codecs.open(self.group_info_path, 'w', 'utf-8')
        self.topic_list_file = codecs.open(self.topic_list_path, 'w', 'utf-8')

        url = "http://www.douban.com/group/" + group_id + "/"
        print "Add start url:", url
        self.unvisited_href.append(url)
        url = "http://www.douban.com/group/" + group_id + "/discussion?start=0"
        print "Add start urls:", url
        self.unvisited_href.append(url)

        # Assign tasks: the thread pool downloads all pages concurrently (this does not block)
        self._assignInitTask()
        # Wait until the thread pool has finished all tasks before moving on to the next group.
        # self.thread_pool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
        while self.thread_pool.getTaskLeft() > 0:
            # print "Task left: ", self.thread_pool.getTaskLeft()
            time.sleep(3)

        # store the results and wait for the save thread to finish
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)

        print "Storing crawled topic list for: " + group_id
        print "Save to files..."
        # self._saveTopicList()

        print "Processing done with group: " + group_id
        log.info("Topic list crawling done with group %s.", group_id)

        self.stop()
        assert(self.thread_pool.getTaskLeft() == 0)

        # close the files
        self.group_info_file.close()
        self.topic_list_file.close()
        print "Main Crawling procedure finished!"

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()

    def _assignInitTask(self):
        """Take a thread and assign it a task, i.e. fetch a page."""
# ... (the rest of this example is omitted) ...
Example 12: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # initial crawl depth, starting from 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # add the first link to visit
        self.unvisitedHrefs.append(args.url)
        # whether the crawler has started working
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvailable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds the links already handed to the taskQueue; some may still be in progress.
        # So the real number of visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # push the task onto the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid fetching it twice
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # fetching the page source and saving it are both blocking operations, so they run in worker threads
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # a case-insensitive regex search is (probably) faster than lower() followed by find
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
Example 13: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # initial crawl depth, starting from 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # add the first link to visit
        # self.unvisitedHrefs.append(args.url)
        # whether the crawler has started working
        self.isCrawling = False
        self.domainPattern = re.compile(r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")
        self.maxDomainSeeds = args.maxDomainSeeds
        self._initDomainSeedsList(args.domainSeeds)

    def _initDomainSeedsList(self, domainSeedsFile):
        fp = open(domainSeedsFile, 'r+')
        urlList = fp.readlines()
        for url in urlList:
            formattedUrl = self._formatUrl(url)
            if len(formattedUrl) > 0 and len(self.unvisitedHrefs) <= self.maxDomainSeeds:
                self.unvisitedHrefs.append(formattedUrl)
        fp.close()
        print 'We have got %d domain feeds.' % len(self.unvisitedHrefs)

    def _formatUrl(self, rawValue):
        rawValueStr = rawValue.strip().strip('\n')
        if len(rawValueStr) <= 0:
            return ''
        if not self.domainPattern.match(rawValueStr):
            return ''
        if not rawValueStr.startswith('http'):
            value = 'http://' + rawValueStr
        else:
            value = rawValueStr
        return value

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds the links already handed to the taskQueue; some may still be in progress.
        # So the real number of visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        mylock.acquire()
        copiedUnvisitedHrefs = deque()
        while self.unvisitedHrefs:
            copiedUnvisitedHrefs.append(self.unvisitedHrefs.popleft())
        mylock.release()
        while copiedUnvisitedHrefs:
            url = copiedUnvisitedHrefs.popleft()
            # mark the link as visited (or about to be visited) to avoid fetching it twice
            self.visitedHrefs.add(url)
            # push the task onto the task queue
            self.threadPool.putTask(self._taskHandler, url)

    def _taskHandler(self, url):
        # fetching the page source and saving it are both blocking operations, so they run in worker threads
        webPage = WebPage(url)
        retry = 1
        if webPage.fetch(retry):
            print 'Visited URL : %s ' % url
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)
# ... (the rest of this example is omitted) ...
Example 14: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(threading.Thread):
    def __init__(self, args, queue):
        threading.Thread.__init__(self)
        # maximum crawl depth
        self.depth = args['depth']
        # initial crawl depth, starting from 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args['keyword'].decode(getdefaultlocale()[1])
        # database
        self.database = Database(db="bt_tornado")
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args['threadNum'])
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # add the links to visit
        for url in args['url']:
            self.unvisitedHrefs.append(url)
        # whether the crawler has started working
        self.isCrawling = False
        # allow or deny crawl url
        self.entryFilter = args['entryFilter']
        # allow to output back url
        self.yieldFilter = args['yieldFilter']
        self.callbackFilter = args['callbackFilter']
        self.db = args['db']
        self.collection = args['collection']
        # communication queue
        self.queue = queue

    def run(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages at the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the thread pool finishes all tasks; when the pool is empty, one depth level has been crawled.
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt it.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()
        # use the queue to communicate between threads
        self.queue.get()
        self.queue.task_done()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds the links already handed to the taskQueue; some may still be in progress.
        # So the real number of visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            if not self.__entry_filter(url):
                self.visitedHrefs.add(url)
                continue
            # push the task onto the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid fetching it twice
            self.visitedHrefs.add(url)

    def _callback_filter(self, webPage):
        # parse the web page to do sth
        url, pageSource = webPage.getDatas()
        for tmp in self.callbackFilter['List']:
            if re.compile(tmp, re.I | re.U).search(url):
                self.callbackFilter['func'](webPage)

    def _taskHandler(self, url):
        # fetching the page source and saving it are both blocking operations, so they run in worker threads
        webPage = WebPage(url)
        tmp = webPage.fetch()
        if tmp:
            self._callback_filter(webPage)
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        _id = md5(url).hexdigest()
        try:
# ... (the rest of this example is omitted) ...
Example 15: Crawler
# Required import: from threadPool import ThreadPool [as alias]
# Or: from threadPool.ThreadPool import startThreads [as alias]
class Crawler(object):
    """Main part, crawl the site"""
    def __init__(self, args):
        # crawl depth
        self.max_deepth = args['deepth']
        # current depth
        self.current_deepth = 1
        # thread management
        self.threadPool = ThreadPool(args['threads'])
        # database file to use
        self.dbfile = args['dbfile']
        # keyword
        self.keyword = args['keyword']
        # whether to run the self test
        self.testself = args['testself']
        # links to visit at the current level; a set removes duplicates
        self.unvisitedUrl = set()
        self.unvisitedUrl.add(args['url'])
        # visited links
        self.visitedUrl = set()
        self.q = Queue()
        # http header
        self.header = {
            'Accetpt': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accetpt-Encoding': 'gzip,deflate,sdch',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
        }
        # connect to the database
        self.connDB()
        self.isRunning = True

    def start(self):
        self.threadPool.startThreads()
        # loop while the current depth does not exceed the maximum
        while self.current_deepth <= self.max_deepth:
            self.taskQueue()
            while not self.q.empty():
                url = self.q.get()
                # add a job to the thread pool
                self.threadPool.addJob(self.getLinks, url)
            self.threadPool.workJoin()  # wait for all threads to finish
            self.current_deepth += 1
        # crawling finished
        self.isRunning = False
        self.closeDB()

    def fetchPage(self, url, retry=3):
        '''Fetch the page content'''
        try:
            self.r = requests.get(url, headers=self.header, timeout=3)
            if self.r.status_code == requests.codes.ok:
                source = self.r.text
                self.writeDB(url, source)
                return source
        except Exception, e:
            if retry > 0:
                return self.fetchPage(url, retry-1)
            else:
                logging.error('Open failed for 3 time: %s' % url)