This article collects typical usage examples of the threadPool.ThreadPool class in Python. If you have been wondering what the ThreadPool class is for or how to use it, the curated examples below may help.
The following shows 15 code examples of the ThreadPool class, sorted by popularity by default.
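Before the individual examples, the sketch below shows the pattern most of them follow: one pool of worker threads for fetching, plus a dedicated single-thread pool that serializes writes to a shared file. This is a minimal, runnable analogue using the standard library's ThreadPoolExecutor as a stand-in, because the task-submission API of the custom threadPool.ThreadPool class is not shown here and varies between the projects below (the vdsm examples, for instance, call queueTask).

# Minimal sketch of the "crawl pool + single save thread" pattern, using
# concurrent.futures.ThreadPoolExecutor as a stand-in for threadPool.ThreadPool.
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # placeholder for a real fetch/parse task
    return url, len(url)

def save(result, path="results.txt"):
    # executed only by the single save thread, so the file is never
    # written by two threads at once
    with open(path, "a") as f:
        f.write("%s\t%d\n" % result)

crawl_pool = ThreadPoolExecutor(max_workers=4)  # worker threads
save_pool = ThreadPoolExecutor(max_workers=1)   # single save thread

for url in ["http://example.com/a", "http://example.com/b"]:
    future = crawl_pool.submit(fetch, url)
    future.add_done_callback(lambda f: save_pool.submit(save, f.result()))

crawl_pool.shutdown(wait=True)
save_pool.shutdown(wait=True)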
Example 1: __init__
def __init__(self, start_url, thread_num, post_list_path, max_post_num = 1000):
"""
`start_url` the starting URL to crawl
`thread_num` number of crawler threads
`post_list_path` path of the file that stores the list of all post ids
`max_post_num` maximum number of posts to crawl
"""
# Thread pool with the specified number of threads
self.thread_pool = ThreadPool(thread_num)
# Thread for saving topics
# NOTE: only one save thread is allowed here, because they all write to the same file
self.save_thread = ThreadPool(1)
# Path for saving group-related information
self.post_list_path = post_list_path
# Pages already visited: Group id ==> True or False
self.visited_href = set()
# Group discussion pages waiting to be visited
self.unvisited_href = deque()
# Links of pages that failed to be fetched
self.failed_href = set()
self.start_url = start_url
# Crawling stops in one of two cases: 1) the maximum number of topics has been reached; 2) all topics have been crawled
# Only thread ids are stored
self.post_list = list()
self.is_crawling = False
# Maximum number of posts to crawl per group
self.MAX_POST_NUM = max_post_num
Example 2: __init__
def __init__(self, groupID, topicIDList, threadNum, topic_info_path, comment_info_path):
"""
`groupID` id of the current group
`topicIDList` list of topic ids to crawl
`threadNum` number of threads to start
`topic_info_path` file for storing topic information
`comment_info_path` file for storing comment information
"""
# Thread pool with the specified number of threads
self.threadPool = ThreadPool(threadNum)
# Thread for writing to the database
#self.DBThread = ThreadPool(1)
# Ensures only one thread writes the file at a time
self.saveThread = ThreadPool(1)
self.database = Database("DoubanGroup.db")
#self.database = Database("test.db")
self.topic_info_path = topic_info_path
self.comment_info_path = comment_info_path
# Pages already visited: Group id ==> True or False
self.visitedHref = set()
# Topic ids that failed to be crawled
self.failed = set()
# Extract topic comments for each group in turn
self.groupID = groupID
self.topicIDList = topicIDList # list of topics waiting to be crawled
# Crawl results
# topic ID ==> Topic object
self.topicDict = dict()
# Next comment page to process for each topic: topic ID ==> 1,2,3...
self.nextPage = dict()
# Set of topic ids that have been fully crawled
self.finished = set()
self.visitedHref = set() # pages already visited
self.isCrawling = False
# Maximum number of comments to crawl per topic
#self.MAX_COMMETS_NUM = 5000
self.MAX_COMMETS_NUM = float('inf')
# Number of comments per page
self.COMMENTS_PER_PAGE = 100
Example 3: __init__
def __init__(self, args, queue):
threading.Thread.__init__(self)
# Maximum crawl depth
self.depth = args['depth']
# Current crawl depth, starting from 1
self.currentDepth = 1
# Keyword, decoded with the console's default encoding
self.keyword = args['keyword'].decode(getdefaultlocale()[1])
# Database
self.database = Database(db="bt_tornado")
# Thread pool with the specified number of threads
self.threadPool = ThreadPool(args['threadNum'])
# Links already visited
self.visitedHrefs = set()
# Links waiting to be visited
self.unvisitedHrefs = deque()
# Seed the queue of links to visit
for url in args['url']:
self.unvisitedHrefs.append(url)
# Whether the crawler has started running
self.isCrawling = False
# filter that allows or denies URLs to crawl
self.entryFilter = args['entryFilter']
# filter that selects which URLs are yielded back
self.yieldFilter = args['yieldFilter']
# callback filter
self.callbackFilter = args['callbackFilter']
# target database and collection
self.db = args['db']
self.collection = args['collection']
# communication queue
self.queue = queue
Example 4: __init__
def __init__(self, pool, maxHostID, monitorInterval=2):
self._messageTypes = {}
# Save arguments
self._stop = False
self._stopped = False
self._poolID = str(pool.spUUID)
self._spmStorageDir = pool.storage_repository
tpSize = config.getint('irs', 'thread_pool_size') / 2
waitTimeout = 3
maxTasks = config.getint('irs', 'max_tasks')
self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
# *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
# versa *** #
self._inbox = os.path.join(self._spmStorageDir, self._poolID,
"mastersd", sd.DOMAIN_META_DATA, "inbox")
if not os.path.exists(self._inbox):
self.log.error("SPM_MailMonitor create failed - inbox %s does not "
"exist" % repr(self._inbox))
raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
"not exist" % repr(self._inbox))
self._outbox = os.path.join(self._spmStorageDir, self._poolID,
"mastersd", sd.DOMAIN_META_DATA, "outbox")
if not os.path.exists(self._outbox):
self.log.error("SPM_MailMonitor create failed - outbox %s does "
"not exist" % repr(self._outbox))
raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
"does not exist" % repr(self._outbox))
self._numHosts = int(maxHostID)
self._outMailLen = MAILBOX_SIZE * self._numHosts
self._monitorInterval = monitorInterval
# TODO: add support for multiple paths (multiple mailboxes)
self._outgoingMail = self._outMailLen * "\0"
self._incomingMail = self._outgoingMail
self._inCmd = ['dd',
'if=' + str(self._inbox),
'iflag=direct,fullblock',
'count=1'
]
self._outCmd = ['dd',
'of=' + str(self._outbox),
'oflag=direct',
'iflag=fullblock',
'conv=notrunc',
'count=1'
]
self._outLock = threading.Lock()
self._inLock = threading.Lock()
# Clear outgoing mail
self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
"%s", self._outCmd)
cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
(rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
if rc:
self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
"dd failed")
t = concurrent.thread(self.run, name="mailbox.SPMMonitor",
logger=self.log.name)
t.start()
self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
Example 5: __init__
def __init__(self, args):
# Maximum crawl depth
self.max_deepth = args['deepth']
# Current depth
self.current_deepth = 1
# Thread pool management
self.threadPool = ThreadPool(args['threads'])
# Database file for storage
self.dbfile = args['dbfile']
# Keyword
self.keyword = args['keyword']
# Whether to run a self-test
self.testself = args['testself']
# Links to visit at the current depth; a set is used for deduplication
self.unvisitedUrl = set()
self.unvisitedUrl.add(args['url'])
# Links already visited
self.visitedUrl = set()
self.q = Queue()
# http header
self.header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate,sdch',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
}
# Connect to the database
self.connDB()
self.isRunning = True
Example 6: __init__
def __init__(self, args=Strategy()):
self.url = args.url
self.max_depth = args.max_depth # maximum crawl depth
self.max_count = args.max_count # maximum number of pages to crawl
self.concurrency = args.concurrency # number of threads
self.timeout = args.timeout # timeout
self.cookies = args.cookies # cookies
self.ssl_verify = args.ssl_verify # ssl verification
self.same_host = args.same_host # whether to crawl only links on the same host
self.same_domain = args.same_domain # whether to crawl only links in the same domain
self.currentDepth = 1 # current crawl depth, starting from 1
self.keyword = args.keyword # keyword, decoded with the console's default encoding
self.threadPool = ThreadPool(args.concurrency) # thread pool with the specified number of threads
self.visitedHrefs = set() # links already visited
self.unvisitedHrefs = deque() # links waiting to be visited
self.unvisitedHrefs.append(args.url) # add the first link to visit
self.isCrawling = False # whether the crawler has started running
self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
print self.file
print 'args.url=\t',args.url
#################
# FIXME: the following line is problematic
self.database = Database(args.dbFile) # database
# print 'hehe'
self.lock = Lock()
Example 7: __init__
class TaskManager:
log = logging.getLogger('TaskManager')
def __init__(self, tpSize=config.getfloat('irs', 'thread_pool_size'), waitTimeout=3, maxTasks=config.getfloat('irs', 'max_tasks')):
self.storage_repository = config.get('irs', 'repository')
self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
self._tasks = {}
self._unqueuedTasks = []
def queue(self, task):
return self._queueTask(task, task.commit)
def queueRecovery(self, task):
return self._queueTask(task, task.recover)
def _queueTask(self, task, method):
try:
self.log.debug("queueing task: %s", task.id)
self._tasks[task.id] = task
if not self.tp.queueTask(task.id, method):
self.log.error("unable to queue task: %s", task.dumpTask())
del self._tasks[task.id]
raise se.AddTaskError()
self.log.debug("task queued: %s", task.id)
except Exception, ex:
self.log.error("Could not queue task, encountered: %s", str(ex))
raise
return task.id
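The TaskManager above submits each task's commit (or recover) method to its thread pool via queueTask. Below is a minimal usage sketch, under the assumption that the surrounding vdsm modules (config, threadPool, storage exceptions) are importable; FakeTask is a hypothetical stand-in, not vdsm's real Task class.

# Hypothetical usage of the TaskManager shown above; FakeTask is a stand-in,
# not vdsm's real Task class, and the config-driven pool defaults are assumed
# to be available in the environment.
class FakeTask(object):
    def __init__(self, task_id):
        self.id = task_id
    def commit(self, args=None):
        print("committing task %s" % self.id)   # runs on a pool worker thread
    def recover(self, args=None):
        print("recovering task %s" % self.id)   # runs on a pool worker thread
    def dumpTask(self):
        return "<FakeTask %s>" % self.id

tm = TaskManager()                    # pool size and max tasks come from config
tm.queue(FakeTask("task-1"))          # queues FakeTask.commit
tm.queueRecovery(FakeTask("task-2"))  # queues FakeTask.recover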
Example 8: __init__
def __init__(self,
tpSize=config.getint('irs', 'thread_pool_size'),
waitTimeout=3,
maxTasks=config.getint('irs', 'max_tasks')):
self.storage_repository = config.get('irs', 'repository')
self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
self._tasks = {}
self._unqueuedTasks = []
Example 9: __init__
def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path):
"""
`group_id` id of the current group
`topic_id_list` list of topic ids to crawl
`thread_num` number of threads to start
`base_path` base directory for the crawl results
`topic_info_path` file for storing topic information
`comment_info_path` file for storing comment information
"""
# Thread pool with the specified number of threads
self.thread_pool = ThreadPool(thread_num)
# Different topics are now saved to different files, so saving can happen concurrently
self.save_thread = ThreadPool(10)
self.topic_info_path = topic_info_path
self.comment_info_path = comment_info_path
self.base_path = base_path
# Pages already visited: Group id ==> True or False
self.visited_href = set()
# Topic ids that failed to be crawled
self.failed = set()
# Extract topic comments for each group in turn
self.group_id = group_id
self.topic_id_list = topic_id_list # list of topics waiting to be crawled
# Crawl results
# topic ID ==> Topic object
self.topic_dict = dict()
# Next comment page to process for each topic: topic ID ==> 1,2,3...
self.next_page = dict()
# Set of topic ids that have been fully crawled
self.finished = set()
self.is_crawling = False
# Maximum number of comments to crawl per topic
#self.MAX_COMMETS_NUM = 5000
self.MAX_COMMETS_NUM = float('inf')
# Number of comments per page
self.COMMENTS_PER_PAGE = 100
Example 10: __init__
def __init__(self,url,threadnum,limit):
#self.database = Database('pichref.sql')
self.file = PicFile('imgfile','a')
self.threadPool = ThreadPool(threadnum)
self.unaccesshref = deque() # double-ended queue of links not yet visited
self.accessedhref = set() # set of links already visited
self.unaccesshref.append(url) # add the initial link
self.limit = limit
self.picUrlCount = 1
Example 11: __init__
def __init__(self, args):
self.depth = args.depth
self.currentDepth = 1
self.database = database(args.dbFile)
self.threadPool = ThreadPool(args.threadNum)
self.visitUrls = set()
self.unvisitedUrls = deque()
self.unvisitedUrls.append(args.url)
self.isCrawling = False
self.maxWebPages = args.maxWebPages
Example 12: __init__
def __init__(self,threadnum,pathname,limit):
'''`limit` specifies the number of images; `pathname` specifies the directory to save them in'''
super(Crawler, self).__init__()
self.threadPool = ThreadPool(threadnum)
self.file = PicFile('imgfile','r')
self.urlqueue = deque()
self.count = 1
self._makePath(pathname)
self.savaPath = os.getcwd()+'/'+pathname
self._getUrl(limit)
Example 13: __init__
def __init__(
self,
tpSize=config.getfloat("irs", "thread_pool_size"),
waitTimeout=3,
maxTasks=config.getfloat("irs", "max_tasks"),
):
self.storage_repository = config.get("irs", "repository")
self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
self._tasks = {}
self._unqueuedTasks = []
Example 14: __init__
def __init__(self, group_id, thread_num, group_info_path, topic_list_path, max_topics_num = 1000):
"""
`group_id` the group id to crawl
`thread_num` number of crawler threads
`group_info_path` path of the file storing the group's own information
`topic_list_path` path of the file storing the list of all topic ids
"""
# Thread pool with the specified number of threads
self.thread_pool = ThreadPool(thread_num)
# Thread for saving topics
self.save_thread = ThreadPool(1)
# Thread for writing to the database
#self.DBThread = ThreadPool(1)
# Paths for saving group-related information
self.group_info_path = group_info_path
self.topic_list_path = topic_list_path
# Pages already visited: Group id ==> True or False
self.visited_href = set()
# Group discussion pages waiting to be visited
self.unvisited_href = deque()
# Links of pages that failed to be fetched
self.failed_href = set()
self.lock = Lock() # thread lock
self.group_id = group_id
self.group_info = None # models.Group
# Crawling stops in one of two cases: 1) the maximum number of topics has been reached; 2) all topics have been crawled
# Only topic ids are stored
self.topic_list = list()
self.is_crawling = False
# self.database = Database("DoubanGroup.db")
# Maximum number of topics to crawl per group
self.MAX_TOPICS_NUM = max_topics_num
Example 15: __init__
def __init__(self, section_id, post_id_list, crawler_thread_num, save_thread_num, post_base_path):
"""
`section_id` name of the Tianya board/section
`post_id_list` list of post ids to crawl
`crawler_thread_num` number of crawler threads to start
`save_thread_num` number of save threads to start
`post_base_path` base directory for the crawl results; each post gets its own file, named after the post's id
"""
# Thread pool for fetching pages, with the specified number of threads
self.thread_pool = ThreadPool(crawler_thread_num)
# Different topics are saved to different files, so saving can happen concurrently
self.save_thread = ThreadPool(save_thread_num)
# Base path for saving crawl results
self.base_path = post_base_path
# Pages already visited: Group id ==> True or False
self.visited_href = set()
self.visited_post = set() # ids of posts already queued for visiting
self.finished = set() # set of topic ids that have been fully crawled
# Topic ids that failed to be crawled
self.failed = set()
# Extract topic comments for each section in turn
self.section_id = section_id
self.post_id_list = post_id_list # list of posts waiting to be crawled
self.current_post_id_list = list(post_id_list) # used to gradually feed post ids into the task list
# Crawl results
# topic ID ==> Topic object
self.post_dict = dict()
# Next comment page to process for each topic: topic ID ==> 1,2,3...
self.next_page = dict()
self.is_crawling = False
# Maximum number of comments to crawl per topic
#self.MAX_COMMETS_NUM = 1000
self.MAX_COMMETS_NUM = float('inf')