本文整理汇总了Python中models.Topic.isComplete方法的典型用法代码示例。如果您正苦于以下问题:Python Topic.isComplete方法的具体用法?Python Topic.isComplete怎么用?Python Topic.isComplete使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类models.Topic的用法示例。
在下文中一共展示了Topic.isComplete方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _taskHandler
# 需要导入模块: from models import Topic [as 别名]
# 或者: from models.Topic import isComplete [as 别名]
def _taskHandler(self, url):
    """Fetch ``url`` and process it as a topic page or a comment page.

    The URL is matched against ``RETopic`` (first page of a topic) and
    ``REComment`` (a later comment page of a topic). A first page creates
    a new ``Topic`` and stores it in ``self.topic_dict`` under its topic
    id; a later page is parsed into the ``Topic`` already stored there.
    Once ``Topic.isComplete()`` reports true, a save task is queued on
    ``self.save_thread`` and the topic id is added to ``self.finished``.

    Args:
        url: Address of the page to fetch.

    Returns:
        True when the page was fetched and handled (a URL matching neither
        pattern is only logged and still counts as handled). False when
        the fetch failed, or when a comment page has no usable ``Topic``
        in ``self.topic_dict``.
    """
    # Parenthesised single-argument form: prints the same text under
    # Python 2 and is also valid Python 3 syntax.
    print("Visiting : " + url)
    webPage = WebPage(url)
    fetched = webPage.fetch()
    match_obj = RETopic.match(url)
    match_obj2 = REComment.match(url)

    # Both stay None when the URL matches neither pattern. Binding them up
    # front keeps the bookkeeping below from raising UnboundLocalError in
    # that case.
    topic = None
    topic_id = None

    if not fetched:
        # A single failed page is enough to mark the whole topic as
        # failed and finished.
        if match_obj is not None:
            topic_id = match_obj.group(1)
        elif match_obj2 is not None:
            topic_id = match_obj2.group(1)
        else:
            log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.group_id))
        if topic_id is not None:
            # Some data of this topic may already have been recorded.
            self.failed.add(topic_id)
            self.finished.add(topic_id)
        self.visited_href.add(url)
        return False

    if match_obj is not None:
        # First page of a topic: create the Topic and register it.
        topic_id = match_obj.group(1)
        topic = Topic(topic_id, self.group_id)
        topic.parse(webPage, isFirstPage=True)
        self.topic_dict[topic_id] = topic
    elif match_obj2 is not None:
        # Later comment page: the first page must have been handled
        # already, otherwise the id is not a key of self.topic_dict.
        topic_id = match_obj2.group(1)
        if topic_id not in self.topic_dict:
            log.error('错误:必须先抽取第一页的评论数据:topic id: %s' % topic_id)
            self.failed.add(topic_id)
            self.finished.add(topic_id)
            return False
        topic = self.topic_dict[topic_id]
        if topic is None:
            log.error('未知程序错误:结束topic id为%s的抽取,释放内存。' % topic_id)
            self.topic_dict[topic_id] = None
            return False
        topic.parse(webPage, isFirstPage=False)
    else:
        log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.group_id))

    # When the topic is fully crawled, hand it to the save thread. The
    # guard skips this for URLs that matched neither pattern.
    if topic is not None and topic.isComplete():
        self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id)
        self.finished.add(topic_id)
        log.info('Topic: %s 抓取结束。' % topic_id)

    self.visited_href.add(url)
    return True
示例2: _taskHandler
# 需要导入模块: from models import Topic [as 别名]
# 或者: from models.Topic import isComplete [as 别名]
def _taskHandler(self, url):
    """Fetch ``url`` and process it as a topic page or a comment page.

    The URL is matched against ``RETopic`` (first page of a topic) and
    ``REComment`` (a later comment page of a topic). A first page creates
    a new ``Topic`` and stores it in ``self.topicDict`` under its topic
    id; a later page is parsed into the ``Topic`` already stored there.
    The comments returned by each parse are queued on ``self.saveThread``
    for saving. Once ``Topic.isComplete()`` reports true, the entry in
    ``self.topicDict`` is replaced by ``None`` to release the ``Topic``
    and the topic id is added to ``self.finished``.

    Args:
        url: Address of the page to fetch.

    Returns:
        True when the page was fetched and handled (a URL matching neither
        pattern is only logged and still counts as handled). False when
        the fetch failed, or when a comment page has no usable ``Topic``
        in ``self.topicDict``.
    """
    # Parenthesised single-argument form: prints the same text under
    # Python 2 and is also valid Python 3 syntax.
    print("Visiting : " + url)
    webPage = WebPage(url)
    fetched = webPage.fetch()
    match_obj = RETopic.match(url)
    match_obj2 = REComment.match(url)

    # Both stay None when the URL matches neither pattern. Binding them up
    # front keeps the bookkeeping below from raising UnboundLocalError in
    # that case.
    topic = None
    topic_id = None

    if not fetched:
        # A single failed page is enough to mark the whole topic as
        # failed and finished.
        if match_obj is not None:
            topic_id = match_obj.group(1)
        elif match_obj2 is not None:
            topic_id = match_obj2.group(1)
        else:
            log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID))
        if topic_id is not None:
            # Some data of this topic may already have been recorded.
            self.failed.add(topic_id)
            self.finished.add(topic_id)
        self.visitedHref.add(url)
        return False

    if match_obj is not None:
        # First page of a topic: create the Topic, register it, and queue
        # its comments for saving together with the topic itself.
        topic_id = match_obj.group(1)
        topic = Topic(topic_id, self.groupID)
        comment_list = topic.parse(webPage, True)
        self.topicDict[topic_id] = topic
        self.saveThread.putTask(self._save_handler, comment_list, topic=topic)
    elif match_obj2 is not None:
        # Later comment page: needs the Topic created from the first page.
        topic_id = match_obj2.group(1)
        if topic_id not in self.topicDict:
            log.error('未知程序错误:在topicDict字典中找不到topic id: %s' % topic_id)
            self.failed.add(topic_id)
            self.finished.add(topic_id)
            return False
        topic = self.topicDict[topic_id]
        if topic is None:
            # The entry was already released after the topic completed.
            log.error('未知程序错误:该topic已经抓取结束,已释放相关内存,topic id:%s' % topic_id)
            return False
        comment_list = topic.parse(webPage, False)
        self.saveThread.putTask(self._save_handler, comment_list, topic=None)
    else:
        log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID))

    # Release the Topic once it is fully crawled; memory use otherwise
    # grows with the number of topics. The guard skips this for URLs that
    # matched neither pattern.
    if topic is not None and topic.isComplete():
        self.topicDict[topic_id] = None
        self.finished.add(topic_id)
        log.info('Topic: %s 抓取结束。' % topic_id)

    self.visitedHref.add(url)
    return True