本文整理汇总了Python中pybloom.BloomFilter.clear_all方法的典型用法代码示例。如果您正苦于以下问题：Python BloomFilter.clear_all方法的具体用法？Python BloomFilter.clear_all怎么用？Python BloomFilter.clear_all使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloom.BloomFilter的用法示例。
在下文中一共展示了BloomFilter.clear_all方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: MySpider
# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import clear_all [as 别名]
#.........这里部分代码省略.........
self.__process_sub_content(result, insert_id)
self.__log('process the', str(num), 'reply page ... done')
self.__log('process reply page ... done')
def __process_titles_page(self, page_url):
    """Scrape one titles listing page and return the URL of the next one.

    Fetches ``page_url``, reads every ``tbody tr`` row after the first one,
    and for each title whose link is new to the bloom filter inserts a
    record into the ``titles`` table and then processes its reply pages.

    :param page_url: URL of the titles listing page to fetch.
    :return: ``self.basic_url`` joined with the href of the "next page"
        anchor, or ``None`` when the page has no such anchor.
    """
    self.__log('reading titles page .... start')
    req = self.requests.get(page_url)
    content = req.text
    soup = BeautifulSoup(content)
    # Collect every row, then drop the first one: it does not match the
    # title-row layout handled below.
    titles = soup.select('tbody tr')[1:]
    self.__log('reading titles page .... done')
    self.__log('processing all titles in', self.start_url, ' ... start')
    # Number the rows from 1 with enumerate so that the "start" and "done"
    # messages of a row always carry the same number.
    for counter, each in enumerate(titles, 1):
        self.__log('process the', counter, 'title', ' ... start')
        # BeautifulSoup also keeps the whitespace between adjacent <td>
        # tags as child nodes, so the real cells sit at the odd indexes.
        title_content = each.contents
        title_href = title_content[1].a.get('href')  # thread link href
        title_text = title_content[1].text.strip()  # title text
        title_author = title_content[3].a.text  # author
        title_click_num = title_content[5].text  # click count
        title_reply_number = title_content[7].text  # reply count
        title_time = title_content[9].get('title')  # time (cell's title attribute)
        sub_href = self.basic_url + title_href  # link to the thread page
        # Record that is inserted into the "titles" table.
        title_dict = {
            "reply_num": title_reply_number,
            "click_num": title_click_num,
            "author": title_author,
            "time": title_time,
            "link": sub_href,
            "text": title_text
        }
        # Duplicates are detected by the thread link alone.
        flag = sub_href
        # A falsy result from add() is treated as "not seen before".
        # NOTE(review): relies on the filter's add() reporting whether the
        # key was already present -- confirm against the filter library.
        if not self.bf.add(flag):
            self.__log('', flag, 'not in bloom filter')
            self.__log('insert to database ... start')
            insert_id = self.mysql.insert_data("titles", title_dict)
            self.__log('insert to database ... done')
            self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id))
        self.__log('process the', counter, 'title', ' ... done')
    # Link to the next listing page, if there is one.
    next_page_tag = soup.find('a', text='下一页')
    if not next_page_tag:
        return None
    return self.basic_url + next_page_tag.get('href')
# 清空bloom filter
def clean_bloom_filter(self):
    """Clear the bloom filter via ``clear_all()``, logging before and after."""
    log = self.__log
    log('clean all in bloom filter ... start')
    self.bf.clear_all()
    log('clean all in bloom filter ... done')
def bloom_filter_len(self):
    """Return ``len()`` of the bloom filter held in ``self.bf``."""
    size = len(self.bf)
    return size
def main(self, max_depth=2):
    """Run the spider from ``self.start_url`` and follow next-page links.

    The start page is always processed. Follow-up listing pages are then
    processed for as long as a next-page URL is returned and the page
    limit has not been reached. Start time, end time and the duration
    between them are printed / logged.

    :param max_depth: page limit; ``max_depth - 1`` follow-up pages are
        processed at most. The default of 2 keeps the previously
        hard-coded behaviour (exactly one follow-up page). Pass ``None``
        to follow links until no next page is found.
    """
    self.__log('spider ... start')
    self.__log('process start url ... running')
    next_page = self.__process_titles_page(self.start_url)
    self.__log('process start url ... done')
    # NOTE(review): the timer starts only after the start page has been
    # processed, so the logged duration excludes it -- confirm intended.
    start_time = self.__get_time()
    # Parenthesised single-argument print works as a Python 2 statement
    # and as a Python 3 function call, with the same output.
    print(start_time)
    depth = 1
    while next_page and (max_depth is None or depth < max_depth):
        self.__log('now it is the', str(depth), 'page')
        next_page = self.__process_titles_page(next_page)
        depth += 1
    end_time = self.__get_time()
    print(end_time)
    duration = self.__cal_time(start_time, end_time)
    self.__log('duration are', duration)
    self.__log('spider ... done')
def clean_table(self, table):
    """Ask the MySQL helper in ``self.mysql`` to clean ``table``."""
    database = self.mysql
    database.clean_table(table)
def test(self):
    """Add a fixed sample URL to the bloom filter and print what ``add`` returns."""
    test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
    added = self.bf.add(test_url)
    print(added)