This article collects typical usage examples of the Python method pybloomfilter.BloomFilter.clear_all. If you have been wondering what BloomFilter.clear_all does, how to call it, or what real code using it looks like, the hand-picked examples below should help. You can also read more about the class it belongs to, pybloomfilter.BloomFilter.
The sections below show 4 code examples of BloomFilter.clear_all, sorted by popularity by default.
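Before the examples, here is a minimal sketch of what clear_all does, assuming pybloomfilter's file-backed constructor; the capacity, error rate, and file name "example.bloom" are illustrative values only:
from pybloomfilter import BloomFilter

bf = BloomFilter(1000000, 0.01, "example.bloom")  # capacity, error rate, backing file
bf.add("http://example.com")
print("http://example.com" in bf)   # True
bf.clear_all()                      # reset the filter: every bit is cleared
print("http://example.com" in bf)   # False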
Example 1: URIBloomFilter
# Required module: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
# Likely imports for this example (a Scrapy dupe filter backed by pybloomfilter);
# the exact module paths are assumptions, since the original snippet omits them:
import logging

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
from pybloomfilter import BloomFilter


class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True  # log only the first duplicate unless debug is enabled
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        else:
            self.insert(fp)
            return False

    ### ------- todo ------- ###
    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def check(self, request):
        return request in self.bloom_filter_

    def insert(self, request):
        self.bloom_filter_.add(request)
        # print len(self.bloom_filter_)
        # print self.bloom_filter_.hash_seeds
        # print self.bloom_filter_.num_bits
        # print self.bloom_filter_.num_hashes

    def reset(self):
        self.bloom_filter_.clear_all()

    def save(self):
        pass

    def load(self):
        self.bloom_filter_.sync()
        # BloomFilter.open is a factory method, so rebind the attribute to the reopened filter
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
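To actually enable a dupe filter like this one in a Scrapy project, you would point DUPEFILTER_CLASS at it and define the two custom settings its constructor reads. A minimal settings sketch; the module path myproject.dupefilters and the concrete values are assumptions, not part of the original example:
# settings.py (hypothetical project layout)
DUPEFILTER_CLASS = 'myproject.dupefilters.URIBloomFilter'
DUPEFILTER_DEBUG = False
DUPEFILTER_CAPACITY = 10000000          # custom setting read in URIBloomFilter.__init__
DUPEFILTER_FILENAME = 'requests.bloom'  # mmap file backing the pybloomfilter BloomFilter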
Example 2: MySpider
# Required module: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
#......... part of the code is omitted here .........
            self.__process_sub_content(result, insert_id)
            self.__log('process the', str(num), 'reply page ... done')
        self.__log('process reply page ... done')
    def __process_titles_page(self, page_url):
        self.__log('reading titles page .... start')
        req = self.requests.get(page_url)
        content = req.text
        soup = BeautifulSoup(content)
        # grab all the title rows
        titles = soup.select('tbody tr')
        # drop the header row, which is not a real title
        titles = titles[1:]
        # process each title in turn
        self.__log('reading titles page .... done')
        self.__log('processing all titles in', self.start_url, ' ... start')
        counter = 1
        for each in titles:
            # get the title's tag info
            # note: in BeautifulSoup, whitespace between adjacent <td> tags is also a node,
            # so the contents indices below have to account for that
            self.__log('process the', counter, 'title', ' ... start')
            counter += 1
            title_content = each.contents
            title_href = title_content[1].a.get('href')  # title link
            title_text = title_content[1].text.strip()   # title text
            title_author = title_content[3].a.text       # author
            title_click_num = title_content[5].text      # click count
            title_reply_number = title_content[7].text   # reply count
            title_time = title_content[9].get('title')   # post time
            sub_href = self.basic_url + title_href       # absolute sub-link
            # build the title dict and insert the title
            title_dict = {
                "reply_num": title_reply_number,
                "click_num": title_click_num,
                "author": title_author,
                "time": title_time,
                "link": sub_href,
                "text": title_text
            }
            # for each in title_dict:
            #     print each
            #     print type(title_dict[each])
            # use the link address (and, if needed, the click count) to decide whether this is a duplicate
            # flag = sub_href + title_click_num
            flag = sub_href
            if not self.bf.add(flag):
                self.__log('', flag, 'not in bloom filter')
                self.__log('insert to database ... start')
                insert_id = self.mysql.insert_data("titles", title_dict)
                self.__log('insert to database ... done')
                self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id))
            self.__log('process the', counter, 'title', ' ... done')
        # link to the next page
        next_page_tag = soup.find('a', text='下一页')
        if next_page_tag:
            next_page = next_page_tag.get('href')
            next_page = self.basic_url + next_page
        else:
            next_page = None
        return next_page

    # clear the bloom filter
    def clean_bloom_filter(self):
        self.__log('clean all in bloom filter ... start')
        self.bf.clear_all()
        self.__log('clean all in bloom filter ... done')

    def bloom_filter_len(self):
        return len(self.bf)

    def main(self):
        self.__log('spider ... start')
        self.__log('process start url ... running')
        next_page = self.__process_titles_page(self.start_url)
        self.__log('process start url ... done')
        start_time = self.__get_time()
        print start_time
        depth = 1
        while next_page:
            # if depth == 2:
            #     break
            self.__log('now it is the', str(depth), 'page')
            next_page = self.__process_titles_page(next_page)
            depth += 1
        end_time = self.__get_time()
        print end_time
        duration = self.__cal_time(start_time, end_time)
        self.__log('duration is', duration)
        self.__log('spider ... done')

    def clean_table(self, table):
        self.mysql.clean_table(table)

    def test(self):
        test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
        print self.bf.add(test_url)
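Example 2 (and Example 4 below) relies on the return value of BloomFilter.add: in pybloomfilter, add returns True when the element was (probably) already present and False when it was newly added, so "if not self.bf.add(url)" means "this URL has not been seen before". A small sketch of that idiom; the file name is arbitrary:
from pybloomfilter import BloomFilter

seen = BloomFilter(100000, 0.01, "seen.bloom")
for url in ["http://a.example", "http://a.example", "http://b.example"]:
    if not seen.add(url):  # add() returns False for a brand-new element
        print("first visit: " + url)
    else:
        print("duplicate, skipping: " + url)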
Example 3: BloomFilter
# Required module: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
import time
import re
import Queue
import redis

from pybloomfilter import BloomFilter  # import named in the header comment above

# !!! db=1: use db 1 to store the seeds
r = redis.Redis(host='192.168.134.235', password='5#rFLwtg53&GzSjPbpb2', db=1)

done_sites_fname = 'done_sites.bin'
try:
    bfdone = BloomFilter.open(done_sites_fname)
except Exception:
    print "cannot open the filter file, creating it"
    bfdone = BloomFilter(2**23, 0.00001, done_sites_fname)  # capacity of 2**23 (about 8.4M) items
    bfdone.clear_all()

# first, sanity-check that none of the known URLs are already in the bloom filter
f = "urls_uniq.txt"
urls = open(f).read().strip().split('\n')
for url in urls:
    if url in bfdone:
        print "Error"
        exit(0)
print "BF is ok"

# here the id in each db increases from 1 to n, rather than sequentially across all dbs
cmd = "select id from mainpages"
Example 4: MySpider
# Required module: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
# Likely imports for this example; the paths for the project-local helpers are assumptions:
import os
import re
import time
import datetime

import requests
from bs4 import BeautifulSoup
from pybloomfilter import BloomFilter

import mysql  # project-local helper module providing mysql.Mysql() (not shown)


class MySpider(object):
    def __init__(self):
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests
        # use a bloom filter for dedup; reopen the dump file new_filter.bloom on every run
        if os.path.isfile("new_filter.bloom"):
            self.bf = BloomFilter.open("new_filter.bloom")
        else:
            self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")

    def __process_text(self, my_str):
        my_str = self.re.sub("http.*?html", "", my_str).encode("utf-8")
        if isinstance(my_str, unicode):  # the original compared type() against the string "unicode", which is never true
            my_str = my_str.encode("utf-8")
        # strip spaces (one of these was presumably a full-width space in the original) and newlines
        return my_str.replace(" ", "").replace(" ", "").replace("\n", "")

    def open_url(self, url):
        html = self.requests.get(url)
        code = html.encoding
        # print code
        content = html.content.decode(code, "ignore")
        soup = BeautifulSoup(content)
        return soup

    def process_content_page(self, url):
        soup = self.open_url(url)
        body = soup.find_all("p")
        # print soup.contents
        content = ""
        for each in body:
            content += each.text.strip()
            # print each.text.strip()
        return self.__process_text(content)

    def process_title_page(self, url):
        soup = self.open_url(url)
        result = soup.find("table", class_="mt12 p14")
        titles = result.find_all("tr")
        titles = titles[1:-1]
        # process each title row
        for each in titles:
            title_href = each.a.get("href")
            if not self.bf.add(title_href):
                text = each.text.strip()
                title_time = "20" + text[-14:] + ":00"
                content = text[1:-15].strip()
                print title_time + "\n" + content + "\n" + title_href
                title_text = self.process_content_page(title_href)
                # build the dict to insert
                title_dict = {
                    "link": title_href,
                    "title": content,
                    "text": title_text.decode("utf-8", "ignore"),
                    "time": title_time,
                }
                # insert into the database
                self.mysql.insert_data("gzrb_titles", title_dict)
        # find the next page
        result = soup.find("table", class_="mt12 p12")
        result = result.find_all("a")
        if len(result) == 1:
            next_page_href = result[0].get("href")
            # print result[0].text
        elif len(result) == 2:
            next_page_href = result[1].get("href")
            # print result[1].text
        else:
            next_page_href = None
        # print next_page_href
        return next_page_href

    def clean_bloom_filter(self):
        self.bf.clear_all()

    def process_nav(self, url):
        # for each title page, keep following the "next page" link while there is one
        next_page = self.process_title_page(url)
        # crawl depth
        depth = 1
        while next_page:
            if not self.bf.add(next_page):
                next_page = self.process_title_page(next_page)
            else:
                next_page = None
            if depth == 10:
                return
            depth += 1

    def main(self, start_url):
        soup = self.open_url(start_url)
        # get the navigation bar links
        nav = soup.find("div", class_="nav")
        result = nav.find_all("li")
        # drop the "#" placeholder links
#......... part of the code is omitted here .........