

Python BloomFilter.clear_all Method Code Examples

This article collects typical usage examples of the pybloomfilter.BloomFilter.clear_all method in Python. If you have been wondering how to call BloomFilter.clear_all, what it is used for, or what real-world examples look like, the hand-picked code examples below may help. You can also explore further usage examples of the pybloomfilter.BloomFilter class that the method belongs to.


Four code examples of the BloomFilter.clear_all method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
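Before diving into the examples, here is a minimal sketch of where clear_all fits in the pybloomfilter API (the capacity, error rate and the backing file name "seen.bloom" below are illustrative assumptions, not values taken from the examples):

from pybloomfilter import BloomFilter

# Create an mmap-backed filter: capacity, error rate, backing file (assumed values)
bf = BloomFilter(1000000, 0.01, "seen.bloom")

bf.add("http://example.com/a")             # add() reports whether the item was already present
was_seen = "http://example.com/a" in bf    # True: membership test after adding

bf.clear_all()                             # drop everything previously added; the filter is empty again
still_seen = "http://example.com/a" in bf  # False after clear_all()

# An existing backing file can be reopened later with the classmethod:
# bf = BloomFilter.open("seen.bloom")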

Example 1: URIBloomFilter

# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
    
    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)
    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        else:
            self.insert(fp)
            return False

    ###-------todo-------##
    def request_fingerprint(self, request):
        return request_fingerprint(request)
    
    def check(self, request):

        ret = request in self.bloom_filter_
        return ret
    
    def insert(self, request):
        self.bloom_filter_.add(request)
        #print len(self.bloom_filter_)
        #print self.bloom_filter_.hash_seeds
        #print self.bloom_filter_.num_bits
        #print self.bloom_filter_.num_hashes
    
    def reset(self):
        self.bloom_filter_.clear_all()
    
    def save(self):
        pass

    def load(self):
        self.bloom_filter_.sync()
        # BloomFilter.open is a classmethod, so rebind the attribute to the reopened filter
        self.bloom_filter_ = BloomFilter.open("bloom.dump")
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
Developer: wuwenjunwwj, Project: inst_spider, Lines of code: 59, Source file: bloom_filter.py
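For context, Example 1 is a Scrapy duplicate filter, so it would normally be wired in through the project settings. A minimal, hypothetical settings.py sketch follows (the module path myproject.bloom_filter and the file name are assumptions; DUPEFILTER_CLASS and DUPEFILTER_DEBUG are standard Scrapy settings, while DUPEFILTER_CAPACITY and DUPEFILTER_FILENAME are the custom keys read by the class above):

# settings.py (hypothetical module path)
DUPEFILTER_CLASS = 'myproject.bloom_filter.URIBloomFilter'
DUPEFILTER_CAPACITY = 10000000          # expected number of distinct requests (assumed value)
DUPEFILTER_FILENAME = 'requests.bloom'  # mmap-backed file used by pybloomfilter (assumed value)
DUPEFILTER_DEBUG = True                 # log every filtered duplicate request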

Example 2: MySpider

# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]

#......... part of the code is omitted here .........
                self.__process_sub_content(result, insert_id)
                self.__log('process the', str(num), 'reply page ... done')
        self.__log('process reply page ... done')

    def __process_titles_page(self, page_url):
        self.__log('reading titles page .... start')
        req = self.requests.get(page_url)
        content = req.text
        soup = BeautifulSoup(content)

        # Get all title rows
        titles = soup.select('tbody tr')
        # Drop the first row, which is not a title
        titles = titles[1:]
        # Process each title
        self.__log('reading titles page .... done')
        self.__log('processing all titles in', self.start_url, ' ... start')
        counter = 1
        for each in titles:
            # Get the tag info of the title row.
            # Note: in BeautifulSoup, the whitespace between adjacent <td> tags also shows up as nodes,
            # so the content indices below have to account for that.
            self.__log('process the', counter, 'title', ' ... start')
            counter += 1
            title_content = each.contents
            title_href = title_content[1].a.get('href')          # title link
            title_text = title_content[1].text.strip()           # title text
            title_author = title_content[3].a.text               # author
            title_click_num = title_content[5].text              # click count
            title_reply_number = title_content[7].text           # reply count
            title_time = title_content[9].get('title')           # post time
            sub_href = self.basic_url + title_href                # full URL of the thread
            # Build the title dict and insert the title
            title_dict = {
                "reply_num": title_reply_number,
                "click_num": title_click_num,
                "author": title_author,
                "time": title_time,
                "link": sub_href,
                "text": title_text
                    }
            # for each in title_dict:
            #    print each
            #    print type(title_dict[each])
            # Use the link address and reply count to check whether this title is a duplicate
            # flag = sub_href + title_click_num
            flag = sub_href
            if not (self.bf.add(flag)):
                self.__log('', flag, 'not in bloom filter')
                self.__log('insert to database ... start')

                insert_id = self.mysql.insert_data("titles", title_dict)
                self.__log('insert to database ... done')
                self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id))
            self.__log('process the', counter, 'title', ' ... done')

        # Link to the next page
        next_page_tag = soup.find('a', text='下一页')
        if next_page_tag:
            next_page = next_page_tag.get('href')
            next_page = self.basic_url + next_page
        else:
            next_page = None
        return next_page

    # Clear the bloom filter
    def clean_bloom_filter(self):
        self.__log('clean all in bloom filter ... start')
        self.bf.clear_all()
        self.__log('clean all in bloom filter ... done')

    def bloom_filter_len(self):
        return len(self.bf)

    def main(self):
        self.__log('spider ... start')
        self.__log('process start url ... running')
        next_page = self.__process_titles_page(self.start_url)
        self.__log('process start url ... done')
        start_time = self.__get_time()
        print start_time
        depth = 1
        while next_page:
            # if depth == 2:
            #    break
            self.__log('now it is the', str(depth), 'page')
            next_page = self.__process_titles_page(next_page)
            depth += 1
        end_time = self.__get_time()
        print end_time
        duration = self.__cal_time(start_time, end_time)
        self.__log('duration are', duration)
        self.__log('spider ... done')

    def clean_table(self, table):
        self.mysql.clean_table(table)

    def test(self):
        test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
        print self.bf.add(test_url)
Developer: mylinlan, Project: spider, Lines of code: 104, Source file: myspider.py

Example 3: BloomFilter

# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
import time
import re
import Queue
import redis

#!!! db=1 use db1 to store the seeds

r = redis.Redis(host='192.168.134.235',password='5#rFLwtg53&GzSjPbpb2', db=1)
done_sites_fname='done_sites.bin'

try:
    bfdone = BloomFilter.open(done_sites_fname)
except:
    print "can not open file, create it"
    bfdone = BloomFilter(2**23, 0.00001, done_sites_fname) #8M 
    bfdone.clear_all()


#first check the bf
f = "urls_uniq.txt"
urls = open(f).read().strip().split('\n')
for url in urls:
    if url in bfdone:
        print "Error"
        exit(0)

print "BF is ok"

# here the id in each db increases from 1 to n, rather than sequentially across all dbs
cmd = "select id from mainpages"
Developer: salmonx, Project: fengbei, Lines of code: 32, Source file: extractseeds.py

Example 4: MySpider

# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import clear_all [as alias]
class MySpider(object):
    def __init__(self):
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a bloom filter; the dump file is loaded from disk on each run
        if os.path.isfile("new_filter.bloom"):
            self.bf = BloomFilter.open("new_filter.bloom")
        else:
            self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")

    def __process_text(self, my_str):
        my_str = self.re.sub("http.*?html", "", my_str)
        # encode only while the string is still a unicode object (Python 2)
        if isinstance(my_str, unicode):
            my_str = my_str.encode("utf-8")
        return my_str.replace(" ", "").replace("\n", "")

    def open_url(self, url):
        html = self.requests.get(url)
        code = html.encoding
        # print code
        content = html.content.decode(code, "ignore")
        soup = BeautifulSoup(content)
        return soup

    def process_content_page(self, url):
        soup = self.open_url(url)
        body = soup.find_all("p")
        # print soup.contents
        content = ""
        for each in body:
            content += each.text.strip()
            # print each.text.strip()
        return self.__process_text(content)

    def process_title_page(self, url):
        soup = self.open_url(url)
        result = soup.find("table", class_="mt12 p14")
        titles = result.find_all("tr")
        titles = titles[1:-1]
        # Process each title
        for each in titles:
            title_href = each.a.get("href")
            if not self.bf.add(title_href):
                text = each.text.strip()
                title_time = "20" + text[-14:] + ":00"
                content = text[1:-15].strip()
                print title_time + "\n" + content + "\n" + title_href
                title_text = self.process_content_page(title_href)

                # Build the dict to insert
                title_dict = {
                    "link": title_href,
                    "title": content,
                    "text": title_text.decode("utf-8", "ignore"),
                    "time": title_time,
                }
                # Insert into the database
                self.mysql.insert_data("gzrb_titles", title_dict)

        # Get the next page
        result = soup.find("table", class_="mt12 p12")
        result = result.find_all("a")
        if len(result) == 1:
            next_page_href = result[0].get("href")
            # print result[0].text
        elif len(result) == 2:
            next_page_href = result[1].get("href")
            # print result[1].text
        else:
            next_page_href = None
        # print next_page_href
        return next_page_href

    def clean_bloom_filter(self):
        self.bf.clear_all()

    def process_nav(self, url):
        # For each title page, keep iterating while there is a next page
        next_page = self.process_title_page(url)
        # crawl depth
        depth = 1
        while next_page:
            if not self.bf.add(next_page):
                next_page = self.process_title_page(next_page)
            else:
                next_page = None
            if depth == 10:
                return
            depth += 1

    def main(self, start_url):
        soup = self.open_url(start_url)
        # Get the navigation bar links
        nav = soup.find("div", class_="nav")
        result = nav.find_all("li")
        # Drop the '#' placeholder links
#......... part of the code is omitted here .........
Developer: mylinlan, Project: spider, Lines of code: 103, Source file: gzrb.py


Note: The pybloomfilter.BloomFilter.clear_all examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please refer to the License of the corresponding project before distributing or using the code; do not reproduce without permission.