当前位置: 首页>>代码示例>>Python>>正文


Python BloomFilter.clear_all方法代码示例

本文整理汇总了Python中pybloom.BloomFilter.clear_all方法的典型用法代码示例。如果您正苦于以下问题:Python BloomFilter.clear_all方法的具体用法?Python BloomFilter.clear_all怎么用?Python BloomFilter.clear_all使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pybloom.BloomFilter的用法示例。


在下文中一共展示了BloomFilter.clear_all方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: MySpider

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import clear_all [as 别名]

#.........这里部分代码省略.........
                self.__process_sub_content(result, insert_id)
                self.__log('process the', str(num), 'reply page ... done')
        self.__log('process reply page ... done')

    def __process_titles_page(self, page_url):
        """Scrape one forum listing page.

        Parses every thread row on *page_url*, inserts threads not yet seen
        (per the bloom filter) into the ``titles`` MySQL table, crawls each
        new thread's reply pages, and returns the URL of the next listing
        page.

        :param page_url: absolute URL of the listing page to process
        :return: absolute URL of the next listing page, or ``None`` when
                 there is no "next page" link
        """
        self.__log('reading titles page .... start')
        req = self.requests.get(page_url)
        content = req.text
        # NOTE(review): no explicit parser is passed to BeautifulSoup, so the
        # "best available" installed parser is used — results can vary across
        # environments.
        soup = BeautifulSoup(content)

        # Collect every thread row of the listing table.
        titles = soup.select('tbody tr')
        # Drop the header row, which is not a thread.
        titles = titles[1:]
        self.__log('reading titles page .... done')
        # BUGFIX: log the page actually being processed; previously this
        # always printed self.start_url even on later pages.
        self.__log('processing all titles in', page_url, ' ... start')
        counter = 1
        for each in titles:
            # BeautifulSoup keeps the whitespace between adjacent <td> tags
            # as NavigableString children, so the .contents indices below
            # skip every other position (1, 3, 5, ...).
            self.__log('process the', counter, 'title', ' ... start')
            title_content = each.contents
            title_href = title_content[1].a.get('href')         # thread link (relative)
            title_text = title_content[1].text.strip()          # thread title
            title_author = title_content[3].a.text              # author name
            title_click_num = title_content[5].text             # click count
            title_reply_number = title_content[7].text          # reply count
            title_time = title_content[9].get('title')          # post time
            sub_href = self.basic_url + title_href              # absolute thread URL
            # Row to insert for this thread.
            title_dict = {
                "reply_num": title_reply_number,
                "click_num": title_click_num,
                "author": title_author,
                "time": title_time,
                "link": sub_href,
                "text": title_text
                    }
            # De-duplicate by thread URL only.
            # (Alternative considered: sub_href + title_click_num, which would
            # re-process a thread whenever its click count changed.)
            flag = sub_href
            if not (self.bf.add(flag)):
                self.__log('', flag, 'not in bloom filter')
                self.__log('insert to database ... start')

                insert_id = self.mysql.insert_data("titles", title_dict)
                self.__log('insert to database ... done')
                self.__process_reply_page(sub_href, title_author.encode('utf-8'), title_time, str(insert_id))
            # BUGFIX: increment AFTER the "done" log so the start/done log
            # lines of one thread carry the same number (the counter was
            # previously bumped between them).
            self.__log('process the', counter, 'title', ' ... done')
            counter += 1

        # Resolve the "next page" link, if any.
        next_page_tag = soup.find('a', text='下一页')
        if next_page_tag:
            next_page = next_page_tag.get('href')
            next_page = self.basic_url + next_page
        else:
            next_page = None
        return next_page

    # Wipe the de-duplication state entirely.
    def clean_bloom_filter(self):
        """Remove every recorded entry from the spider's bloom filter."""
        log = self.__log
        log('clean all in bloom filter ... start')
        self.bf.clear_all()
        log('clean all in bloom filter ... done')

    def bloom_filter_len(self):
        """Return how many items the bloom filter currently holds."""
        current_size = len(self.bf)
        return current_size

    def main(self):
        self.__log('spider ... start')
        self.__log('process start url ... running')
        next_page = self.__process_titles_page(self.start_url)
        self.__log('process start url ... done')
        start_time = self.__get_time()
        print start_time
        depth = 1
        while next_page:
            if depth == 2:
                break
            self.__log('now it is the', str(depth), 'page')
            next_page = self.__process_titles_page(next_page)
            depth += 1
        end_time = self.__get_time()
        print end_time
        duration = self.__cal_time(start_time, end_time)
        self.__log('duration are', duration)
        self.__log('spider ... done')

    def clean_table(self, table):
        """Truncate *table* through the spider's MySQL helper."""
        db = self.mysql
        db.clean_table(table)

    def test(self):
        test_url = 'http://bbs.tianya.cn/post-333-778768-1.shtml'
        print self.bf.add(test_url)
开发者ID:mylinlan,项目名称:spider,代码行数:104,代码来源:myspider_win.py


注:本文中的pybloom.BloomFilter.clear_all方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。