

Python pybloomfilter.BloomFilter Class Code Examples

This article collects typical usage examples of the pybloomfilter.BloomFilter class in Python. If you are wondering what BloomFilter is good for, or how it is used in real code, the curated class examples below should help.


The following presents 14 code examples of the BloomFilter class, ordered by popularity by default.
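All 14 examples lean on the same small pybloomfilter surface: construct a file-backed filter with BloomFilter(capacity, error_rate, filename), reopen it with BloomFilter.open(filename), and call add(), which returns True when the element was (probably) already present. A minimal sketch of that shared pattern (the file name demo.bloom is illustrative only):

    from pybloomfilter import BloomFilter

    # Create a new mmap-backed filter: capacity, error rate, backing file.
    bf = BloomFilter(100000, 0.001, 'demo.bloom')

    print(bf.add('http://example.com/'))   # False: first time seen
    print(bf.add('http://example.com/'))   # True: (probably) seen before
    print('http://example.com/' in bf)     # membership test
    print(len(bf))                         # number of elements added so far

    # A later run can reopen the same filter from its backing file.
    bf = BloomFilter.open('demo.bloom')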

Example 1: __init__

    def __init__(self, path=FILTER_PATH, debug=False):
        # Reuse the on-disk filter if it exists; otherwise create one
        # sized for 100,000 entries at a 0.001% error rate.
        if os.path.exists(path):
            self.url_filter = BloomFilter.open(path)
        else:
            print "created a new bloom filter."
            self.url_filter = BloomFilter(100000, 0.00001, path)
        super(DuplicateFilter, self).__init__(path, debug)
Author: JASON0916, Project: DianpingSpider, Lines: 7, Source: duplicate_filter.py
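Judging by the (path, debug) constructor signature, DuplicateFilter appears to subclass Scrapy's RFPDupeFilter, although the excerpt does not show the base class. If so, it would be enabled through the project settings; a sketch with a hypothetical module path:

    # settings.py -- module path is hypothetical
    DUPEFILTER_CLASS = 'dianpingspider.duplicate_filter.DuplicateFilter'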

Example 2: create_ref_bloom_filter

def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """From a given FASTA/FASTQ reference file, create a bloom filter file
    from fixed-length windows of the reference sequence.
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)

    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        it = file_it(handle)
        read_it = record(it)
        read_len = 109

        bf = BloomFilter(capacity, error_rate, bf_file)
        sequence = read_it.next()

        # Slide across the sequence in read_len steps; each window is
        # read_len - 1 characters long.
        i = 0
        while i < len(sequence):
            read = sequence[i:i + read_len - 1]
            i += read_len
            print(read)
            # Note: update() treats its argument as an iterable, so a string
            # is added character by character; bf.add(read) would store each
            # whole window instead.
            bf.update(read)

        bf.close()
Author: vals, Project: Boutonniere, Lines: 34, Source: boutonniere.py

Example 3: LinkFilter

class LinkFilter():

    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        if os.path.exists(self.file_index):
            self.bf_index = BloomFilter.open(self.file_index)
        else:
            self.bf_index = BloomFilter(100000000, 0.001, self.file_index)

        if os.path.exists(self.file_html):
            self.bf_html = BloomFilter.open(self.file_html)
        else:
            self.bf_html = BloomFilter(100000000, 0.001, self.file_html)

    def index_filter(self, links):
        # add() returns True for URLs already in the filter, so only
        # previously unseen links are kept.
        new_links = []
        for link in links:
            if not self.bf_index.add(link.url):
                new_links.append(link)
        return new_links

    def html_filter(self, links):
        new_links = []
        for link in links:
            if not self.bf_html.add(link.url):
                new_links.append(link)
        return new_links
Author: wangjie1991, Project: crawler, Lines: 30, Source: linkfilter.py
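A hypothetical usage sketch for LinkFilter, assuming each element of links exposes a .url attribute (as Scrapy's Link objects do):

    lf = LinkFilter('example.com')
    new_index_links = lf.index_filter(links)  # links never seen on index pages
    new_html_links = lf.html_filter(links)    # links never seen on HTML pages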

Example 4: main

def main():
    # Check for command line arguments
    if len(sys.argv) != 2:
        print 'Usage: %s [trace file]' % os.path.basename(sys.argv[0])
        sys.exit(1)

    # Read arguments from command line
    inFile = sys.argv[1]

    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')

    outputFileName = "converted-" + sys.argv[1]
    f = open(outputFileName, "a")

    for line in open(inFile, 'r'):
        if line[0:2] == "W," or line[0:2] == "R,":
            hash1 = int(hashlib.sha1(line[2:]).hexdigest(), 16) % (10 ** 10)
            hash2 = int(hashlib.md5(line[2:]).hexdigest(), 16) % (10 ** 10)
            # Both add() calls return True only when the address was seen before.
            if bf1.add(hash1) and bf2.add(hash2):
                f.write('%s,%d\n' % (line[0], hash1 * 10000))
            else:
                f.write('%s,%d\n' % (line[0], hash2 * 10000))
        elif line == '':
            break
        else:
            pass
    f.close()
Author: theopengroup, Project: EAD, Lines: 31, Source: convert.py

Example 5: __init__

    def __init__(self, seeds, done_que, run_que):

        self.showpercounts = 10
        self.timeout = 5
        self.starttime = time.time()
        self.oldtime = 0

        self.quit = 0
        self.https_enable = 0

        self.run_que = run_que
        self.done_que = done_que
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()

        self.loadstate()

        self.blacklist = set(('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil', 'mail', '.google',
                              'weibo.com', 't.cn', 'wikipedia', 'facebook', 'twitter', 'dropbox'))
        self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz', 'me', 'name', 'cc', 'tv'))

        self.httpget = self.httpget_requests  # download method: self.httpget_requests | httpget_curl

        self.poolsize = 60
        self.poolmaxfree = 20
        self.freecount = 0
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.cbcputime = 0
        self.totaldownsize = 0

        self.curspeed = 0

        self.debugnosave = 1
        self.tt = 1

        # Reopen the persistent "done" filter if possible; otherwise create an
        # ~8 MB filter sized for 2**23 URLs at a 10**-5 error rate.
        self.done_sites_fname = 'done_sites.bin'
        try:
            self.bfdone = BloomFilter.open(self.done_sites_fname)
        except:
            self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname)

        if self.run_que.qsize() == 0:
            for seed in seeds:
                self.run_que.put(seed.split("http://")[1])

        if self.https_enable == 0:
            self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)', re.I)
        else:
            self.urlpatern = re.compile(r'href=["\']https?://([^/?#\"\']+)', re.I)
Author: salmonx, Project: crawler, Lines: 55, Source: gevent_redis_multiprocess.py

Example 6: __init__

    def __init__(self):
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a bloom filter; reload new_filter.bloom from disk when it exists
        if os.path.isfile("new_filter.bloom"):
            self.bf = BloomFilter.open("new_filter.bloom")
        else:
            self.bf = BloomFilter(10000000, 0.01, "new_filter.bloom")
Author: mylinlan, Project: spider, Lines: 12, Source: gzrb.py

Example 7: __init__

  def __init__(self, node_n, seen_persist, Q_logs=None):
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0

    # single variable for tracking whether node should be active or not
    self.active = True
    
    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()

    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}
    
    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
      try:
        self.seen = BloomFilter.open(BF_FILENAME)
      except:
        self.Q_logs.put('Error opening bloom filter, creating new one')
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
      self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)

    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}

    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seed_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()

    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()

    # active url count queue- for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()

    # thread active url dict- a dict of active urls by thread using, for restart dump
    # { thread_name: active_url }
    # NOTE: there are problems with this methodology, but errors will only
    # lead to data redundancy (as opposed to omission)...
    self.thread_active = {}
    
    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
Author: abresler, Project: RL-crawler, Lines: 53, Source: urlFrontier.py

Example 8: dedup

def dedup(fname):
    # Filter sized for 1e8 entries at a 1% error rate (no backing file given).
    bf = BloomFilter(int(1e8), 0.01)

    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                splitLine = line.split('\t')
                description = splitLine[5]
                # Skip lines whose description digest was already seen.
                if bf.add(md5.new(description).digest()):
                    continue
                else:
                    fout.write(line)
Author: jisaacso, Project: team-thorn, Lines: 12, Source: deduper.py
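pybloomfilter hashes its arguments internally, so the explicit md5 digest above is a design choice that normalizes long descriptions to a fixed 16 bytes rather than a requirement; bf.add(description) would work as well. A usage sketch with a hypothetical input file:

    dedup('listings.tsv')  # rows unique by their sixth column land in deduped.tsv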

Example 9: create_bf

def create_bf():
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    keyDigest_list = []
    FILE = open(keyDigestFile, 'r')

    # The digest file is a flat sequence of fixed-length key digests.
    for i in range(count):
        keyDigest = FILE.read(keyDigestLen)
        keyDigest_list.append(keyDigest)

    FILE.close()

    for publicKeyID in keyDigest_list:
        bf.add(publicKeyID)
Author: enzocxt, Project: bloomfilter, Lines: 13, Source: bloomfilter.py

Example 10: __init__

    def __init__(self, start_url, basic_url):
        self.basic_url = basic_url
        self.start_url = start_url
        self.mysql = mysql.Mysql()
        self.re = re
        self.time = time
        self.datetime = datetime
        self.requests = requests

        # Deduplicate with a bloom filter; reload filter.bloom from disk when it exists
        if os.path.isfile('filter.bloom'):
            self.bf = BloomFilter.open('filter.bloom')
        else:
            self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
Author: mylinlan, Project: spider, Lines: 14, Source: myspider.py

Example 11: __init__

    def __init__(self):
        bc = config.get_boolmfilter_config()
        if os.path.exists(bc['bin_path']):
            self.bloomfilter = BloomFilter.open(bc['bin_path'])
        else:
            self.bloomfilter = BloomFilter(
                bc['capacity'], bc['wrong_rate'], bc['bin_path'])
Author: intohole, Project: mortred, Lines: 7, Source: utils.py
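config.get_boolmfilter_config() is project-specific and not shown in the excerpt; the keys it is read with suggest a mapping shaped like this hypothetical one:

    # hypothetical shape of config.get_boolmfilter_config()'s return value
    bc = {'bin_path': 'filter.bin', 'capacity': 1000000, 'wrong_rate': 0.01}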

Example 12: __init__

    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)
Author: wuwenjunwwj, Project: inst_spider, Lines: 7, Source: bloom_filter.py
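This dupe filter takes its capacity and backing file from the Scrapy settings object it is constructed with; a hypothetical configuration:

    # settings.py -- values are illustrative
    DUPEFILTER_CAPACITY = 10000000
    DUPEFILTER_FILENAME = 'filter.bloom'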

Example 13: __init__

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        self.db = MySQLdb.connect("localhost", "root", "", "storecount")
        self.cursor = self.db.cursor()
        self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
        sql1 = """CREATE TABLE POPULAR(URL text(512), COUNT_MARK INT);"""

        try:
            self.cursor.execute(sql1)
            self.db.commit()
        except:
            traceback.print_exc()
            self.db.rollback()
        self.mark = 0
Author: wybini, Project: search-engine, Lines: 27, Source: pipelines.py

Example 14: DuplicatesPipeline

class DuplicatesPipeline(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        if self.bf.add(item['url']):  # True if the URL was already in the filter
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self, url, utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the visited-sites log and finalize the search index."""
        self.f_write.close()
        self.si.IndexDone()
Author: PeinYu, Project: SearchEngine, Lines: 28, Source: pipelines.py
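DuplicatesPipeline is a Scrapy item pipeline, so Scrapy will only call process_item once the class is registered in the project settings; a minimal sketch, assuming the project package is named searchengine:

    # settings.py -- package and module names are hypothetical
    ITEM_PIPELINES = {
        'searchengine.pipelines.DuplicatesPipeline': 300,
    }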


Note: The pybloomfilter.BloomFilter class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their authors; copyright remains with the original authors, and distribution and use must follow the corresponding projects' licenses. Please do not republish without permission.