当前位置: 首页>>代码示例>>Python>>正文


Python ScalableBloomFilter.add方法代码示例

本文整理汇总了Python中pybloom.ScalableBloomFilter.add方法的典型用法代码示例。如果您正苦于以下问题：Python ScalableBloomFilter.add方法的具体用法？Python ScalableBloomFilter.add怎么用？Python ScalableBloomFilter.add使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloom.ScalableBloomFilter的用法示例。


在下文中一共展示了ScalableBloomFilter.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: vacuum_all

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
	def vacuum_all(self, limit=None):
		"""Delete item-version rows whose supplier catalog id is unknown.

		Every ``SupplierCatalogModel.id`` is loaded into a Bloom filter, then
		each plugin's version model is scanned and rows whose
		``supplier_catalog_id`` is definitely absent from the filter are
		deleted.  A Bloom filter can report false positives but never false
		negatives, so a row that should be deleted may occasionally be kept,
		but a row with a known catalog id is never deleted.

		:param limit: when truthy, only this many rows per version model are
			examined, ordered by ``vacuumed`` with NULLs first.
		:raises: re-raises any exception after rolling the session back.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()

		self.session.begin(subtransactions=True)
		try:
			ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
			try:
				s = ScalableBloomFilter()
				query = self.session.query(SupplierCatalogModel.id)
				for (supplier_catalog_id, ) in query.yield_per(100):
					s.add(supplier_catalog_id)

				for plug in self.plugins.itervalues():
					# NOTE(review): the result is unused; the call is kept in case
					# the plugin method has side effects -- confirm and remove.
					supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
					model_name = plug.version_model()  + 'Model'
					VersionModel = getattr(model, model_name)
					query = self.session.query(VersionModel)
					if limit:
						query = query.order_by(VersionModel.vacuumed.nullsfirst())
						query = query.limit(limit)

					ts['sub_done'] = 0
					ts['sub_total'] = query.count()
					for supplier_catalog_item_version in query.yield_per(10):
						if supplier_catalog_item_version.supplier_catalog_id not in s:
							logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
							self.session.delete(supplier_catalog_item_version)
						ts['sub_done'] += 1
					ts['done'] += 1
				self.session.commit()
			finally:
				# Always close the progress display, even when the scan fails.
				ts.finish()
		except BaseException:
			# Do not leave the subtransaction begun above dangling on failure.
			self.session.rollback()
			raise
		logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:37,代码来源:supplier_catalog_item_version_task.py

示例2: URLFilter

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class URLFilter(object):
    """Decide whether a URL should be processed.

    A URL passes when it has not been seen before, contains none of the
    forbidden keywords and is pure ASCII.  Seen URLs are remembered in a
    scalable Bloom filter, so membership answers may be false positives.
    """

    # Class-level lock: shared by all instances, guards the check-then-add
    # sequence on ``seen`` in ``pass_check``.
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return False if ``url`` contains a forbidden keyword, else True."""
        if any(key_word in url for key_word in self.forbidden_keys):
            log.debug('## FORBIDDEN: {}'.format(url))
            return False
        return True

    @staticmethod
    def is_english(url):
        """Return True if ``url`` decodes as ASCII, else False.

        ``UnicodeError`` is caught rather than only ``UnicodeDecodeError``:
        on Python 2, calling ``.decode('ascii')`` on a ``unicode`` object
        containing non-ASCII characters raises ``UnicodeEncodeError``
        (the implicit encode step fails first), which previously escaped.
        """
        try:
            url.decode('ascii')
        except UnicodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        return True

    def pass_check(self, url):
        """Return True if ``url`` is new, not forbidden and ASCII-only.

        The URL is recorded as seen before the keyword/ASCII checks run, so
        a rejected URL is reported as already seen on later calls.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
开发者ID:heroxdream,项目名称:information-retrieval,代码行数:35,代码来源:URLFilter.py

示例3: WishPipeline

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class WishPipeline(object):
    """Item pipeline that drops items whose URL is missing or already seen."""

    def __init__(self):
        # URLs that have already passed through the pipeline.
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        """Return ``item`` if its URL is new; otherwise raise ``DropItem``.

        A ``None`` item or a ``None`` URL is dropped with the same message
        as a duplicate.
        """
        url = None if item is None else item['url']
        if url is None or url in self.urls:
            raise DropItem("Duplicate item found.")
        self.urls.add(url)
        return item
开发者ID:yangxue088,项目名称:wish,代码行数:12,代码来源:pipelines.py

示例4: to_bloomfilter

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter containing every element of ``iterable``.

    :rtype : pybloom.ScalableBloomFilter
    :param iterable: elements to insert; consumed exactly once
    :param init_cap: first positional argument handed to ScalableBloomFilter
    :param err_rate: second positional argument handed to ScalableBloomFilter
    """
    sbf = ScalableBloomFilter(init_cap, err_rate)
    insert = sbf.add
    for item in iterable:
        insert(item)
    return sbf
开发者ID:Faiz7412,项目名称:itpy,代码行数:17,代码来源:sketch.py

示例5: RequestFilter

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class RequestFilter(object):

    """Remembers request fingerprints in a scalable Bloom filter."""

    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """Return True if the request's fingerprint was already recorded.

        An unseen fingerprint is recorded as a side effect, so a second call
        with the same request returns True.
        """
        fingerprint = request_fingerprint(request)
        already_seen = fingerprint in self.sbf
        if not already_seen:
            self.sbf.add(fingerprint)
        return already_seen
开发者ID:kaito-kidd,项目名称:mini-scrapy,代码行数:18,代码来源:scheduler.py

示例6: count_distinct_approx

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate the number of distinct elements in ``iterable``.

    Each element is checked against a ScalableBloomFilter; an element the
    filter does not yet report as present is inserted and counted.

    :param iterable: elements to count; consumed exactly once
    :param init_cap: first positional argument handed to ScalableBloomFilter
    :param err_rate: second positional argument handed to ScalableBloomFilter
    """
    seen = ScalableBloomFilter(init_cap, err_rate)
    distinct = 0
    for item in iterable:
        if item in seen:
            continue
        seen.add(item)
        distinct += 1
    return distinct
开发者ID:Faiz7412,项目名称:itpy,代码行数:22,代码来源:sketch.py

示例7: main

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def main(args):
    """Post-process fetched-URL records read from stdin, one JSON object per line.

    Each record must carry ``"url"`` and ``"effective_url"`` keys.  A record
    is skipped when either URL has been seen before; otherwise both URLs are
    remembered, its links are extracted and filtered, and a ``[postproc]``
    line is printed.

    :param args: unused.
    """
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        # Lines read from stdin keep their trailing newline, so a blank line
        # is "\n" and is truthy; strip first so blank lines are really skipped
        # instead of crashing json.loads.
        ln = ln.strip()
        if not ln:
            continue
        fetchedUrl = json.loads(ln)

        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue

        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])

        # extract links and filter out some urls by url filter.
        # NOTE(review): the result is not used yet; the analysis step below
        # is still a placeholder.
        outlinks = url_filter(extract_links(fetchedUrl))

        # analyze

        print("[postproc]%s" % fetchedUrl["url"])
开发者ID:etman,项目名称:xPyCrawler,代码行数:23,代码来源:postproc.py

示例8: BloomSet

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class BloomSet(object):
    """Set-like wrapper around a salted ScalableBloomFilter.

    Supports ``add`` and ``in``; membership tests may return false
    positives.
    """

    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object.  Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)
        self._set = ScalableBloomFilter(
            initial_capacity=initial_capacity,
            error_rate=error_rate,
            mode=ScalableBloomFilter.LARGE_SET_GROWTH,
        )

    def _bloom_key(self, name):
        """Return the salted key stored for ``name`` (unicode is UTF-8 encoded)."""
        encoded = name.encode('utf-8') if isinstance(name, unicode) else name
        return self._bloom_salt + encoded

    def add(self, name):
        """Insert ``name`` into the set."""
        key = self._bloom_key(name)
        self._set.add(key)

    def __contains__(self, name):
        # May return false positives.
        key = self._bloom_key(name)
        return key in self._set
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:23,代码来源:util.py

示例9: vacuum_all

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
	def vacuum_all(self, limit=None):
		"""Delete item-version rows whose supplier special id is unknown.

		Every ``SupplierSpecialModel.id`` is loaded into a Bloom filter, then
		each plugin's version model is scanned and rows whose
		``supplier_special_id`` is definitely absent from the filter are
		deleted.  A Bloom filter can report false positives but never false
		negatives, so a row that should be deleted may occasionally be kept,
		but a row with a known special id is never deleted.

		Exceptions raised during the scan are logged and the transaction is
		aborted; they are not re-raised.  The transaction is committed only
		when the scan finished without error.

		:param limit: when truthy, only this many rows per version model are
			examined, ordered by ``vacuumed`` with NULLs first.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()
		ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
		tx = transaction.get()

		try:
			s = ScalableBloomFilter()
			query = DBSession.query(SupplierSpecialModel.id)
			for (supplier_special_id, ) in query.yield_per(100):
				s.add(supplier_special_id)

			for plug in self.plugins.itervalues():
				# NOTE(review): the result is unused; the call is kept in case
				# the plugin method has side effects -- confirm and remove.
				supplier_special_filter_id = plug.supplier_special_filter_id()
				model_name = plug.version_model()  + 'Model'
				VersionModel = getattr(model, model_name)
				query = DBSession.query(VersionModel)
				if limit:
					query = query.order_by(VersionModel.vacuumed.nullsfirst())
					query = query.limit(limit)

				ts['sub_done'] = 0
				ts['sub_total'] = query.count()
				for supplier_special_item_version in query.yield_per(10):
					if supplier_special_item_version.supplier_special_id not in s:
						logger.debug("Deleting %s %s", model_name, supplier_special_item_version.id)
						DBSession.delete(supplier_special_item_version)
					ts['sub_done'] += 1
					# Flush pending deletes in batches of 1000 scanned rows.
					if ts['sub_done'] % 1000 == 0:
						DBSession.flush()
				DBSession.flush()
				ts['done'] += 1
		except Exception:
			logger.exception('Caught Exception: ')
			tx.abort()
		else:
			# Commit only on success; previously commit() also ran right
			# after tx.abort() on the failure path.
			transaction.commit()
		finally:
			ts.finish()
		logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:42,代码来源:supplier_special_item_version_task.py

示例10: test_bloom_string

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
    def test_bloom_string(self):
        """Random 40-letter strings are found; single letters are not."""
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for _ in xrange(10000):
            rnd = ''.join(random.choice(string.letters) for _ in xrange(40))
            bloom.add(rnd)

        # Only the last generated string is still bound to ``rnd``.
        self.assertTrue(rnd in bloom)

        # Single letters were never inserted.
        for letter in string.letters:
            self.assertFalse(letter in bloom)

        self.assertTrue(rnd in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:15,代码来源:test_pybloom.py

示例11: __init__

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class kmer_store:
	"""Counts items that occur more than once.

	The first sighting of an item is only recorded in a Bloom filter; from
	the second sighting on, the item gets an exact count in ``kmers``
	(starting at 2).  Iteration, indexing and printing all go through the
	``kmers`` dict.
	"""

	def __init__(self):
		self.kmers = {}
		self.bloom_filter = ScalableBloomFilter(initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

	def update(self, item):
		"""Register one more occurrence of ``item``."""
		if item not in self.bloom_filter:
			# First sighting: remember it, but do not count it yet.
			self.bloom_filter.add(item)
			return
		# Seen before: an item missing from ``kmers`` is on its second sighting.
		self.kmers[item] = self.kmers.get(item, 1) + 1

	def __iter__(self):
		for kmer in self.kmers:
			yield kmer

	def __getitem__(self, key):
		counts = self.kmers
		return counts[key]

	def __repr__(self):
		counts = self.kmers
		return str(counts)

	def __str__(self):
		counts = self.kmers
		return str(counts)
开发者ID:robertf224,项目名称:bio_final,代码行数:25,代码来源:utils.py

示例12: StockTileExclusions

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.

    Keys are ``(tile_id, stock_tile_match_id)`` tuples held in a scalable
    Bloom filter that is seeded from the source image's existing matches.
    """
    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            # An image without tiles would give a capacity of 0, which
            # pybloom rejects when the first key is added; use at least 1.
            initial_capacity=max(1, source_image.tiles.count()),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        """Return True if ``key`` is excluded.

        A key is excluded when the Bloom filter reports it, or when any tile
        of the source image already uses the stock tile ``key[1]``; in the
        latter case the key is also cached in the filter.
        """
        if key in self.bloom_filter:
            return True
        if self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        """Record ``key`` as used."""
        self.bloom_filter.add(key)
开发者ID:pipermerriam,项目名称:mozy,代码行数:26,代码来源:exclusions.py

示例13: test_bloom_int

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
    def test_bloom_int(self):
        """Inserted integers are found; integers from 10000 upward are not."""
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for value in xrange(10000):
            bloom.add(value)

        # Every inserted value must be reported as present.
        for value in xrange(10000):
            self.assertTrue(value in bloom)

        # Random probes inside the inserted range.
        for _ in xrange(10000 // 2):
            probe = random.randint(0, 9999)
            self.assertTrue(probe in bloom)

        # Random probes outside the inserted range.
        for _ in xrange(10000 // 2):
            probe = random.randint(10000, 20000)
            self.assertFalse(probe in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:18,代码来源:test_pybloom.py

示例14: addNewUrl

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def addNewUrl():
	"""Insert newly discovered URLs into the ``webpage`` table.

	Candidate URLs come from the ``outlinks`` column of rows with status 2
	(comma separated, passed through ``filterOutLinks``) and from the
	``error`` column of rows with status 11 (passed through ``filterLink``).
	Candidates that a Bloom filter of all existing URLs does not report as
	present are inserted with status 0.  Afterwards every status 2 / 11 row
	is set to status 3.

	The cursor and connection are closed on every exit path, including the
	early return and exceptions.

	:return: dict with ``exist`` (rowcount of the existing-URL query),
		``insert`` (number of URLs inserted) and ``all`` (number of source
		rows); all zero when there are no source rows.
	"""
	from pybloom import ScalableBloomFilter

	conn = database.getConn()
	try:
		cursor = conn.cursor()
		try:
			# check if empty
			cursor.execute('SELECT outlinks FROM webpage WHERE status = 2')
			num_outlinks = cursor.rowcount
			rows_outlinks = cursor.fetchall()
			cursor.execute("SELECT error FROM webpage WHERE status = 11")
			num_redirect = cursor.rowcount
			rows_redirect = cursor.fetchall()

			num_all = num_redirect + num_outlinks
			if num_all == 0:
				return {'exist':0 , 'insert':0 , 'all':0}

			# Load every known url into the Bloom filter.
			cursor.execute("SELECT url FROM webpage WHERE 1")
			num_exist = cursor.rowcount
			sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
			for row in cursor.fetchall():
				sbf.add(row[0])

			insert_arr = []

			for row in rows_outlinks:
				for link in filterOutLinks(row[0].split(',')):
					if link not in sbf:
						# Adding to the filter also de-duplicates within this run.
						sbf.add(link)
						insert_arr.append((link, 0))

			# for the redirect url (rows were already fetched above)
			for row in rows_redirect:
				link = filterLink(row[0])
				if link == '':
					continue
				if link not in sbf:
					sbf.add(link)
					insert_arr.append((link, 0))

			if insert_arr:
				sql = "INSERT INTO webpage (url,status)VALUE(%s,%s)"
				cursor.executemany(sql, insert_arr)

			cursor.execute("UPDATE webpage SET status = 3 WHERE status = 2 OR status = 11")
			# NOTE(review): no commit is issued here; presumably the
			# connection runs in autocommit mode -- confirm.

			return {'exist':num_exist , 'insert':len(insert_arr) , 'all':num_all}
		finally:
			cursor.close()
	finally:
		conn.close()
开发者ID:rokiyer,项目名称:Wanderer,代码行数:74,代码来源:generate.py

示例15: __init__

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class BloomAutoYara:
  def __init__(self,filterfile):
    """Load the Bloom filter from ``filterfile`` if it exists, else start empty.

    :param filterfile: path of the file the filter is loaded from and saved to.
    """
    self.filterfile = filterfile
    # if filterfile is present load bloom filter from that file, else create new one
    if os.path.exists(filterfile):
      # Close the handle once the filter has been read instead of leaking it.
      with open(filterfile,"rb") as fp:
        self.bf = ScalableBloomFilter.fromfile(fp)
      print("available signatures = %d"%len(self.bf))
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

  def save_filter(self):
    """Write the Bloom filter to ``self.filterfile``, overwriting the file."""
    print("saving filter to file %s "%self.filterfile)
    # Close the file explicitly so the data is flushed to disk when this
    # method returns rather than whenever the handle is garbage-collected.
    with open(self.filterfile,"wb") as fp:
      self.bf.tofile(fp)

  def add_string(self,str):
    self.bf.add(str)

  def search_string(self,str):
    if str in self.bf:
      return True
    else:
      return False

  def extractlines(self,filename,min_len=4):
    chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
    shortest_run = 4
    regexp = '[%s]{%d,}' % (chars, shortest_run)
    pattern = re.compile(regexp)
    fp = open(filename,"rb")
    data = fp.read()
    lines = pattern.findall(data)
    s = set(lines)
    fp.close()
    return list(s)
   
  def build_filter(self,dirname,extensions=[]):
    print extensions
    total = 0
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        ext = f.split(".")[-1]
        
        if len(extensions) != 0 and ext not in extensions:
          continue
          
        print "processing file %s"%f
        total += 1
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          self.add_string(line)
  
    print "creating bloom filter done. Total files = %d (Total entries = %d). Overwriting to bloom filter output file %s"%(total,len(self.bf),self.filterfile)
    self.save_filter()
    
  def find_file_topn(self,filename,topn=10):
    tmp = []
    lines = self.extractlines(filename)
    print "total unique strings in file %s = %d"%(filename,len(lines))
    for line in lines:
      if self.search_string(line) == False:
        tmp.append(line)
    tmp.sort(key=len)
    print "total strings which can be used for signature = %d"%len(tmp)
    tmp = tmp[-topn:]
    tmp.reverse()
    return tmp
    
  def find_dir_topn(self,dirname,topn=10):
    tmplist = []
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          if self.search_string(line) == False:
            tmplist.append(line) 
    
    counts = Counter(list(tmplist))
    return counts.most_common(topn)

  def escapechars(self,str):
    """Return ``str`` with each regex-special character prefixed by a backslash."""
    specials = "\\/.^$*+-?()[]{}|"
    return "".join("\\" + ch if ch in specials else ch for ch in str)
    
  def list_to_rule(self,list,rulename,threshold=0.5):
    tmp = "rule " + rulename + "{\n"
    tmp += " strings:\n"
    
    for i in xrange(0,len(list)):
      esc = self.escapechars(list[i])
      tmp = tmp + "$str%d = "%i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
      tmp += "\n"
    
    tmp += "condition:\n"
    tmp += str(int(len(list)*threshold))
    tmp += " of ("
    for i in xrange(0,len(list)):
      tmp += "$str"+ str(i)
#.........这里部分代码省略.........
开发者ID:FireAVR,项目名称:BloomAutoYara,代码行数:103,代码来源:BloomAutoYara.py


注:本文中的pybloom.ScalableBloomFilter.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。