當前位置: 首頁>>代碼示例>>Python>>正文


Python pybloom.ScalableBloomFilter類代碼示例

本文整理匯總了Python中pybloom.ScalableBloomFilter的典型用法代碼示例。如果您正苦於以下問題:Python ScalableBloomFilter類的具體用法?Python ScalableBloomFilter怎麼用?Python ScalableBloomFilter使用的例子?那麼, 這裡精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了ScalableBloomFilter類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: vacuum_all

	def vacuum_all(self, limit=None):
		"""Delete version rows whose supplier catalog id is not known.

		Every ``SupplierCatalogModel.id`` is first loaded into a
		``ScalableBloomFilter``.  Then, for each loaded plugin, the rows of
		the plugin's version model are scanned and any row whose
		``supplier_catalog_id`` is not in the filter is deleted.  All
		deletions happen inside one (sub)transaction that is committed at
		the end.

		:param limit: when truthy, each plugin's scan is ordered by
			``vacuumed`` (NULLs first) and capped at this many rows.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()

		self.session.begin(subtransactions=True)

		progress = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))

		# A Bloom filter keeps memory bounded; membership answers may be
		# false positives, so an orphaned row can occasionally survive.
		known_catalog_ids = ScalableBloomFilter()
		id_query = self.session.query(SupplierCatalogModel.id)
		for id_row in id_query.yield_per(100):
			known_catalog_ids.add(id_row[0])

		for plugin in self.plugins.itervalues():
			# Result is not used below; the call is kept as in the original.
			supplier_catalog_filter_id = plugin.supplier_catalog_filter_id()
			model_name = plugin.version_model()  + 'Model'
			VersionModel = getattr(model, model_name)
			version_query = self.session.query(VersionModel)
			if limit:
				version_query = version_query.order_by(VersionModel.vacuumed.nullsfirst()).limit(limit)

			progress['sub_done'] = 0
			progress['sub_total'] = version_query.count()
			for version in version_query.yield_per(10):
				if version.supplier_catalog_id not in known_catalog_ids:
					logger.debug("Deleting %s %s", model_name, version.id)
					self.session.delete(version)
				progress['sub_done'] += 1
			progress['done'] += 1

		self.session.commit()
		progress.finish()
		logger.debug('End vacuum_all()')
開發者ID:jdsteele,項目名稱:bakedpytato,代碼行數:35,代碼來源:supplier_catalog_item_version_task.py

示例2: URLFilter

class URLFilter(object):
    """Decide whether a crawled URL should be processed.

    A URL passes when it has not been seen before, contains none of the
    forbidden key words, and consists only of ASCII characters.
    """

    # Class-level lock: guards the check-then-add on the seen filter in
    # ``pass_check``.
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return ``True`` when ``url`` contains NO forbidden key word.

        Note the polarity: ``False`` means a forbidden key word was found.
        """
        for key_word in self.forbidden_keys:
            if key_word in url:
                # Lazy %-args: eager ``str.format`` can raise on non-ASCII
                # unicode URLs under Python 2.
                log.debug('## FORBIDDEN: %s', url)
                return False
        return True

    @staticmethod
    def is_english(url):
        """Return ``True`` when ``url`` contains only ASCII characters.

        Accepts both byte strings and text strings.  The original called
        ``url.decode('ascii')`` unconditionally and caught only
        ``UnicodeDecodeError``; a Python 2 ``unicode`` URL with non-ASCII
        characters raises ``UnicodeEncodeError`` instead (implicit encode
        before decode), which escaped the handler.
        """
        try:
            if isinstance(url, bytes):
                url.decode('ascii')
            else:
                url.encode('ascii')
        except UnicodeError:
            # UnicodeError is the common base of the encode/decode errors.
            log.debug('## NON-ENGLISH PAGE DETECTED: %s', url)
            return False
        else:
            return True

    def pass_check(self, url):
        """Return ``True`` when ``url`` is new, allowed and ASCII-only.

        The URL is recorded as seen before the other checks run, so a
        rejected URL is reported as already seen on later calls.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: %s', url)
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
開發者ID:heroxdream,項目名稱:information-retrieval,代碼行數:33,代碼來源:URLFilter.py

示例3: FilterHandler

class FilterHandler(object):
  """URL de-duplicator backed by a bloom filter persisted in ``data/bloom.data``.

  The filter is loaded (or created) on construction and written back to
  disk by ``close()``.
  """

  def __init__(self, logger):
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    """Record ``url`` and return ``True`` if it was already recorded.

    Relies on the filter's ``add`` returning a truthy value when the key
    was already present.
    """
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    """Load the filter from the cache file, or create a fresh one."""
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, create one instead.')
      # Positional args presumably map to (initial_capacity, error_rate,
      # mode) -- confirm against the installed pybloom version.
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      # The serialized filter is binary data; text mode can corrupt it
      # (newline translation) or fail outright on Python 3.
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    """Write the filter to the cache file, creating ``data/`` if needed."""
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    # Binary mode for the same reason as in _load_from_file.
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    """Persist the filter to disk."""
    self._dump_to_file()
開發者ID:cfhb,項目名稱:crawl_youtube,代碼行數:34,代碼來源:url_filter_service.py

示例4: _load_from_file

 def _load_from_file(self):
   """Load the bloom filter from ``data/bloom.data``, or create a new one.

   Sets ``self.deduper_`` in both cases.
   """
   self.logger_.info('loading data from cache file...')
   if not os.path.isfile('data/bloom.data'):
     self.logger_.error('bloom cache file not found, create one instead.')
     # Positional args presumably map to (initial_capacity, error_rate,
     # mode) -- confirm against the installed pybloom version.
     self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
   else:
     # The serialized filter is binary data; text mode can corrupt it
     # (newline translation) or fail outright on Python 3.
     with open('data/bloom.data', 'rb') as f:
       self.deduper_ = ScalableBloomFilter.fromfile(f)
開發者ID:cfhb,項目名稱:crawl_youtube,代碼行數:8,代碼來源:url_filter_service.py

示例5: __init__

  def __init__(self,filterfile):
    """Load the bloom filter stored at ``filterfile``, or start an empty one.

    :param filterfile: path of the serialized filter; remembered on
        ``self.filterfile``.
    """
    self.filterfile = filterfile
    # If filterfile is present load the bloom filter from it, else create a new one.
    if os.path.exists(filterfile):
      # ``with`` closes the handle; the original left it open.
      with open(filterfile, "rb") as filter_fh:
        self.bf = ScalableBloomFilter.fromfile(filter_fh)
      # Parenthesised single-argument form prints the same text under
      # Python 2 and is also valid Python 3.
      print("available signatures = %d" % len(self.bf))
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
開發者ID:FireAVR,項目名稱:BloomAutoYara,代碼行數:8,代碼來源:BloomAutoYara.py

示例6: WishPipeline

class WishPipeline(object):
    """Item pipeline that drops items whose ``url`` was already processed."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.LARGE_SET_GROWTH
        self.urls = ScalableBloomFilter(mode=growth_mode)

    def process_item(self, item, spider):
        """Return ``item`` the first time its URL is seen.

        Raises ``DropItem`` when the item is ``None``, its ``url`` is
        ``None``, or the URL is already in the filter.
        """
        if item is not None:
            url = item['url']
            if url is not None and url not in self.urls:
                self.urls.add(url)
                return item
        raise DropItem("Duplicate item found.")
開發者ID:yangxue088,項目名稱:wish,代碼行數:10,代碼來源:pipelines.py

示例7: test_bloom_string

    def test_bloom_string(self):
        """Insert 10000 random 40-letter strings and probe membership.

        Checks that the last inserted string is reported as present and
        that no single letter is reported as present.
        """
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for _ in xrange(0, 10000):
            last_key = ''.join(random.choice(string.letters) for _ in xrange(40))
            bloom.add(last_key)

        self.assertTrue(last_key in bloom)

        # Single letters were never inserted (all keys are 40 chars long).
        for letter in string.letters:
            self.assertFalse(letter in bloom)

        # Probing absent keys must not disturb the inserted ones.
        self.assertTrue(last_key in bloom)
開發者ID:DavisHevin,項目名稱:sqli_benchmark,代碼行數:13,代碼來源:test_pybloom.py

示例8: to_bloomfilter

def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter containing every element of ``iterable``.

    :rtype : pybloom.ScalableBloomFilter
    :param iterable: elements to insert; consumed once, in order
    :param init_cap: passed as the filter's first positional argument
    :param err_rate: passed as the filter's second positional argument
    """
    result = ScalableBloomFilter(init_cap, err_rate)
    insert = result.add
    for item in iterable:
        insert(item)
    return result
開發者ID:Faiz7412,項目名稱:itpy,代碼行數:15,代碼來源:sketch.py

示例9: test_bloom_int

    def test_bloom_int(self):
        """Insert the ints 0..9999 and verify membership behaviour.

        Inserted values must always be reported as present (a Bloom filter
        has no false negatives).  Values that were never inserted may
        occasionally be reported as present, so their false-positive count
        is bounded rather than required to be zero; the original asserted
        zero over 5000 probes, which is flaky for a probabilistic filter.
        """
        total = 10000
        # Floor division keeps the loop bound an int on every Python version.
        probes = total // 2
        f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for i in xrange(0, total):
            f.add(i)

        for i in xrange(0, total):
            self.assertEqual(i in f, True)

        for _ in xrange(0, probes):
            r = random.randint(0, total - 1)
            self.assertEqual(r in f, True)

        # Probe values in [total, 2 * total], none of which were inserted.
        false_positives = 0
        for _ in xrange(0, probes):
            r = random.randint(total, total * 2)
            if r in f:
                false_positives += 1
        # Allow up to 1% false positives -- ten times pybloom's default
        # error rate of 0.001 (presumed default; confirm for the installed
        # version).
        self.assertLessEqual(false_positives, probes // 100)
開發者ID:DavisHevin,項目名稱:sqli_benchmark,代碼行數:16,代碼來源:test_pybloom.py

示例10: RequestFilter

class RequestFilter(object):

    """Remembers request fingerprints in a scalable Bloom filter."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
        self.sbf = ScalableBloomFilter(mode=growth_mode)

    def request_seen(self, request):
        """Return True if the request's fingerprint was already recorded.

        A fingerprint not yet in the filter is added before returning False.
        """
        fingerprint = request_fingerprint(request)
        already_seen = fingerprint in self.sbf
        if not already_seen:
            self.sbf.add(fingerprint)
        return already_seen
開發者ID:kaito-kidd,項目名稱:mini-scrapy,代碼行數:16,代碼來源:scheduler.py

示例11: get_category_conversion

	def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
		"""Return the CategoryConversionModel for the given key, creating it if absent.

		On first use, a Bloom filter is filled with every existing
		(supplier_id, manufacturer_id, needle) triple.  The database is only
		queried for the row when the filter reports the key as possibly
		present; otherwise, or when that query finds nothing, a new row is
		added to the session and its key recorded in the filter.
		"""
		if self.category_conversion_filter is None:
			self.category_conversion_filter = ScalableBloomFilter()
			key_query = self.session.query(
				CategoryConversionModel.supplier_id,
				CategoryConversionModel.manufacturer_id,
				CategoryConversionModel.needle
			)
			for known_key in key_query.yield_per(100):
				self.category_conversion_filter.add(known_key)

		# NOTE(review): the lookup key is a plain tuple while the preloaded
		# keys are query result rows -- presumably they hash alike; verify.
		key = (supplier_id, manufacturer_id, category_identifier)
		if key in self.category_conversion_filter:
			lookup = (
				self.session.query(CategoryConversionModel)
				.filter(CategoryConversionModel.supplier_id == supplier_id)
				.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
				.filter(CategoryConversionModel.needle == category_identifier)
			)
			try:
				return lookup.one()
			except NoResultFound:
				# Filter hit without a matching row: fall through and create one.
				pass

		conversion = CategoryConversionModel()
		conversion.manufacturer_id = manufacturer_id
		conversion.supplier_id = supplier_id
		conversion.needle = category_identifier
		self.session.add(conversion)
		self.category_conversion_filter.add(key)
		return conversion
開發者ID:jdsteele,項目名稱:bakedpytato,代碼行數:31,代碼來源:supplier_catalog_item_task.py

示例12: __init__

 def __init__(self, initial_capacity=1000, error_rate=0.0001):
     """Create the backing Bloom filter and a random two-byte salt.

     :param initial_capacity: forwarded to ``ScalableBloomFilter``
     :param error_rate: forwarded to ``ScalableBloomFilter``
     """
     growth_mode = ScalableBloomFilter.LARGE_SET_GROWTH
     self._set = ScalableBloomFilter(
         initial_capacity=initial_capacity,
         error_rate=error_rate,
         mode=growth_mode,
     )
     # A false positive in the Bloom filter makes us skip
     # garbage-collecting an object.  A per-run random salt ensures the
     # set of false positives differs from one run to the next.
     self._bloom_salt = os.urandom(2)
開發者ID:cmusatyalab,項目名稱:deltaic,代碼行數:8,代碼來源:util.py

示例13: __init__

 def __init__(self, source_image):
     """Seed a Bloom filter with the image's (tile pk, stock_tile_match) pairs.

     :param source_image: object whose ``tiles`` provides ``count()`` and
         ``values_list()``; kept on ``self.source_image``.
     """
     self.source_image = source_image
     tiles = source_image.tiles
     # Sized to the current tile count; error rate is 1 in 10,000.
     self.bloom_filter = ScalableBloomFilter(
         initial_capacity=tiles.count(),
         error_rate=0.0001,
     )
     for tile_pk, match_pk in tiles.values_list('pk', 'stock_tile_match'):
         pair = (tile_pk, match_pk)
         self.bloom_filter.add(pair)
開發者ID:pipermerriam,項目名稱:mozy,代碼行數:9,代碼來源:exclusions.py

示例14: __init__

 def __init__(self, spider):
     """Set up the frontier state for ``spider`` and prime the filter.

     Stores the spider, the default arguments, a redis connection, an empty
     scalable Bloom filter and the two names derived from ``spider.name``,
     then calls ``self._feedfilter()``.
     """
     super(BFSFrontier, self).__init__(spider)
     self._spider = spider
     self.args = dict(rules=[], order='bfs')
     self.redis = RediSugar.getConnection()
     growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
     self.filter = ScalableBloomFilter(mode=growth_mode)
     # Names derived from the spider's name.
     self.todo = spider.name + '-todo'
     self.visited = spider.name + '-visited'
     self._feedfilter()
開發者ID:ymero,項目名稱:PyCrawler,代碼行數:10,代碼來源:frontier.py

示例15: count_distinct_approx

def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate the number of distinct elements in ``iterable``.

    Each element is checked against a ScalableBloomFilter; elements the
    filter has not seen are added and counted. A false positive from the
    filter makes a new element look already seen, so the result can be
    lower than the true distinct count.

    :param iterable: elements to count; consumed once
    :param init_cap: passed as the filter's first positional argument
    :param err_rate: passed as the filter's second positional argument
    """
    seen = ScalableBloomFilter(init_cap, err_rate)
    distinct = 0

    for item in iterable:
        if item in seen:
            continue
        seen.add(item)
        distinct += 1

    return distinct
開發者ID:Faiz7412,項目名稱:itpy,代碼行數:20,代碼來源:sketch.py


注:本文中的pybloom.ScalableBloomFilter類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。