当前位置: 首页>>代码示例>>Python>>正文


Python pybloom.ScalableBloomFilter类代码示例

本文整理汇总了Python中pybloom.ScalableBloomFilter的典型用法代码示例。如果您正苦于以下问题：Python ScalableBloomFilter类的具体用法？Python ScalableBloomFilter怎么用？Python ScalableBloomFilter使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了ScalableBloomFilter类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: vacuum_all

	def vacuum_all(self, limit=None):
		"""Delete item-version rows whose supplier catalog id is unknown.

		Every SupplierCatalogModel.id is first collected into a
		ScalableBloomFilter.  Each plugin's version model is then scanned and
		any row whose supplier_catalog_id is not in that filter is deleted.
		The work runs between session.begin(subtransactions=True) and a
		single session.commit() at the end.

		:param limit: when truthy, at most this many rows are examined per
			version model, ordered by ``vacuumed`` with NULLs first.
		"""
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()

		self.session.begin(subtransactions=True)

		ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))

		# Bloom filters can report false positives (an id that was never
		# added looks present); in that case the row is simply kept.
		known_catalog_ids = ScalableBloomFilter()
		id_query = self.session.query(SupplierCatalogModel.id)
		for id_row in id_query.yield_per(100):
			(catalog_id, ) = id_row
			known_catalog_ids.add(catalog_id)

		for plugin in self.plugins.itervalues():
			# The result is not used below; the call itself is preserved.
			supplier_catalog_filter_id = plugin.supplier_catalog_filter_id()
			model_name = plugin.version_model() + 'Model'
			VersionModel = getattr(model, model_name)
			version_query = self.session.query(VersionModel)
			if limit:
				version_query = version_query.order_by(VersionModel.vacuumed.nullsfirst()).limit(limit)

			# Per-plugin progress counters.
			ts['sub_done'] = 0
			ts['sub_total'] = version_query.count()
			for version in version_query.yield_per(10):
				is_orphan = version.supplier_catalog_id not in known_catalog_ids
				if is_orphan:
					logger.debug("Deleting %s %s", model_name, version.id)
					self.session.delete(version)
				ts['sub_done'] += 1
			ts['done'] += 1

		self.session.commit()
		ts.finish()
		logger.debug('End vacuum_all()')
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:35,代码来源:supplier_catalog_item_version_task.py

示例2: URLFilter

class URLFilter(object):
    """Decide whether a crawled URL should be processed.

    A URL passes when it has not been seen before, contains none of the
    forbidden key words and consists solely of ASCII characters.
    """

    # Shared by all instances; guards the check-then-add on ``seen``.
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return False if ``url`` contains a forbidden key word, else True.

        Note the polarity: True means the URL is acceptable.  Matching is a
        plain substring test against each entry of ``forbidden_keys``.
        """
        for key_word in self.forbidden_keys:
            if key_word in url:
                # Lazy %-style logging: no formatting error for non-ASCII
                # unicode URLs, and no formatting cost when debug is off.
                log.debug('## FORBIDDEN: %s', url)
                return False
        return True

    @staticmethod
    def is_english(url):
        """Return True if ``url`` contains only ASCII characters.

        Accepts both byte strings and text strings.  Byte strings are
        decoded and text strings are encoded, so neither path depends on an
        implicit conversion.
        """
        try:
            if isinstance(url, bytes):
                url.decode('ascii')
            else:
                url.encode('ascii')
        except UnicodeError:
            # UnicodeError covers both UnicodeDecodeError and
            # UnicodeEncodeError.
            log.debug('## NON-ENGLISH PAGE DETECTED: %s', url)
            return False
        return True

    def pass_check(self, url):
        """Return True if ``url`` is new, not forbidden and ASCII-only.

        The URL is recorded as seen on its first check, even when it is
        then rejected by the key-word or ASCII test.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: %s', url)
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
开发者ID:heroxdream,项目名称:information-retrieval,代码行数:33,代码来源:URLFilter.py

示例3: FilterHandler

class FilterHandler(object):
  """Deduplicate URLs with a ScalableBloomFilter persisted in data/bloom.data.

  The filter is loaded from the cache file on construction (or created when
  the file does not exist) and written back to it by close().
  """

  def __init__(self, logger):
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    """Add url to the filter; return True if the filter reports a duplicate."""
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    """Load the filter from data/bloom.data, or create a fresh one."""
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, create one instead.')
      # Positional arguments: initial capacity, error rate, growth mode
      # (4 is presumably ScalableBloomFilter.LARGE_SET_GROWTH -- confirm).
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      # The serialized filter is binary data, so read it in binary mode;
      # text mode can corrupt it or fail outright.
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    """Write the filter to data/bloom.data, creating the directory if needed."""
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    # Binary mode for the same reason as in _load_from_file.
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    """Persist the current filter state to disk."""
    self._dump_to_file()
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:34,代码来源:url_filter_service.py

示例4: _load_from_file

 def _load_from_file(self):
   """Load the Bloom filter from data/bloom.data, or create a fresh one.

   The result is stored in self.deduper_.
   """
   self.logger_.info('loading data from cache file...')
   if not os.path.isfile('data/bloom.data'):
     self.logger_.error('bloom cache file not found, create one instead.')
     # Positional arguments: initial capacity, error rate, growth mode
     # (4 is presumably ScalableBloomFilter.LARGE_SET_GROWTH -- confirm).
     self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
   else:
     # The serialized filter is binary data, so read it in binary mode;
     # text mode can corrupt it or fail outright.
     with open('data/bloom.data', 'rb') as f:
       self.deduper_ = ScalableBloomFilter.fromfile(f)
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:8,代码来源:url_filter_service.py

示例5: __init__

  def __init__(self, filterfile):
    """Load the Bloom filter stored at filterfile, or start a new one.

    :param filterfile: path of the serialized filter; kept on the instance
      as self.filterfile.
    """
    self.filterfile = filterfile
    # If filterfile is present load the Bloom filter from that file,
    # else create a new one.
    if os.path.exists(filterfile):
      # Context manager so the handle is closed even if fromfile() raises.
      with open(filterfile, "rb") as f:
        self.bf = ScalableBloomFilter.fromfile(f)
      # Parenthesized single argument prints identically on Python 2 and 3.
      print("available signatures = %d" % len(self.bf))
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
开发者ID:FireAVR,项目名称:BloomAutoYara,代码行数:8,代码来源:BloomAutoYara.py

示例6: WishPipeline

class WishPipeline(object):
    """Item pipeline that drops items whose url was already processed."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.LARGE_SET_GROWTH
        self.urls = ScalableBloomFilter(mode=growth_mode)

    def process_item(self, item, spider):
        """Return item unchanged the first time its url is seen.

        Raises DropItem when item is None, when its 'url' is None, or when
        the url is already in the filter.
        """
        url = None if item is None else item['url']
        if url is None or url in self.urls:
            raise DropItem("Duplicate item found.")
        self.urls.add(url)
        return item
开发者ID:yangxue088,项目名称:wish,代码行数:10,代码来源:pipelines.py

示例7: test_bloom_string

    def test_bloom_string(self):
        """Add 10000 random 40-letter strings, then check membership.

        The last string added must be reported present, and no single
        letter may be reported present.
        """
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        alphabet = string.letters

        for _round in xrange(0, 10000):
            chars = [random.choice(alphabet) for _pos in xrange(40)]
            last_added = ''.join(chars)
            bloom.add(last_added)

        self.assertTrue(last_added in bloom)

        for letter in alphabet:
            self.assertFalse(letter in bloom)

        # Still present after the negative lookups.
        self.assertTrue(last_added in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:13,代码来源:test_pybloom.py

示例8: to_bloomfilter

def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter holding every element of the iterable.

    :param iterable: elements to insert; iterated once, in order
    :param init_cap: first positional argument passed to ScalableBloomFilter
    :param err_rate: second positional argument passed to ScalableBloomFilter
    :rtype: pybloom.ScalableBloomFilter
    """
    result = ScalableBloomFilter(init_cap, err_rate)
    insert = result.add
    for item in iterable:
        insert(item)
    return result
开发者ID:Faiz7412,项目名称:itpy,代码行数:15,代码来源:sketch.py

示例9: test_bloom_int

    def test_bloom_int(self):
        """Add the integers 0..9999, then check membership.

        Every added integer and random samples from that range must be
        reported present; random samples from 10000..20000 must not be.
        """
        total = 10000
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for value in xrange(0, total):
            bloom.add(value)

        for value in xrange(0, total):
            self.assertTrue(value in bloom)

        # Random samples from the inserted range.
        for _ in xrange(0, total / 2):
            sample = random.randint(0, total - 1)
            self.assertTrue(sample in bloom)

        # Random samples from outside the inserted range.
        for _ in xrange(0, total / 2):
            sample = random.randint(total, total * 2)
            self.assertFalse(sample in bloom)
开发者ID:DavisHevin,项目名称:sqli_benchmark,代码行数:16,代码来源:test_pybloom.py

示例10: RequestFilter

class RequestFilter(object):

    """Remember request fingerprints in a ScalableBloomFilter."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
        self.sbf = ScalableBloomFilter(mode=growth_mode)

    def request_seen(self, request):
        """Return True if the request's fingerprint is already in the filter.

        A fingerprint that is not yet present is added and False is returned.
        """
        fingerprint = request_fingerprint(request)
        already_seen = fingerprint in self.sbf
        if not already_seen:
            self.sbf.add(fingerprint)
        return already_seen
开发者ID:kaito-kidd,项目名称:mini-scrapy,代码行数:16,代码来源:scheduler.py

示例11: get_category_conversion

	def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
		"""Return the matching CategoryConversionModel, creating it if absent.

		On first use a ScalableBloomFilter is filled with the
		(supplier_id, manufacturer_id, needle) columns of every existing row.
		The database is queried only when the requested triple is in that
		filter.  Otherwise, or when the query finds no row, a new model is
		added to the session and its triple is added to the filter.
		"""
		if self.category_conversion_filter is None:
			# Lazily populate the filter from the existing rows.
			self.category_conversion_filter = ScalableBloomFilter()
			key_query = self.session.query(
				CategoryConversionModel.supplier_id,
				CategoryConversionModel.manufacturer_id,
				CategoryConversionModel.needle
			)
			for existing_key in key_query.yield_per(100):
				self.category_conversion_filter.add(existing_key)

		lookup_key = (supplier_id, manufacturer_id, category_identifier)
		if lookup_key in self.category_conversion_filter:
			match_query = (
				self.session.query(CategoryConversionModel)
				.filter(CategoryConversionModel.supplier_id == supplier_id)
				.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
				.filter(CategoryConversionModel.needle == category_identifier)
			)
			try:
				return match_query.one()
			except NoResultFound:
				# The filter matched but no row exists; fall through and
				# create one.
				pass

		created = CategoryConversionModel()
		created.manufacturer_id = manufacturer_id
		created.supplier_id = supplier_id
		created.needle = category_identifier
		self.session.add(created)
		self.category_conversion_filter.add(lookup_key)
		return created
开发者ID:jdsteele,项目名称:bakedpytato,代码行数:31,代码来源:supplier_catalog_item_task.py

示例12: __init__

 def __init__(self, initial_capacity=1000, error_rate=0.0001):
     """Create the backing Bloom filter and a random two-byte salt.

     :param initial_capacity: forwarded to ScalableBloomFilter
     :param error_rate: forwarded to ScalableBloomFilter
     """
     filter_options = {
         'initial_capacity': initial_capacity,
         'error_rate': error_rate,
         'mode': ScalableBloomFilter.LARGE_SET_GROWTH,
     }
     self._set = ScalableBloomFilter(**filter_options)
     # A false positive in the Bloom filter makes us fail to
     # garbage-collect an object.  Salting the filter gives a different
     # set of false positives on every run.
     salt_length = 2
     self._bloom_salt = os.urandom(salt_length)
开发者ID:cmusatyalab,项目名称:deltaic,代码行数:8,代码来源:util.py

示例13: __init__

 def __init__(self, source_image):
     """Record the existing (tile, stock tile match) pairs of source_image.

     :param source_image: object whose ``tiles`` manager provides
         ``count()`` and ``values_list()``.
     """
     self.source_image = source_image
     # An image with no tiles would give a capacity of 0, which pybloom
     # rejects as soon as an element is added; clamp to at least 1.
     tile_count = source_image.tiles.count()
     self.bloom_filter = ScalableBloomFilter(
         initial_capacity=max(tile_count, 1),
         error_rate=0.0001,  # 1 in 10,000
     )
     existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
     for tile_id, existing_match_id in existing_matches:
         self.bloom_filter.add((tile_id, existing_match_id))
开发者ID:pipermerriam,项目名称:mozy,代码行数:9,代码来源:exclusions.py

示例14: __init__

 def __init__(self, spider):
     """Initialise the frontier state for spider.

     Stores the spider, the default args, a Redis connection, an empty
     Bloom filter and the two names derived from ``spider.name``, then
     calls ``self._feedfilter()``.
     """
     super(BFSFrontier, self).__init__(spider)
     self._spider = spider
     self.args = dict(rules=[], order='bfs')
     self.redis = RediSugar.getConnection()
     growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
     self.filter = ScalableBloomFilter(mode=growth_mode)
     # Names are prefixed with the spider name.
     self.todo = spider.name + '-todo'
     self.visited = spider.name + '-visited'
     self._feedfilter()
开发者ID:ymero,项目名称:PyCrawler,代码行数:10,代码来源:frontier.py

示例15: count_distinct_approx

def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate the number of distinct elements in an iterable.

    Each element is looked up in a ScalableBloomFilter. An element the filter
    does not report as present is added and counted; any other element is
    skipped.

    :param iterable: elements to count; iterated once
    :param init_cap: first positional argument passed to ScalableBloomFilter
    :param err_rate: second positional argument passed to ScalableBloomFilter
    """
    seen = ScalableBloomFilter(init_cap, err_rate)
    distinct = 0

    for item in iterable:
        if item in seen:
            continue
        seen.add(item)
        distinct += 1

    return distinct
开发者ID:Faiz7412,项目名称:itpy,代码行数:20,代码来源:sketch.py


注：本文中的pybloom.ScalableBloomFilter类示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。