This article collects typical usage examples of the pybloom.ScalableBloomFilter class in Python. If you have been wondering what the ScalableBloomFilter class is for, how to use it, or what it looks like in real code, the curated class examples below may help.
The sections that follow present 15 code examples of the ScalableBloomFilter class, sorted by popularity by default.
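Before the individual examples, here is a minimal, self-contained sketch of the core pybloom API that all of them build on: construct a filter, add items, and test membership with the "in" operator. The capacity and error-rate values are illustrative, not required by the library.

from pybloom import ScalableBloomFilter

# A scalable filter starts small and adds internal filters as it fills up,
# so it does not need an exact capacity estimate up front.
sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)

for word in ('alpha', 'beta', 'gamma'):
    sbf.add(word)

print('alpha' in sbf)  # True
print('delta' in sbf)  # False, except for a small false-positive probability
print(len(sbf))        # number of items added so far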
Example 1: vacuum_all
def vacuum_all(self, limit=None):
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    self.session.begin(subtransactions=True)
    ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
    #s = set()
    s = ScalableBloomFilter()
    query = self.session.query(SupplierCatalogModel.id)
    for (supplier_catalog_id, ) in query.yield_per(100):
        s.add(supplier_catalog_id)
    for plug in self.plugins.itervalues():
        supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
        model_name = plug.version_model() + 'Model'
        VersionModel = getattr(model, model_name)
        query = self.session.query(VersionModel)
        if limit:
            query = query.order_by(VersionModel.vacuumed.nullsfirst())
            query = query.limit(limit)
        ts['sub_done'] = 0
        ts['sub_total'] = query.count()
        for supplier_catalog_item_version in query.yield_per(10):
            if supplier_catalog_item_version.supplier_catalog_id not in s:
                logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
                self.session.delete(supplier_catalog_item_version)
            ts['sub_done'] += 1
            ts['done'] += 1
    self.session.commit()
    ts.finish()
    logger.debug('End vacuum_all()')
Example 2: URLFilter
class URLFilter(object):
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        try:
            url.decode('ascii')
        except UnicodeDecodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        else:
            return True

    def pass_check(self, url):
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
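A brief usage sketch of this filter (the URLs are illustrative, and the ascii-decoding check assumes Python 2 byte strings):

uf = URLFilter()
print(uf.pass_check('http://example.com/article'))  # True: new URL, no forbidden keyword
print(uf.pass_check('http://example.com/article'))  # False: already recorded in the Bloom filter
print(uf.pass_check('http://example.com/video/1'))  # False: contains the forbidden keyword 'video'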
Example 3: FilterHandler
class FilterHandler(object):
    def __init__(self, logger):
        self.logger_ = logger
        self._load_from_file()

    def url_seen(self, url):
        # ScalableBloomFilter.add() returns True if the key was already present.
        if self.deduper_.add(url):
            self.logger_.info('url duplicated: %s', url)
            return True
        return False

    def _load_from_file(self):
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile('data/bloom.data'):
            self.logger_.error('bloom cache file not found, create one instead.')
            self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
        else:
            # The serialized filter is binary data, so read it in binary mode.
            with open('data/bloom.data', 'rb') as f:
                self.deduper_ = ScalableBloomFilter.fromfile(f)

    def _dump_to_file(self):
        self.logger_.info('dumping data...')
        if not os.path.isdir('data'):
            os.mkdir('data')
        with open('data/bloom.data', 'wb') as f:
            self.deduper_.tofile(f)
        self.logger_.info('dump data finished.')

    def close(self):
        self._dump_to_file()
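Example 3 persists its filter with ScalableBloomFilter.tofile and restores it with fromfile. A stripped-down sketch of that round trip (the file name and URL are only illustrations) looks like this:

from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100000, error_rate=0.0001)
sbf.add('http://example.com/')

# Serialize to disk; the data is raw bytes, so use binary mode.
with open('bloom.data', 'wb') as f:
    sbf.tofile(f)

# Restore on the next run.
with open('bloom.data', 'rb') as f:
    restored = ScalableBloomFilter.fromfile(f)

print('http://example.com/' in restored)  # True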
Example 4: _load_from_file
def _load_from_file(self):
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
        self.logger_.error('bloom cache file not found, create one instead.')
        self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
        # Binary mode: fromfile() expects the raw serialized filter bytes.
        with open('data/bloom.data', 'rb') as f:
            self.deduper_ = ScalableBloomFilter.fromfile(f)
Example 5: __init__
def __init__(self, filterfile):
    self.filterfile = filterfile
    # if filterfile is present load bloom filter from that file, else create new one
    if os.path.exists(filterfile):
        self.bf = ScalableBloomFilter.fromfile(open(filterfile, "rb"))
        print "available signatures = %d" % len(self.bf)
    else:
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
Example 6: WishPipeline
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
Example 7: test_bloom_string
def test_bloom_string(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        rnd = ''.join(random.choice(string.letters) for i in xrange(40))
        _ = f.add(rnd)
        self.assertEqual(rnd in f, True)

    for i in string.letters:
        self.assertEqual(i in f, False)

    self.assertEqual(rnd in f, True)
Example 8: to_bloomfilter
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter

    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """
    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)
    return bloom
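Used on its own, the helper above behaves like a set built in a single pass. A quick illustration (values are arbitrary):

pages = ['a.html', 'b.html', 'a.html']
bloom = to_bloomfilter(pages)
print('a.html' in bloom)  # True
print('c.html' in bloom)  # False, up to the configured error rate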
Example 9: test_bloom_int
def test_bloom_int(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        _ = f.add(i)

    for i in xrange(0, 10000):
        self.assertEqual(i in f, True)

    for i in xrange(0, 10000 / 2):
        r = random.randint(0, 10000 - 1)
        self.assertEqual(r in f, True)

    for i in xrange(0, 10000 / 2):
        r = random.randint(10000, 10000 * 2)
        self.assertEqual(r in f, False)
Example 10: RequestFilter
class RequestFilter(object):
    """ RequestFilter """

    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen
        """
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
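The request_fingerprint helper used here presumably comes from scrapy.utils.request, which reduces a request to a stable hash so equivalent requests map to the same key. A rough sketch of how such a filter might be exercised (the Request construction is only an illustration):

from scrapy import Request

rf = RequestFilter()
req = Request('http://example.com/page')

print(rf.request_seen(req))  # False: first time this fingerprint is seen
print(rf.request_seen(req))  # True: fingerprint is already in the filter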
Example 11: get_category_conversion
def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
    """Category Conversion"""
    if self.category_conversion_filter is None:
        self.category_conversion_filter = ScalableBloomFilter()
        query = self.session.query(
            CategoryConversionModel.supplier_id,
            CategoryConversionModel.manufacturer_id,
            CategoryConversionModel.needle
        )
        for row in query.yield_per(100):
            self.category_conversion_filter.add(row)

    row = (supplier_id, manufacturer_id, category_identifier)

    if row in self.category_conversion_filter:
        query = self.session.query(CategoryConversionModel)
        query = query.filter(CategoryConversionModel.supplier_id == supplier_id)
        query = query.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
        query = query.filter(CategoryConversionModel.needle == category_identifier)
        try:
            category_conversion = query.one()
            return category_conversion
        except NoResultFound:
            pass

    category_conversion = CategoryConversionModel()
    category_conversion.manufacturer_id = manufacturer_id
    category_conversion.supplier_id = supplier_id
    category_conversion.needle = category_identifier
    self.session.add(category_conversion)
    self.category_conversion_filter.add(row)
    return category_conversion
Example 12: __init__
def __init__(self, initial_capacity=1000, error_rate=0.0001):
    self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                                    error_rate=error_rate,
                                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    # False positives in the Bloom filter will cause us to fail to
    # garbage-collect an object. Salt the Bloom filter to ensure
    # that we get a different set of false positives on every run.
    self._bloom_salt = os.urandom(2)
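The snippet above only stores the salt; the rest of the class is not shown. A purely hypothetical pair of methods illustrates how such a salt could be folded into every key so that the set of false positives differs from run to run (the method names and key encoding are assumptions, not part of the original example):

def _add(self, obj):
    # Prefix each key with the per-run salt before the filter hashes it.
    self._set.add(self._bloom_salt + str(obj).encode('utf-8'))

def _contains(self, obj):
    return (self._bloom_salt + str(obj).encode('utf-8')) in self._set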
Example 13: __init__
def __init__(self, source_image):
    self.source_image = source_image
    self.bloom_filter = ScalableBloomFilter(
        initial_capacity=source_image.tiles.count(),
        error_rate=0.0001,  # 1 in 10,000
    )
    existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
    for tile_id, existing_match_id in existing_matches:
        self.bloom_filter.add((tile_id, existing_match_id))
Example 14: __init__
def __init__(self, spider):
    super(BFSFrontier, self).__init__(spider)
    self._spider = spider
    self.args = {'rules': [],
                 'order': 'bfs'}
    self.redis = RediSugar.getConnection()
    self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.todo = spider.name + '-todo'
    self.visited = spider.name + '-visited'
    self._feedfilter()
Example 15: count_distinct_approx
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements from an iterable. This implementation uses a bloomfilter
    to approximate the number of distinct values found in this iterable.

    :param iterable:
    :param init_cap:
    :param err_rate:
    """
    counter = 0
    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1
    return counter
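Because a false positive makes a genuinely new element look like a repeat, this approximation can only undercount, never overcount. A quick illustration with arbitrary values:

values = ['a', 'b', 'a', 'c', 'b', 'a']
print(count_distinct_approx(values))  # 3, up to the filter's false-positive rate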