本文整理汇总了Python中pybloom.ScalableBloomFilter.add方法的典型用法代码示例。如果您正苦于以下问题:Python ScalableBloomFilter.add方法的具体用法?Python ScalableBloomFilter.add怎么用?Python ScalableBloomFilter.add使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloom.ScalableBloomFilter的用法示例。
在下文中一共展示了ScalableBloomFilter.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: vacuum_all
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def vacuum_all(self, limit=None):
    """Delete version rows whose parent supplier catalog no longer exists.

    Every ``SupplierCatalogModel.id`` is loaded into a Bloom filter. Each
    plugin's version model is then scanned, and any row whose
    ``supplier_catalog_id`` is not reported by the filter is deleted.
    A Bloom filter may report false positives, so an orphaned row can
    occasionally survive a run.

    :param limit: optional maximum number of rows examined per plugin; when
        set, rows are ordered by ``vacuumed`` with NULLs first.
    :raises: any exception raised while scanning or deleting is re-raised
        after the transaction has been rolled back.
    """
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    self.session.begin(subtransactions=True)
    try:
        ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
        # A Bloom filter is used instead of a plain set to bound memory use.
        s = ScalableBloomFilter()
        query = self.session.query(SupplierCatalogModel.id)
        for (supplier_catalog_id, ) in query.yield_per(100):
            s.add(supplier_catalog_id)
        for plug in self.plugins.itervalues():
            # NOTE(review): the result is unused; the call is kept in case it
            # has side effects -- confirm against the plugin implementation.
            plug.supplier_catalog_filter_id()
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)
            query = self.session.query(VersionModel)
            if limit:
                query = query.order_by(VersionModel.vacuumed.nullsfirst())
                query = query.limit(limit)
            ts['sub_done'] = 0
            ts['sub_total'] = query.count()
            for supplier_catalog_item_version in query.yield_per(10):
                if supplier_catalog_item_version.supplier_catalog_id not in s:
                    logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
                    self.session.delete(supplier_catalog_item_version)
                ts['sub_done'] += 1
            ts['done'] += 1
        self.session.commit()
    except BaseException:
        # Never leave the (sub)transaction open, even on KeyboardInterrupt.
        self.session.rollback()
        raise
    ts.finish()
    logger.debug('End vacuum_all()')
示例2: URLFilter
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class URLFilter(object):
    """Decide whether a URL should be processed.

    A URL passes when it has not been seen before, contains none of the
    forbidden keywords and is pure ASCII.
    """

    # Class-level lock shared by all instances; guards the check-then-add
    # sequence on the Bloom filter in ``pass_check``.
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        """Return ``False`` if *url* contains a forbidden keyword, else ``True``."""
        for key_word in self.forbidden_keys:
            if key_word in url:
                # Lazy %-formatting: the logger renders the URL itself, so a
                # non-ASCII unicode URL cannot make str.format() raise here.
                log.debug('## FORBIDDEN: %s', url)
                return False
        return True

    @staticmethod
    def is_english(url):
        """Return ``True`` when *url* can be decoded as ASCII, else ``False``."""
        try:
            url.decode('ascii')
        except UnicodeError:
            # UnicodeError covers both UnicodeDecodeError (byte strings) and
            # UnicodeEncodeError, which Python 2 raises when ``decode`` is
            # called on a unicode object holding non-ASCII characters.
            log.debug('## NON-ENGLISH PAGE DETECTED: %s', url)
            return False
        return True

    def pass_check(self, url):
        """Record *url* as seen and return whether it should be processed.

        Returns ``False`` for a URL the filter reports as already seen;
        otherwise the URL is added and the keyword and ASCII checks decide.
        """
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: %s', url)
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
示例3: WishPipeline
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class WishPipeline(object):
    """Item pipeline that drops items whose URL is missing or already seen."""

    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        """Return *item* unchanged after remembering its URL.

        Raises ``DropItem`` when *item* is ``None``, its ``'url'`` is ``None``,
        or the URL is reported by the Bloom filter as already seen.
        """
        unusable = item is None or item['url'] is None
        if unusable or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        self.urls.add(item['url'])
        return item
示例4: to_bloomfilter
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Build a ScalableBloomFilter containing every element of ``iterable``.

    :rtype : pybloom.ScalableBloomFilter
    :param iterable: elements to insert
    :param init_cap: passed as the filter's first positional argument
    :param err_rate: passed as the filter's second positional argument
    """
    sbf = ScalableBloomFilter(init_cap, err_rate)
    insert = sbf.add
    for item in iterable:
        insert(item)
    return sbf
示例5: RequestFilter
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class RequestFilter(object):
    """Remember request fingerprints so repeated requests can be detected."""

    def __init__(self):
        growth_mode = ScalableBloomFilter.SMALL_SET_GROWTH
        self.sbf = ScalableBloomFilter(mode=growth_mode)

    def request_seen(self, request):
        """Return True if the request's fingerprint is already in the filter.

        An unseen fingerprint is added to the filter and False is returned.
        """
        fingerprint = request_fingerprint(request)
        already_seen = fingerprint in self.sbf
        if not already_seen:
            self.sbf.add(fingerprint)
        return already_seen
示例6: count_distinct_approx
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Approximate the number of distinct elements in ``iterable``.

    Each element not yet reported by the Bloom filter is added and counted.
    A false positive from the filter makes a new element look already seen,
    so the result can undercount.

    :param iterable: elements to count
    :param init_cap: passed as the filter's first positional argument
    :param err_rate: passed as the filter's second positional argument
    """
    seen = ScalableBloomFilter(init_cap, err_rate)
    distinct = 0
    for value in iterable:
        if value in seen:
            continue
        seen.add(value)
        distinct += 1
    return distinct
示例7: main
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def main(args):
    """Post-process fetched pages read as one JSON document per stdin line.

    A record is skipped when its ``url`` or ``effective_url`` has already
    been seen; otherwise both are remembered, the record's links are
    extracted and filtered, and a ``[postproc]`` line is printed.

    :param args: unused by this function.
    """
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        # Lines read from stdin keep their trailing newline, so a blank line
        # is "\n" (truthy); strip before testing or json.loads would raise.
        if not ln.strip():
            continue
        fetchedUrl = json.loads(ln)
        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue
        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])
        # extract links and filter out some urls by url filter.
        # NOTE(review): the result is not consumed in the visible code.
        outlinks = url_filter(extract_links(fetchedUrl))
        # analyze
        # Parenthesised single-argument form prints identically on Python 2
        # and is also valid on Python 3.
        print("[postproc]%s" % fetchedUrl["url"])
示例8: BloomSet
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class BloomSet(object):
    """Set-like wrapper around a salted ScalableBloomFilter.

    Supports ``add`` and ``in``; membership tests may return false positives.
    """

    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object. Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)
        self._set = ScalableBloomFilter(
            initial_capacity=initial_capacity,
            error_rate=error_rate,
            mode=ScalableBloomFilter.LARGE_SET_GROWTH,
        )

    def add(self, name):
        """Insert the salted key for *name* into the filter."""
        salted = self._bloom_key(name)
        self._set.add(salted)

    def __contains__(self, name):
        """Return whether the salted key is reported present (may be a false positive)."""
        salted = self._bloom_key(name)
        return salted in self._set

    def _bloom_key(self, name):
        """Return the salt followed by *name*, UTF-8 encoding unicode input first."""
        raw = name.encode('utf-8') if isinstance(name, unicode) else name
        return self._bloom_salt + raw
示例9: vacuum_all
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def vacuum_all(self, limit=None):
    """Delete version rows whose parent supplier special no longer exists.

    All ``SupplierSpecialModel.id`` values are loaded into a Bloom filter,
    then each plugin's version model is scanned and rows whose
    ``supplier_special_id`` is not reported by the filter are deleted.
    The session is flushed every 1000 examined rows and after each plugin.

    Exceptions are logged and the transaction aborted; they are not
    re-raised. ``transaction.commit()`` is always called at the end.

    :param limit: optional maximum number of rows examined per plugin; when
        set, rows are ordered by ``vacuumed`` with NULLs first.
    """
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
    tx = transaction.get()
    try:
        known_ids = ScalableBloomFilter()
        id_query = DBSession.query(SupplierSpecialModel.id)
        for (special_id, ) in id_query.yield_per(100):
            known_ids.add(special_id)

        for plug in self.plugins.itervalues():
            # Result unused in the visible code; the call itself is kept.
            plug.supplier_special_filter_id()
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)
            version_query = DBSession.query(VersionModel)
            if limit:
                version_query = version_query.order_by(VersionModel.vacuumed.nullsfirst())
                version_query = version_query.limit(limit)
            ts['sub_done'] = 0
            ts['sub_total'] = version_query.count()
            for version in version_query.yield_per(10):
                if version.supplier_special_id not in known_ids:
                    logger.debug("Deleting %s %s", model_name, version.id)
                    DBSession.delete(version)
                ts['sub_done'] += 1
                # Flush periodically so pending deletes do not pile up.
                if ts['sub_done'] % 1000 == 0:
                    DBSession.flush()
            DBSession.flush()
            ts['done'] += 1
    except Exception:
        logger.exception('Caught Exception: ')
        tx.abort()
    finally:
        ts.finish()
        transaction.commit()
    logger.debug('End vacuum_all()')
示例10: test_bloom_string
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def test_bloom_string(self):
    """Insert 10000 random 40-letter strings and check membership results.

    The last inserted string must be reported present; single letters,
    which were never inserted, are expected to be reported absent.
    """
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    alphabet = string.letters
    for _ in xrange(0, 10000):
        rnd = ''.join(random.choice(alphabet) for _ in xrange(40))
        f.add(rnd)
    self.assertEqual(rnd in f, True)
    for letter in alphabet:
        self.assertEqual(letter in f, False)
    self.assertEqual(rnd in f, True)
示例11: __init__
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class kmer_store:
    """Count items seen more than once, using a Bloom filter for first sightings.

    The first time an item is offered it is only recorded in the Bloom
    filter. From the second sighting onward it is counted in ``kmers``,
    starting at 2.
    """

    def __init__(self):
        self.kmers = {}
        self.bloom_filter = ScalableBloomFilter(initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def update(self, item):
        """Record one more occurrence of *item*."""
        if item not in self.bloom_filter:
            # First sighting: remember it in the filter only.
            self.bloom_filter.add(item)
            return
        # Seen before: a missing entry means this is the second occurrence.
        self.kmers[item] = self.kmers.get(item, 1) + 1

    def __iter__(self):
        """Iterate over the items that have an explicit count."""
        return iter(self.kmers)

    def __getitem__(self, key):
        """Return the count stored for *key*; raises KeyError if absent."""
        return self.kmers[key]

    def __repr__(self):
        return str(self.kmers)

    def __str__(self):
        return str(self.kmers)
示例12: StockTileExclusions
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.

    Keys are ``(tile_id, stock_tile_match_id)`` tuples. Membership is
    answered from a Bloom filter first and falls back to a database lookup
    on the second element of the key.
    """

    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            # pybloom rejects a capacity of 0 when the first element is
            # added, so never size the filter below 1 for an empty image.
            initial_capacity=max(source_image.tiles.count(), 1),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        """Return True if *key* is in the filter or its stock tile is already matched.

        When only the database lookup succeeds, *key* is added to the filter
        so that the next check is answered without a query.
        """
        if key in self.bloom_filter:
            return True
        if self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        """Record *key* in the Bloom filter."""
        self.bloom_filter.add(key)
示例13: test_bloom_int
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def test_bloom_int(self):
    """Insert the integers 0..9999 and check membership results.

    Every inserted integer must be reported present, as must random picks
    from that range; random picks from 10000..20000 are expected absent.
    """
    total = 10000
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for value in xrange(0, total):
        f.add(value)
    for value in xrange(0, total):
        self.assertEqual(value in f, True)
    for _ in xrange(0, total // 2):
        inserted = random.randint(0, total - 1)
        self.assertEqual(inserted in f, True)
    for _ in xrange(0, total // 2):
        never_inserted = random.randint(total, total * 2)
        self.assertEqual(never_inserted in f, False)
示例14: addNewUrl
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
def addNewUrl():
    """Queue newly discovered URLs into the ``webpage`` table.

    Candidate links come from the ``outlinks`` column of rows with status 2
    (comma-separated) and the ``error`` column of rows with status 11. Links
    not reported by a Bloom filter of the existing ``url`` values are
    inserted with status 0, and the source rows are then set to status 3.

    :returns: dict with ``exist`` (row count of the existing-URL query),
        ``insert`` (number of URLs queued) and ``all`` (number of source
        rows examined). All three are 0 when there are no source rows.
    """
    conn = database.getConn()
    cursor = conn.cursor()
    try:
        # check if empty
        cursor.execute('SELECT outlinks FROM webpage WHERE status = 2')
        num_outlinks = cursor.rowcount
        rows_outlinks = cursor.fetchall()
        cursor.execute("SELECT error FROM webpage WHERE status = 11")
        num_redirect = cursor.rowcount
        rows_redirect = cursor.fetchall()
        num_all = num_redirect + num_outlinks
        if num_all == 0:
            return {'exist': 0, 'insert': 0, 'all': 0}

        # Load every known url into a Bloom filter for cheap duplicate checks.
        from pybloom import ScalableBloomFilter
        cursor.execute("SELECT url FROM webpage WHERE 1")
        num_exist = cursor.rowcount
        sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        for row in cursor.fetchall():
            sbf.add(row[0])

        insert_arr = []
        for row in rows_outlinks:
            for link in filterOutLinks(row[0].split(',')):
                if link not in sbf:
                    sbf.add(link)
                    insert_arr.append((link, 0))

        # for the redirect url (rows were already fetched above)
        for row in rows_redirect:
            link = filterLink(row[0])
            if link == '':
                continue
            if link not in sbf:
                sbf.add(link)
                insert_arr.append((link, 0))

        if insert_arr:
            cursor.executemany("INSERT INTO webpage (url,status)VALUE(%s,%s)", insert_arr)
        # NOTE(review): no explicit conn.commit() here -- presumably the
        # connection autocommits; confirm against database.getConn().
        cursor.execute("UPDATE webpage SET status = 3 WHERE status = 2 OR status = 11")
        return {'exist': num_exist, 'insert': len(insert_arr), 'all': num_all}
    finally:
        # Close exactly once, on every exit path, and only after the last use.
        cursor.close()
        conn.close()
示例15: __init__
# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import add [as 别名]
class BloomAutoYara:
def __init__(self,filterfile):
    """Load the Bloom filter stored in *filterfile*, or start a new one.

    :param filterfile: path of the serialized filter; also the path that
        ``save_filter`` writes to.
    """
    self.filterfile = filterfile
    # if filterfile is present load bloom filter from that file, else create new one
    if os.path.exists(filterfile):
        # Context manager so the handle is closed even if loading fails.
        with open(filterfile, "rb") as fh:
            self.bf = ScalableBloomFilter.fromfile(fh)
        print("available signatures = %d" % len(self.bf))
    else:
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
def save_filter(self):
    """Serialize the Bloom filter to ``self.filterfile``, overwriting it."""
    print("saving filter to file %s " % self.filterfile)
    # Close the file explicitly so the data is flushed to disk rather than
    # relying on garbage collection of the anonymous file object.
    with open(self.filterfile, "wb") as fh:
        self.bf.tofile(fh)
def add_string(self,str):
    """Insert *str* into the Bloom filter."""
    bloom = self.bf
    bloom.add(str)
def search_string(self,str):
    """Return True if the Bloom filter reports *str* as present, else False."""
    return str in self.bf
def extractlines(self,filename,min_len=4):
    """Return the unique printable runs found in a file.

    The file is read in binary mode and scanned for runs of the characters
    ``A-Za-z0-9/-:.,_$%'()[]<>`` and space.

    :param filename: path of the file to scan.
    :param min_len: minimum run length to report (default 4).
    :returns: list of distinct matches, in no particular order.
    """
    # Byte-string pattern: the file is read as bytes. On Python 2 this is
    # the same type as a plain str literal.
    chars = br"A-Za-z0-9/\-:.,_$%'()[\]<> "
    pattern = re.compile(b'[%s]{%d,}' % (chars, min_len))
    with open(filename, "rb") as fp:
        data = fp.read()
    return list(set(pattern.findall(data)))
def build_filter(self,dirname,extensions=None):
    """Add the strings of every matching file under *dirname* and save the filter.

    :param dirname: directory walked recursively.
    :param extensions: file extensions (text after the last ``.``) to
        include; an empty or omitted value means every file is processed.
    """
    # None replaces the mutable [] default; normalise so output is unchanged.
    extensions = [] if extensions is None else extensions
    print(extensions)
    total = 0
    for (root, _, files) in os.walk(dirname):
        for f in files:
            ext = f.split(".")[-1]
            if extensions and ext not in extensions:
                continue
            print("processing file %s" % f)
            total += 1
            path = os.path.join(root, f)
            for line in self.extractlines(path):
                self.add_string(line)
    print("creating bloom filter done. Total files = %d (Total entries = %d). Overwriting to bloom filter output file %s" % (total, len(self.bf), self.filterfile))
    self.save_filter()
def find_file_topn(self,filename,topn=10):
    """Return the *topn* longest strings of a file that are not in the filter.

    :param filename: file whose strings are extracted with ``extractlines``.
    :param topn: maximum number of strings to return; 0 or less yields ``[]``.
    :returns: list of strings, longest first.
    """
    lines = self.extractlines(filename)
    print("total unique strings in file %s = %d" % (filename, len(lines)))
    candidates = [line for line in lines if not self.search_string(line)]
    candidates.sort(key=len)
    print("total strings which can be used for signature = %d" % len(candidates))
    if topn <= 0:
        # candidates[-0:] would be the whole list, not an empty one.
        return []
    longest = candidates[-topn:]
    longest.reverse()
    return longest
def find_dir_topn(self,dirname,topn=10):
    """Return the *topn* most common strings under *dirname* not in the filter.

    Every file below *dirname* is scanned with ``extractlines``. Strings
    the filter does not report are tallied across files.

    :returns: list of ``(string, count)`` pairs from ``Counter.most_common``.
    """
    counts = Counter()
    for (root, _, names) in os.walk(dirname):
        for name in names:
            full_path = os.path.join(root, name)
            counts.update(
                line for line in self.extractlines(full_path)
                if not self.search_string(line)
            )
    return counts.most_common(topn)
def escapechars(self,str):
    """Return *str* with each of ``\\ / . ^ $ * + - ? ( ) [ ] { } |`` backslash-escaped."""
    # Backslash written explicitly as "\\"; the former "\/" was an invalid
    # escape sequence that only happened to yield the same two characters.
    specials = "\\/.^$*+-?()[]{}|"
    # Single pass: each input character is examined once, so backslashes
    # added for escaping are never escaped again.
    return "".join("\\" + ch if ch in specials else ch for ch in str)
def list_to_rule(self,list,rulename,threshold=0.5):
tmp = "rule " + rulename + "{\n"
tmp += " strings:\n"
for i in xrange(0,len(list)):
esc = self.escapechars(list[i])
tmp = tmp + "$str%d = "%i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
tmp += "\n"
tmp += "condition:\n"
tmp += str(int(len(list)*threshold))
tmp += " of ("
for i in xrange(0,len(list)):
tmp += "$str"+ str(i)
#.........这里部分代码省略.........