This article collects typical usage examples of the pybloom.BloomFilter.tofile method in Python. If you are wondering what BloomFilter.tofile does, how to call it, or want to see it used in context, the curated code examples below may help. You can also read further about the class it belongs to, pybloom.BloomFilter.
The following shows 15 code examples of BloomFilter.tofile, sorted by popularity by default. You can vote up the examples you find useful to help the site recommend better Python code examples.
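Before the examples, here is a minimal sketch of the round trip they all rely on: tofile writes the filter's parameters and bit array to a binary file object, and the classmethod BloomFilter.fromfile reads them back. The file name filter.bloom below is just a placeholder for this sketch.

from pybloom import BloomFilter

bf = BloomFilter(capacity=1000, error_rate=0.001)
bf.add("example.com")

# tofile expects a writable binary file object
with open("filter.bloom", "wb") as f:
    bf.tofile(f)

# fromfile is a classmethod and returns a new BloomFilter instance
with open("filter.bloom", "rb") as f:
    restored = BloomFilter.fromfile(f)

assert "example.com" in restored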
Example 1: _build_filter
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def _build_filter():
    bf = BloomFilter(capacity=10000, error_rate=0.001)
    worst = [w[:-2] for w in open(_WORST_DUMP).readlines()]
    map(bf.add, worst)
    with open(_BLOOM_DUMP, 'w') as f:
        bf.tofile(f)
    print "Serialized bloom filter to ", _BLOOM_DUMP
Example 2: main
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def main(argv):
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                sha1_hash = line.split(",")[0].strip('"')
                if sha1_hash:
                    try:
                        sha1 = binascii.unhexlify(sha1_hash)
                        bf.add(sha1)
                    except Exception as e:
                        print "[ERROR] %s" % e
            print "[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
        return
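Once nsrl.bloom has been written by the example above, it can be reloaded for lookups. A minimal companion sketch, assuming the filter was built from unhexlified SHA-1 digests as in this example (the digest below is only a placeholder):

import binascii
from pybloom import BloomFilter

with open('nsrl.bloom', 'rb') as f:
    nsrl_bf = BloomFilter.fromfile(f)

# the filter stores raw digest bytes, so unhexlify before the membership test
query = binascii.unhexlify("da39a3ee5e6b4b0d3255bfef95601890afd80709")  # placeholder digest
print query in nsrl_bf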
Example 3: main
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def main():
    if os.path.isfile(nsrl_path):
        print "BUILDING: Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "BUILDING: Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "BUILDING: There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "BUILDING: Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "BUILDING: Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        bf.add(md5_hash)
                    except Exception as e:
                        print "ERROR: %s" % e
            print "BUILDING: NSRL bloomfilter contains {} items.".format(len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "BUILDING: Complete"
    else:
        print "ERROR: No such file or directory: %s" % nsrl_path
        return
Example 4: BloomZip
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
class BloomZip(object):

    def __init__(self, name):
        super(BloomZip, self).__init__()
        self.__data = StringIO()
        self._name = name
        self._bf = None
        if os.path.isfile(self._name):
            with open(self._name, 'rb') as f:
                length = struct.unpack(">L", f.read(4))[0]
                self._bf = BloomFilter.fromfile(f, length)

    def contains(self, word):
        return word in self._bf

    def write(self, data):
        self.__data.write(data)

    def close(self):
        if self._bf is None and self.__data is None:
            return

        words = self.__data.getvalue().split()
        self._bf = BloomFilter(capacity=len(words) + 1)
        for word in words:
            self._bf.add(word, skip_check=True)

        def get_bl_size():
            t = tempfile.NamedTemporaryFile().name
            with open(t, 'w') as fn:
                self._bf.tofile(fn)
            s = os.path.getsize(t)
            os.remove(t)
            return s

        if os.path.isfile(self._name):
            return

        a = open(self._name, 'w')
        a.write(struct.pack(">L", get_bl_size()))
        self._bf.tofile(a)
        with GzipFile(self._name, 'w', fileobj=a) as f:
            f.write(self.__data.getvalue())
        a.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        if exc_type is not None:
            print(exc_tb)
            raise exc_val
Example 5: to_bloom
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def to_bloom(filename):
    with open(filename, 'r') as f:
        b = BloomFilter(capacity=1000000, error_rate=0.001)
        for line in f:
            if line != "":
                b.add(line)
    new_filename = filename + ".bloom"
    out_f = open(new_filename, 'wb')
    b.tofile(out_f)
Example 6: record
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def record(url):
    """
    First-time download of tieba images:
    create a bloom filter for the next download run.
    """
    numlist = getallnumlist(url)
    bloomfilter = BloomFilter(1000000)
    for number in numlist:
        bloomfilter.add(number)
    with open('./%s/check/bloomfilter' % (url[28:]), 'ab+') as b:
        bloomfilter.tofile(b)
    # print 'pool'
    multiprocessdownload(numlist)
Example 7: compile
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def compile():
    boys = BloomFilter(capacity=703)
    girls = BloomFilter(capacity=1003)
    with open('sample_data/names.csv', 'r') as f:
        reader = csv.reader(f)
        reader.next()
        for row in reader:
            if float(row[2]) < .0005:
                continue
            if row[3].lower() == 'boy':
                boys.add(row[1].lower())
            elif row[3].lower() == 'girl':
                girls.add(row[1].lower())
    with open('blooms/boys', 'w') as f:
        boys.tofile(f)
    with open('blooms/girls', 'w') as f:
        girls.tofile(f)
    print len(boys), len(girls)
Example 8: compile
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def compile():
    uni_names = BloomFilter(capacity=719)
    name_strings = []
    with open('sample_data/uni_names.out', 'r') as f:
        for line in f:
            m = re.search(r'\| "(.*)"$', line.strip())
            if m:
                name = m.group(1).strip().lower()
                name_strings.append(name)
                uni_names.add(name)
    ngpol_filt = NGPOLFilter(4, name_strings)
    for name in name_strings:
        if name not in ngpol_filt:
            print name
    print ngpol_filt.min_rating
    print ngpol_filt.deviation
    ngpol_filt.clean()
    with open('blooms/uni_names', 'w') as f:
        uni_names.tofile(f)
    with open('ngpols/uni_names', 'w') as f:
        ngpol_filt.tofile(f)
    print len(uni_names)
Example 9: create_filter
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def create_filter(datafile, force=False):
    assert os.path.isfile(datafile)
    datadir, datafilename = os.path.split(datafile)
    filter_file = os.path.join(datadir, datafilename + ".filter")
    if force or not os.path.isfile(filter_file):
        bf = BloomFilter(capacity=1e6)
        with open(datafile) as df:
            line = next(df)
            try:
                while True:
                    if is_parent_line(line):
                        word, skips, _ = parse_parent_line(line)
                        bf.add(word)
                        for i in xrange(1, skips):
                            next(df)
                    line = next(df)
            except StopIteration:
                with open(filter_file, 'w') as ff:
                    bf.tofile(ff)
                del bf
                print("%s done." % filter_file)
Example 10: __init__
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
class UrlFilterHandler:

    # save to file every NUMS-th call
    NUMS = 2000

    def __init__(self):
        try:
            with open(FILTER_FILE) as f:
                self.f = BloomFilter.fromfile(f)
        except IOError:
            self.f = BloomFilter(capacity=10000000, error_rate=0.001)
        self.num = 0

    def is_dup(self, url):
        self.num += 1
        if self.num > self.NUMS:
            self.save()
            self.num = 0
        return self.f.add(url)

    def save(self):
        with open(FILTER_FILE, 'w') as f:
            self.f.tofile(f)
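A short usage sketch for the handler above, assuming FILTER_FILE is a module-level constant defined next to the class (the URLs are placeholders):

# assumption: FILTER_FILE is defined in the same module as UrlFilterHandler
handler = UrlFilterHandler()
for url in ['http://example.com/a', 'http://example.com/a']:
    if handler.is_dup(url):
        print "skip duplicate:", url
handler.save()  # persist explicitly on shutdown, in addition to the periodic save every NUMS calls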
Example 11: compile
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def compile():
    names = []
    bloom = BloomFilter(capacity=2494)
    with open('sample_data/actor_names.json', 'r') as f:
        j = json.load(f)
        for b in j['results']['bindings']:
            name = b['name']['value'].upper()
            name = ''.join(re.findall('[A-Z0-9]+', name))
            names.append(name)
            bloom.add(name)
    names = list(set(names))
    filt = NGPOLFilter(3, names, false_neg_rate=.01)
    filt.update_bounds()
    filt.clean()

    # create rating histogram
    ratings = [filt.ngpol.rate(n) for n in names]
    plot.hist(ratings, bins=50)
    plot.xlabel("NGram Rating")
    plot.ylabel("Names")
    plot.savefig("imgs/actor_names.png")
    plot.show()

    with open('blooms/actor_names', 'wb') as f:
        bloom.tofile(f)
    with open('ngpols/actor_names', 'wb') as f:
        filt.tofile(f)

    # create prob sets
    probset = NgramProbSet(3, names)
    probset2 = LengthProbSet(names)
    with open('probsets/actor_names', 'wb') as f:
        probset.tofile(f)
    with open('probsets/actor_names_len', 'wb') as f:
        probset2.tofile(f)
    print "compiled."
Example 12: build_one_case
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
def build_one_case(case_file):
    bf = BloomFilter(capacity=MAX_CAPACITY, error_rate=0.01)
    case_file_path = init.dir_case_files + case_file + format_case_file
    for line in open(case_file_path, "r").readlines():
        ls = line.strip().split(r'!#!')
        if len(ls) < 5:
            print "case_file: ", case_file_path, " dump line: ", line
            continue  # skip malformed lines
        func_name = ls[1]
        tid = ls[2]
        status = ls[3]
        time_stamp = ls[4]
        hash_code = ls[0]
        insert_func_stack = "insert into " + init.case_table_name + " values(%s,%s,%s,%s,%s)"
        param_func_stack = (case_file, func_name, tid, status, time_stamp)
        cursor.execute(insert_func_stack, param_func_stack)
        bf.add(hash_code)
    print case_file, " bf_set size: ", len(bf)
    bf_filename = get_bf_filename(case_file)
    bf.tofile(open(bf_filename, "wb"))
    sql = "insert into CASE_VERSION VALUES(%s,%s,%s,%s)"
    case_file_path = init.dir_case_files + case_file + format_case_file
    param = (case_file, case_file_path, " ", bf_filename)
    cursor.execute(sql, param)
    return bf_filename
Example 13: UidQueue
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
class UidQueue():
    """
    Uid queue, including the queue itself and a bloom filter
    """

    def __init__(self, max_count=200000, error_rate=0.001):
        """
        Initialize
        @param max_count: capacity of bloom filter
        @param error_rate: error_rate of bloom filter
        @return: None
        """
        self.queue = Queue()
        self.bloom = BloomFilter(capacity=max_count, error_rate=error_rate)
        self.crawled = 0

    @staticmethod
    def _remove_duplicate(list_in):
        """
        Remove duplicated items from a list
        @param list_in: list
        @return: de-duplicated list
        """
        return list(set(list_in))

    def dump(self, path, encoding):
        """
        Dump data to file
        @param path: path prefix
        @param encoding: file encoding
        @return: None
        """
        try:
            print "Saving ... "
            with codecs.open(path + '-queue.bak', 'wb', encoding) as wf:
                tmp = {'queue': list(set(list(self.queue.queue))), 'count': self.crawled}
                json.dump(tmp, wf)
            with codecs.open(path + '-bloom.bak', 'wb') as wf:
                self.bloom.tofile(wf)
        except Exception as e:
            print "Dump Uid Queue Failed"
            print e

    def restore(self, path, encoding):
        """
        Restore data from file
        @param path: path prefix
        @param encoding: file encoding
        @return: None
        """
        try:
            with codecs.open(path + '-bloom.bak', 'rb') as rf:
                self.bloom = BloomFilter.fromfile(rf)  # fromfile returns a new filter
            with codecs.open(path + '-queue.bak', 'rb', encoding) as rf:
                tmp = json.load(rf)
                [self.queue.put(uid) for uid in tmp['queue']]
                self.crawled = tmp['count']
                # setting encoding=utf-8 is wrong; this only deals with ascii
        except Exception as e:
            print "Restore Uid Queue Failed: ", e

    def _put_all(self, list_in, block, timeout):
        """
        Put every item in the list into the queue
        @param list_in: items to put
        @param block: whether put blocks
        @param timeout: timeout of put
        @return: None
        """
        [self.queue.put(uid, block, timeout) for uid in list_in]

    def extend(self, container, block=True, timeout=3):
        """
        Extend the uid queue: drop duplicates and put everything in container into the queue
        @param container: where the items come from
        @param block: whether put blocks
        @param timeout: timeout of put
        @return: WRONG_TYPE on error
        """
        # TODO: it seems duplicates cannot be removed using a set here
        tmp = []
        for uid in container:
            if uid not in self.bloom and uid not in self.queue.queue:
                tmp.append(uid)
        self._put_all(list_in=tmp, block=block, timeout=timeout)

    def get(self, block=True, timeout=3):
        """
        Get a uid from the queue and add it to the bloom filter
        @param block: whether get blocks
        @param timeout: timeout of get
        @return: uid
        """
        uid = self.queue.get(block=block, timeout=timeout)
        self.bloom.add(uid)
        self.crawled += 1
        return uid

    def __len__(self):
        # ......... part of the code is omitted here .........
Example 14: GadgetBox
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
class GadgetBox( object ):

    def __init__( self, name, size, dataDir="" ):
        self.dataDir = dataDir
        self.filter = BloomFilter( capacity=size, error_rate=0.1 )
        self.memory = None
        self.keys = None
        self.name = name
        self.size = size
        self.loadGadgets()

    def persist( self, keys, memory ):
        with open( os.path.join( self.dataDir, "%s.bloom" % self.name ), "wb" ) as f:
            self.filter.tofile( f )
        with open( os.path.join( self.dataDir, "%s.meta" % self.name ), "wb" ) as f:
            keyList = pb_pb2.KeyList()
            for k in keys:
                kk = pb_pb2.Key()
                kk.key = k[0]
                kk.gadget.CopyFrom( k[1] )
                keyList.keys.add().CopyFrom( kk )
            f.write( keyList.SerializeToString() )
        return self

    def get( self, key ):
        with open( "%s/%s.data" % ( self.dataDir, self.name ), "rb" ) as f:
            f.seek( key.offset )
            ret = f.read( key.size )
        return ret

    def putGadget( self, key, gadget ):
        self.keys[key] = gadget

    def put( self, data ):
        if( self.memory == None ):
            self.memory = MemoryStore( self.name, size=self.size, dataDir=self.dataDir )
        return self.memory.put( data )

    def close( self ):
        if( self.memory != None ):
            self.memory.close()
            self.memory = None

    def loadGadgets( self ):
        if( self.keys == None ):
            data = cStringIO.StringIO()
            try:
                with open( os.path.join( self.dataDir, "%s.meta" % self.name ), "rb" ) as f:
                    for i in itertools.count():
                        d = os.read( f.fileno(), 1024 * 1024 )
                        if not d:
                            break
                        else:
                            data.write( d )
                keyList = pb_pb2.KeyList.FromString( data.getvalue() )
                self.keys = {}
                for key in keyList.keys:
                    self.keys[key.key] = key.gadget
            except IOError:
                self.keys = {}
            finally:
                data.close()
        return self.keys
Example 15: open
# Required import: from pybloom import BloomFilter [as alias]
# Or: from pybloom.BloomFilter import tofile [as alias]
import os
import sys
from pybloom import BloomFilter
from unittest import TestSuite

if __name__ == "__main__":
    ser = open("oom", "rb")
    ser.seek(0)
    oom = BloomFilter.fromfile(ser)
    for f in os.listdir(sys.argv[1]):
        if f.find(".log") == -1:
            continue
        for line in open(os.path.join(sys.argv[1], f)):
            ss = line.strip().split("\t")
            if not (ss[len(ss) - 1] in oom):
                print ss[len(ss) - 1]
    sys.exit(0)

    # the code below builds the "oom" filter; it is unreachable while sys.exit(0) above is in place
    oom = BloomFilter(capacity=1000 * 1000 * 200, error_rate=0.0001)
    for f in os.listdir(sys.argv[1]):
        if f.find(".log") == -1:
            continue
        for line in open(os.path.join(sys.argv[1], f)):
            ss = line.strip().split("\t")
            oom.add(ss[len(ss) - 1])
            if not (ss[len(ss) - 1] in oom):
                print ss[len(ss) - 1]
    ser = open("oom", "wb")
    oom.tofile(ser)
    ser.close()