This article collects typical usage examples of the Python method pybloom.ScalableBloomFilter.tofile. If you have been wondering what ScalableBloomFilter.tofile does, how to call it, or what it looks like in real code, the curated examples below may help. You can also browse the containing class, pybloom.ScalableBloomFilter, for further usage examples.
Three code examples of ScalableBloomFilter.tofile are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
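Before diving into the examples, it may help to see the basic contract: tofile writes a serialized ScalableBloomFilter to a binary file object, and the classmethod fromfile reads one back. The following minimal round-trip sketch is not taken from the examples below; the file name and capacity values are illustrative only.
from pybloom import ScalableBloomFilter

# Build a small filter and persist it to disk in binary mode.
sbf = ScalableBloomFilter(initial_capacity=1000, error_rate=0.001)
sbf.add('example-key')
with open('filter.bloom', 'wb') as fh:  # illustrative path
    sbf.tofile(fh)

# Later, or in another process, restore the filter and query it.
with open('filter.bloom', 'rb') as fh:
    restored = ScalableBloomFilter.fromfile(fh)
assert 'example-key' in restored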
Example 1: FilterHandler
# Required import: from pybloom import ScalableBloomFilter [as alias]
# Alternatively: from pybloom.ScalableBloomFilter import tofile [as alias]
# The example below also requires: import os
class FilterHandler(object):
    def __init__(self, logger):
        self.logger_ = logger
        self._load_from_file()

    def url_seen(self, url):
        # ScalableBloomFilter.add() returns True when the key was already present,
        # so a single call both records the URL and tests for duplication.
        if self.deduper_.add(url):
            self.logger_.info('url duplicated: %s', url)
            return True
        return False

    def _load_from_file(self):
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile('data/bloom.data'):
            self.logger_.error('bloom cache file not found, creating a new one instead.')
            self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
        else:
            # The filter is serialized as raw bytes, so the cache must be opened in binary mode.
            with open('data/bloom.data', 'rb') as f:
                self.deduper_ = ScalableBloomFilter.fromfile(f)

    def _dump_to_file(self):
        self.logger_.info('dumping data...')
        if not os.path.isdir('data'):
            os.mkdir('data')
        with open('data/bloom.data', 'wb') as f:
            self.deduper_.tofile(f)
        self.logger_.info('dump data finished.')

    def close(self):
        self._dump_to_file()
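How this handler is driven is not shown in the excerpt; a hedged usage sketch, assuming a standard library logger and placeholder URLs, might look like this:
import logging

logging.basicConfig(level=logging.INFO)
handler = FilterHandler(logging.getLogger('dedup'))  # 'dedup' is a placeholder logger name

for url in ('http://example.com/a', 'http://example.com/a'):
    if not handler.url_seen(url):
        pass  # first sighting: hand the URL on to the crawler
handler.close()  # persists the filter back to data/bloom.data via tofile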
Example 2: BloomAutoYara
# Required import: from pybloom import ScalableBloomFilter [as alias]
# Alternatively: from pybloom.ScalableBloomFilter import tofile [as alias]
# The example below also requires: import os, re and from collections import Counter
class BloomAutoYara:
    def __init__(self, filterfile):
        self.filterfile = filterfile
        # If filterfile is present, load the bloom filter from it; otherwise create a new one.
        if os.path.exists(filterfile):
            self.bf = ScalableBloomFilter.fromfile(open(filterfile, "rb"))
            print "available signatures = %d" % len(self.bf)
        else:
            self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def save_filter(self):
        print "saving filter to file %s" % self.filterfile
        self.bf.tofile(open(self.filterfile, "wb"))

    def add_string(self, str):
        self.bf.add(str)

    def search_string(self, str):
        return str in self.bf

    def extractlines(self, filename, min_len=4):
        # Extract runs of at least min_len printable characters from a binary file.
        chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
        regexp = '[%s]{%d,}' % (chars, min_len)
        pattern = re.compile(regexp)
        fp = open(filename, "rb")
        data = fp.read()
        lines = pattern.findall(data)
        s = set(lines)
        fp.close()
        return list(s)

    def build_filter(self, dirname, extensions=[]):
        print extensions
        total = 0
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                ext = f.split(".")[-1]
                if len(extensions) != 0 and ext not in extensions:
                    continue
                print "processing file %s" % f
                total += 1
                path = os.path.join(dir, f)
                lines = self.extractlines(path)
                for line in lines:
                    self.add_string(line)
        print "creating bloom filter done. Total files = %d (Total entries = %d). Overwriting bloom filter output file %s" % (total, len(self.bf), self.filterfile)
        self.save_filter()

    def find_file_topn(self, filename, topn=10):
        tmp = []
        lines = self.extractlines(filename)
        print "total unique strings in file %s = %d" % (filename, len(lines))
        for line in lines:
            if self.search_string(line) == False:
                tmp.append(line)
        tmp.sort(key=len)
        print "total strings which can be used for signature = %d" % len(tmp)
        tmp = tmp[-topn:]
        tmp.reverse()
        return tmp

    def find_dir_topn(self, dirname, topn=10):
        tmplist = []
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                path = os.path.join(dir, f)
                lines = self.extractlines(path)
                for line in lines:
                    if self.search_string(line) == False:
                        tmplist.append(line)
        counts = Counter(list(tmplist))
        return counts.most_common(topn)

    def escapechars(self, str):
        for c in "\\/.^$*+-?()[]{}|":
            str = str.replace(c, "\\" + c)
        return str

    def list_to_rule(self, list, rulename, threshold=0.5):
        tmp = "rule " + rulename + "{\n"
        tmp += " strings:\n"
        for i in xrange(0, len(list)):
            esc = self.escapechars(list[i])
            tmp = tmp + "$str%d = " % i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
            tmp += "\n"
        tmp += "condition:\n"
        tmp += str(int(len(list) * threshold))
        tmp += " of ("
        for i in xrange(0, len(list)):
            tmp += "$str" + str(i)
        # ......... the rest of this example's code is omitted here .........
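Only the class definition appears in the excerpt; a sketch of the intended workflow, assuming a directory of known-clean files and a suspect sample (all paths, extensions and the rule name below are placeholders):
# Build a whitelist filter from goodware strings, then rank candidate
# signature strings from a suspect file that do NOT appear in the whitelist.
yara_helper = BloomAutoYara('whitelist.bloom')
yara_helper.build_filter('/corpus/clean', ['exe', 'dll'])

candidates = yara_helper.find_file_topn('/samples/suspect.bin', topn=10)
rule_text = yara_helper.list_to_rule(candidates, 'suspect_strings', threshold=0.5)
print(rule_text)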
Example 3: open
# Required import: from pybloom import ScalableBloomFilter [as alias]
# Alternatively: from pybloom.ScalableBloomFilter import tofile [as alias]
# The example below also requires: import csv
# Note: this example is an excerpt. CVX_PATH, CVX, fieldnames, BF_PATH,
# INITIAL_CAPACITY and ERROR_RATE are module-level constants defined elsewhere
# in the script, and the first with-block appears to be the tail of one of the
# import_* helpers (it reads CVX codes into the shared filter).
with open(CVX_PATH, encoding='utf-16') as handle:
    reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)
    for row in reader:
        bf.add(CVX + '|' + row['cvx code'].strip())

try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

# Populate the filter from each terminology source, then persist it.
import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
import_cvx(bf)

if __name__ == '__main__':
    with open(BF_PATH, 'wb') as handle:
        bf.tofile(handle)
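As a final sanity check (not part of the original script), the persisted filter can be reloaded with the symmetric fromfile call. The lookup key below only mirrors the CVX + '|' + code pattern visible in the excerpt, and the value '08' is purely illustrative:
from pybloom import ScalableBloomFilter

with open(BF_PATH, 'rb') as handle:
    terminology_bf = ScalableBloomFilter.fromfile(handle)

# Membership test: may very rarely report a false positive, never a false negative.
print((CVX + '|' + '08') in terminology_bf)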