当前位置: 首页>>代码示例>>Python>>正文


Python ScalableBloomFilter.tofile方法代码示例

本文整理汇总了Python中pybloom.ScalableBloomFilter.tofile方法的典型用法代码示例。如果您正苦于以下问题:Python ScalableBloomFilter.tofile方法的具体用法?Python ScalableBloomFilter.tofile怎么用?Python ScalableBloomFilter.tofile使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pybloom.ScalableBloomFilter的用法示例。


在下文中一共展示了ScalableBloomFilter.tofile方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: FilterHandler

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import tofile [as 别名]
class FilterHandler(object):
  """URL deduplicator backed by a ScalableBloomFilter persisted at data/bloom.data."""

  def __init__(self, logger):
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    """Record *url* and return True if it was already present (a duplicate).

    ScalableBloomFilter.add() returns True when the key is already in the
    filter, so a True result here means "seen before".
    """
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    """Load the filter from the cache file, or create a fresh one if missing."""
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, create one instead.')
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      # The serialized filter is binary data: must open in 'rb'.  The
      # original 'r' text mode corrupts the stream on Python 3 / Windows.
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    """Persist the filter to disk, creating the data directory if needed."""
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    # Binary mode ('wb') to match fromfile()'s 'rb'.
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    """Flush the filter to disk; call before shutdown or entries are lost."""
    self._dump_to_file()
开发者ID:cfhb,项目名称:crawl_youtube,代码行数:36,代码来源:url_filter_service.py

示例2: __init__

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import tofile [as 别名]
class BloomAutoYara:
  def __init__(self,filterfile):
    """Load the bloom filter from *filterfile* if present, else start empty."""
    self.filterfile = filterfile
    # If filterfile is present load the bloom filter from it, else create a new one.
    if os.path.exists(filterfile):
      # 'with' closes the handle deterministically; the original leaked it.
      with open(filterfile, "rb") as fp:
        self.bf = ScalableBloomFilter.fromfile(fp)
      print("available signatures = %d"%len(self.bf))
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

  def save_filter(self):
    """Serialize the bloom filter to self.filterfile (binary)."""
    print("saving filter to file %s "%self.filterfile)
    # 'with' ensures the handle is flushed and closed; the original left
    # the open() result unreferenced for the GC to clean up.
    with open(self.filterfile, "wb") as fp:
      self.bf.tofile(fp)

  def add_string(self,str):
    """Insert a string into the bloom filter.

    NOTE(review): the parameter name shadows the builtin ``str``; kept
    as-is to avoid breaking keyword-argument callers.
    """
    self.bf.add(str)

  def search_string(self,str):
    """Return True if the string is (probably) present in the bloom filter."""
    # Membership already yields a bool; no need for an explicit if/else.
    return str in self.bf

  def extractlines(self,filename,min_len=4):
    """Return the unique printable-ASCII runs of at least *min_len* chars in a file.

    Bug fix: the original accepted *min_len* but ignored it, always using a
    hard-coded run length of 4.  The default of 4 preserves old behavior.
    """
    chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
    regexp = '[%s]{%d,}' % (chars, min_len)
    pattern = re.compile(regexp)
    # 'with' closes the handle even if read()/findall() raises.
    with open(filename, "rb") as fp:
      data = fp.read()
    return list(set(pattern.findall(data)))
   
  def build_filter(self,dirname,extensions=None):
    """Walk *dirname*, add every extracted string to the filter, then save it.

    extensions: optional list of file extensions to restrict to; None or an
    empty list means "all files".  (The original used a mutable default
    argument ``[]`` — a classic Python pitfall — replaced with None.)
    """
    if extensions is None:
      extensions = []
    print(extensions)
    total = 0
    for (dirpath, _, files) in os.walk(dirname):
      for f in files:
        ext = f.split(".")[-1]

        # len(extensions) != 0 <=> truthiness of the list.
        if extensions and ext not in extensions:
          continue

        print("processing file %s"%f)
        total += 1
        path = os.path.join(dirpath, f)
        for line in self.extractlines(path):
          self.add_string(line)

    print("creating bloom filter done. Total files = %d (Total entries = %d). Overwriting to bloom filter output file %s"%(total,len(self.bf),self.filterfile))
    self.save_filter()
    
  def find_file_topn(self,filename,topn=10):
    """Return the *topn* longest strings in *filename* absent from the filter,
    longest first."""
    lines = self.extractlines(filename)
    print("total unique strings in file %s = %d"%(filename,len(lines)))
    # Keep only strings the bloom filter has not seen.
    unseen = [line for line in lines if not self.search_string(line)]
    unseen.sort(key=len)
    print("total strings which can be used for signature = %d"%len(unseen))
    top = unseen[-topn:]
    top.reverse()
    return top
    
  def find_dir_topn(self,dirname,topn=10):
    """Return the *topn* most frequent strings under *dirname* that are not
    already in the bloom filter, as (string, count) pairs."""
    unseen = []
    for (dirpath, _, files) in os.walk(dirname):
      for f in files:
        path = os.path.join(dirpath, f)
        unseen.extend(line for line in self.extractlines(path)
                      if not self.search_string(line))

    # Counter accepts any iterable; the original's extra list() copy was redundant.
    return Counter(unseen).most_common(topn)

  def escapechars(self,str):
    """Backslash-escape regex metacharacters in the given string.

    (Parameter name shadows the builtin ``str``; kept for interface
    compatibility.)
    """
    specials = "\/.^$*+-?()[]{}|"
    # Equivalent to chained .replace() calls: each special char gets a
    # single backslash prefix, and added backslashes are never re-escaped.
    return "".join("\\" + ch if ch in specials else ch for ch in str)
    
  def list_to_rule(self,list,rulename,threshold=0.5):
    tmp = "rule " + rulename + "{\n"
    tmp += " strings:\n"
    
    for i in xrange(0,len(list)):
      esc = self.escapechars(list[i])
      tmp = tmp + "$str%d = "%i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
      tmp += "\n"
    
    tmp += "condition:\n"
    tmp += str(int(len(list)*threshold))
    tmp += " of ("
    for i in xrange(0,len(list)):
      tmp += "$str"+ str(i)
#.........这里部分代码省略.........
开发者ID:FireAVR,项目名称:BloomAutoYara,代码行数:103,代码来源:BloomAutoYara.py

示例3: open

# 需要导入模块: from pybloom import ScalableBloomFilter [as 别名]
# 或者: from pybloom.ScalableBloomFilter import tofile [as 别名]
    with open(CVX_PATH, encoding='utf-16') as handle:
        reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)

        for row in reader:
            bf.add(CVX + '|' + row['cvx code'].strip())

try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

# Populate the filter from every vocabulary source.
# NOTE(review): these run at import time as well, not only when executed as
# a script — presumably intentional, but the filter is only written back to
# disk under the __main__ guard below; confirm that is the desired behavior.
import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
import_cvx(bf)

if __name__ == '__main__':
    # Persist the (possibly updated) filter; tofile() writes binary, so 'wb'.
    with open(BF_PATH, 'wb') as handle:
        bf.tofile(handle)
开发者ID:sync-for-science,项目名称:test-suite,代码行数:32,代码来源:build_bf.py


注:本文中的pybloom.ScalableBloomFilter.tofile方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。