当前位置: 首页>>代码示例>>Python>>正文


Python BloomFilter.tofile方法代码示例

本文整理汇总了Python中pybloom.BloomFilter.tofile方法的典型用法代码示例。如果您正苦于以下问题:Python BloomFilter.tofile方法的具体用法?Python BloomFilter.tofile怎么用?Python BloomFilter.tofile使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloom.BloomFilter的用法示例。


在下文中一共展示了BloomFilter.tofile方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _build_filter

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def _build_filter():
    """Build a bloom filter from the password dump and serialize it.

    Every line of ``_WORST_DUMP`` has its last two characters removed
    (presumably a CRLF line terminator -- TODO confirm against the dump
    format) and is added to a ``BloomFilter``, which is then written to
    ``_BLOOM_DUMP``.
    """
    bf = BloomFilter(capacity=10000, error_rate=0.001)
    # Context manager so the dump's file handle is closed deterministically.
    with open(_WORST_DUMP) as dump:
        # Explicit loop rather than map(): map() is lazy on Python 3 and
        # would silently add nothing to the filter.
        for line in dump:
            bf.add(line[:-2])
    # tofile() emits packed binary data, so the target is opened in binary
    # mode to avoid newline translation on platforms that perform it.
    with open(_BLOOM_DUMP, 'wb') as f:
        bf.tofile(f)
    print("Serialized bloom filter to  %s" % _BLOOM_DUMP)
开发者ID:kalikaneko,项目名称:wrongpass,代码行数:9,代码来源:wrongpass.py

示例2: main

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def main(argv):
    """Build ``nsrl.bloom`` from the first CSV column of the NSRL database.

    The file at ``nsrl_path`` is read twice: once to count lines (used as
    the bloom filter capacity) and once to insert the hex-decoded hashes.

    Args:
        argv: Command-line arguments.  When non-empty, ``argv[0]`` is parsed
            as the bloom filter error rate.

    Returns:
        None.
    """
    # A distinct local name is required here: assigning to ``error_rate``
    # inside the function would make it local everywhere in the body and
    # raise UnboundLocalError whenever argv is empty.  The fallback reads the
    # module-level ``error_rate`` (assumed to be defined next to
    # ``nsrl_path`` -- TODO confirm).
    rate = float(argv[0]) if argv else error_rate
    print("[BUILDING] Using error-rate: {}".format(rate))

    if not os.path.isfile(nsrl_path):
        # Use %-formatting; passing the path as a second print argument
        # would print a tuple on Python 2 and leave "%s" unformatted.
        print("[ERROR] No such file or directory: %s" % nsrl_path)
        return

    print("[BUILDING] Reading in NSRL Database")
    with open(nsrl_path) as f_line:
        # Strip off header
        f_line.readline()
        print("[BUILDING] Calculating number of hashes in NSRL...")
        num_lines = sum(bl.count("\n") for bl in blocks(f_line))
        print("[BUILDING] There are %s hashes in the NSRL Database" % num_lines)

    with open(nsrl_path) as f_nsrl:
        # Strip off header
        f_nsrl.readline()
        print("[BUILDING] Creating bloomfilter")
        bf = BloomFilter(num_lines, rate)
        print("[BUILDING] Inserting hashes into bloomfilter")
        for line in f_nsrl:
            sha1_hash = line.split(",")[0].strip('"')
            if not sha1_hash:
                continue
            try:
                bf.add(binascii.unhexlify(sha1_hash))
            except Exception as e:
                # Best effort: report the bad row and keep going.
                print("[ERROR] %s" % (e,))
        print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
        with open('nsrl.bloom', 'wb') as nb:
            bf.tofile(nb)
        print("[BUILDING] Complete")

    return
开发者ID:blacktop,项目名称:docker-nsrl,代码行数:36,代码来源:build.py

示例3: main

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def main():
    """Build ``nsrl.bloom`` from the second CSV column of the NSRL database.

    The file at ``nsrl_path`` is read twice: once to count lines (used as
    the bloom filter capacity) and once to insert the hash strings.  The
    error rate comes from the module-level ``error_rate``.

    Returns:
        None.
    """
    if not os.path.isfile(nsrl_path):
        # Use %-formatting; passing the path as a second print argument
        # would print a tuple on Python 2 and leave "%s" unformatted.
        print("ERROR: No such file or directory: %s" % nsrl_path)
        return

    print("BUILDING: Reading in NSRL Database")
    with open(nsrl_path) as f_line:
        # Strip off header
        f_line.readline()
        print("BUILDING: Calculating number of hashes in NSRL...")
        num_lines = sum(bl.count("\n") for bl in blocks(f_line))
        print("BUILDING: There are %s hashes in the NSRL Database" % num_lines)

    with open(nsrl_path) as f_nsrl:
        # Strip off header
        f_nsrl.readline()
        print("BUILDING: Creating bloomfilter")
        bf = BloomFilter(num_lines, error_rate)
        print("BUILDING: Inserting hashes into bloomfilter")
        for line in f_nsrl:
            md5_hash = line.split(",")[1].strip('"')
            if not md5_hash:
                continue
            try:
                bf.add(md5_hash)
            except Exception as e:
                # Best effort: report the bad row and keep going.
                print("ERROR: %s" % (e,))
        print("BUILDING: NSRL bloomfilter contains {} items.".format(len(bf)))
        with open('nsrl.bloom', 'wb') as nb:
            bf.tofile(nb)
        print("BUILDING: Complete")

    return
开发者ID:morallo,项目名称:docker-nsrl,代码行数:32,代码来源:build.py

示例4: BloomZip

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
class BloomZip(object):
    """Write-once archive: a bloom filter of the words followed by gzip data.

    File layout produced by :meth:`close`: a 4-byte big-endian length, the
    serialized bloom filter of that length, then the gzip-compressed payload.
    When the named file already exists, its bloom filter is loaded on
    construction and the file is never rewritten.
    """

    def __init__(self, name):
        """Remember ``name`` and load the bloom filter if the file exists."""
        super(BloomZip, self).__init__()
        self.__data = StringIO()
        self._name = name
        self._bf = None

        if os.path.isfile(self._name):
            with open(self._name, 'rb') as f:
                length = struct.unpack(">L", f.read(4))[0]
                self._bf = BloomFilter.fromfile(f, length)

    def contains(self, word):
        """Return whether ``word`` tests positive against the bloom filter.

        Returns False when no filter has been loaded or built yet, instead
        of failing with a TypeError on ``word in None``.
        """
        if self._bf is None:
            return False
        return word in self._bf

    def write(self, data):
        """Buffer ``data`` in memory until :meth:`close` is called."""
        self.__data.write(data)

    def _serialized_filter_size(self):
        """Return the number of bytes ``self._bf.tofile`` produces.

        The filter is written to a real temporary file (mkstemp + fdopen)
        that is opened in binary mode and always removed afterwards; this
        avoids re-opening by name a NamedTemporaryFile that has already been
        deleted.
        """
        fd, path = tempfile.mkstemp()
        try:
            with os.fdopen(fd, 'wb') as tmp:
                self._bf.tofile(tmp)
            return os.path.getsize(path)
        finally:
            os.remove(path)

    def close(self):
        """Build the filter from the buffered words and write the archive.

        Does nothing when the target file already exists, so a filter that
        was loaded from that file stays intact.
        """
        # Check first: rebuilding the filter before this test would replace
        # a loaded filter with an (almost) empty one and then write nothing.
        if os.path.isfile(self._name):
            return

        payload = self.__data.getvalue()
        words = payload.split()

        self._bf = BloomFilter(capacity=len(words) + 1)
        for word in words:
            self._bf.add(word, skip_check=True)

        # Binary mode, and a context manager so the handle is closed even if
        # serialization or compression fails.
        with open(self._name, 'wb') as out:
            out.write(struct.pack(">L", self._serialized_filter_size()))
            self._bf.tofile(out)
            with GzipFile(self._name, 'w', fileobj=out) as gz:
                gz.write(payload)

    def __enter__(self):
        """Return the archive itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the archive; never suppress an in-flight exception."""
        self.close()
        # Returning False lets the interpreter re-raise the original
        # exception with its traceback intact.
        return False
开发者ID:stiege,项目名称:bloomzip,代码行数:57,代码来源:bloomzip.py

示例5: to_bloom

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def to_bloom(filename):
    """Write a bloom filter of the lines of ``filename`` to ``filename + ".bloom"``.

    Args:
        filename: Path of the text file whose lines are added to the filter.
    """
    b = BloomFilter(capacity=1000000, error_rate=0.001)

    with open(filename, 'r') as f:
        for line in f:
            # NOTE(review): iterated lines keep their line terminator, so
            # this test never rejects anything.  Left unchanged so existing
            # readers of the filter keep seeing identical contents.
            if line != "":
                b.add(line)

    new_filename = filename + ".bloom"
    # Context manager so the output is flushed and closed; previously the
    # handle was left open for the garbage collector.
    with open(new_filename, 'wb') as out_f:
        b.tofile(out_f)
开发者ID:chubbymaggie,项目名称:Cardinal,代码行数:13,代码来源:to_big_bloom.py

示例6: record

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def record(url):
    """
    First-time download of the images for ``url``.

    Collects the number list via ``getallnumlist``, stores it in a bloom
    filter file under ``./<url[28:]>/check/`` for later runs, then starts
    the download with ``multiprocessdownload``.
    """
    numlist = getallnumlist(url)

    bloomfilter = BloomFilter(1000000)
    for number in numlist:
        bloomfilter.add(number)
    # 'wb' instead of 'ab+': appending a second serialized filter to an
    # existing file would leave a file that no longer matches the layout
    # written by a single tofile() call.
    # url[28:] presumably drops a fixed-length URL prefix -- TODO confirm.
    with open('./%s/check/bloomfilter' % (url[28:]), 'wb') as b:
        bloomfilter.tofile(b)

    multiprocessdownload(numlist)
开发者ID:xdemonhx,项目名称:tieba-imgdownload-spider,代码行数:17,代码来源:tiebaspider.py

示例7: compile

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def compile():
    """Build the boys/girls name bloom filters from ``sample_data/names.csv``.

    Rows whose third column is below 0.0005 are skipped; the lower-cased
    second column is added to the filter selected by the fourth column
    ('boy' or 'girl').  The filters are written to ``blooms/boys`` and
    ``blooms/girls`` and their sizes are printed.
    """
    boys = BloomFilter(capacity=703)
    girls = BloomFilter(capacity=1003)

    with open('sample_data/names.csv', 'r') as f:
        reader = csv.reader(f)
        # Skip the header row; next() works on Python 2.6+ and Python 3,
        # unlike the Python-2-only reader.next().
        next(reader)
        for row in reader:
            if float(row[2]) < .0005:
                continue
            gender = row[3].lower()
            if gender == 'boy':
                boys.add(row[1].lower())
            elif gender == 'girl':
                girls.add(row[1].lower())

    # tofile() emits binary data, so open the targets in binary mode.
    with open('blooms/boys', 'wb') as f:
        boys.tofile(f)
    with open('blooms/girls', 'wb') as f:
        girls.tofile(f)
    print("%d %d" % (len(boys), len(girls)))
开发者ID:Glank,项目名称:rdp,代码行数:22,代码来源:compile_names.py

示例8: compile

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def compile():
    """Build the university-name bloom filter and NGPOL filter.

    Names are taken from lines of ``sample_data/uni_names.out`` that end in
    ``| "<name>"``, lower-cased, and added to a bloom filter.  An
    ``NGPOLFilter`` is built from the same names; names it rejects and its
    ``min_rating`` / ``deviation`` are printed before it is cleaned.  Both
    filters are then written to disk and the bloom filter size is printed.
    """
    uni_names = BloomFilter(capacity=719)
    name_strings = []
    with open('sample_data/uni_names.out', 'r') as f:
        for line in f:
            m = re.search(r'\| "(.*)"$', line.strip())
            if m:
                name = m.group(1).strip().lower()
                name_strings.append(name)
                uni_names.add(name)
    ngpol_filt = NGPOLFilter(4, name_strings)
    for name in name_strings:
        if name not in ngpol_filt:
            print(name)
    print(ngpol_filt.min_rating)
    print(ngpol_filt.deviation)
    ngpol_filt.clean()
    # Binary mode for the serialized dumps.
    with open('blooms/uni_names', 'wb') as f:
        uni_names.tofile(f)
    with open('ngpols/uni_names', 'wb') as f:
        ngpol_filt.tofile(f)
    print(len(uni_names))
开发者ID:Glank,项目名称:rdp,代码行数:24,代码来源:compile_uni_names.py

示例9: create_filter

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def create_filter(datafile, force=False):
    """Create ``<datafile>.filter``, a bloom filter of the parent-line words.

    Args:
        datafile: Path of an existing data file.
        force: Rebuild the filter even when the ``.filter`` file exists.

    Raises:
        AssertionError: If ``datafile`` is not an existing file.
    """
    assert os.path.isfile(datafile)
    datadir, datafilename = os.path.split(datafile)
    filter_file = os.path.join(datadir, datafilename + ".filter")
    if force or not os.path.isfile(filter_file):
        # Integer capacity: the float 1e6 cannot be packed into the integer
        # header field when the filter is serialized on Python 3.
        bf = BloomFilter(capacity=int(1e6))
        with open(datafile) as df:
            # Plain iteration ends cleanly at EOF, so an empty data file no
            # longer leaks StopIteration out of this function.
            for line in df:
                if not is_parent_line(line):
                    continue
                word, skips, _ = parse_parent_line(line)
                bf.add(word)
                # Skip the lines that belong to this parent; stop early if
                # the file ends in the middle of the group.
                for _unused in range(1, skips):
                    if next(df, None) is None:
                        break
        # tofile() emits binary data, so open the target in binary mode.
        with open(filter_file, 'wb') as ff:
            bf.tofile(ff)

        print("%s done." % filter_file)
开发者ID:yiransheng,项目名称:mumbler,代码行数:24,代码来源:bloomfilter.py

示例10: UrlFilterHandler

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
class UrlFilterHandler:
    """URL de-duplication backed by a bloom filter persisted in FILTER_FILE."""

    # Number of is_dup() calls after which the filter is saved to the file.
    NUMS = 2000

    def __init__(self):
        """Load the filter from FILTER_FILE, or start a new one on IOError."""
        try:
            # The serialized filter is binary data: read it in binary mode.
            with open(FILTER_FILE, 'rb') as f:
                self.f = BloomFilter.fromfile(f)
        except IOError:
            self.f = BloomFilter(capacity=10000000, error_rate=0.001)
        self.num = 0

    def is_dup(self, url):
        """Add ``url`` to the filter and return the result of that ``add``.

        Every call is counted; once the count exceeds ``NUMS`` the filter is
        saved and the counter is reset.
        """
        self.num += 1
        if self.num > self.NUMS:
            self.save()
            self.num = 0
        return self.f.add(url)

    def save(self):
        """Write the filter to FILTER_FILE (binary mode)."""
        with open(FILTER_FILE, 'wb') as f:
            self.f.tofile(f)
开发者ID:luotigerlsx,项目名称:DataAnalysis_ML,代码行数:25,代码来源:urlfilter_svc.py

示例11: compile

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def compile():
    """Compile the actor-name bloom filter, NGPOL filter and probability sets.

    Actor names are read from ``sample_data/actor_names.json``, upper-cased
    and reduced to their ``[A-Z0-9]`` characters.  A histogram of the NGPOL
    ratings is saved and shown, then every artifact is written to disk.
    """
    cleaned = []
    actor_bloom = BloomFilter(capacity=2494)
    with open('sample_data/actor_names.json', 'r') as source:
        document = json.load(source)
        for binding in document['results']['bindings']:
            upper = binding['name']['value'].upper()
            token = ''.join(re.findall('[A-Z0-9]+', upper))
            cleaned.append(token)
            actor_bloom.add(token)

    unique_names = list(set(cleaned))
    ngpol = NGPOLFilter(3, unique_names, false_neg_rate=.01)
    ngpol.update_bounds()
    ngpol.clean()

    # Rating histogram
    scores = []
    for candidate in unique_names:
        scores.append(ngpol.ngpol.rate(candidate))
    plot.hist(scores, bins=50)
    plot.xlabel("NGram Rating")
    plot.ylabel("Names")
    plot.savefig("imgs/actor_names.png")
    plot.show()

    with open('blooms/actor_names', 'wb') as bloom_out:
        actor_bloom.tofile(bloom_out)
    with open('ngpols/actor_names', 'wb') as ngpol_out:
        ngpol.tofile(ngpol_out)

    # Probability sets
    ngram_probs = NgramProbSet(3, unique_names)
    length_probs = LengthProbSet(unique_names)
    targets = (
        ('probsets/actor_names', ngram_probs),
        ('probsets/actor_names_len', length_probs),
    )
    for target_path, prob_set in targets:
        with open(target_path, 'wb') as prob_out:
            prob_set.tofile(prob_out)

    print("compiled.")
开发者ID:Glank,项目名称:rdp,代码行数:38,代码来源:compile_actor_names.py

示例12: build_one_case

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
def build_one_case(case_file):
    """Load one case file into the database and build its bloom filter.

    Each line is split on ``!#!`` into hash code, function name, thread id,
    status and time stamp.  The last four are inserted into the case table
    and the hash code is added to a bloom filter, which is written to the
    file named by ``get_bf_filename(case_file)`` and registered in
    ``CASE_VERSION``.

    Args:
        case_file: Case name; the input path is
            ``init.dir_case_files + case_file + format_case_file``.

    Returns:
        The file name the bloom filter was written to.
    """
    bf = BloomFilter(capacity=MAX_CAPACITY, error_rate=0.01)
    case_file_path = init.dir_case_files + case_file + format_case_file
    # Values are passed as parameters; only the table name (from ``init``)
    # is concatenated into the statement.
    insert_func_stack = "insert into "+init.case_table_name+" values(%s,%s,%s,%s,%s)"

    # Context manager so the case file handle is closed deterministically.
    with open(case_file_path, "r") as records:
        for line in records:
            ls = line.strip().split(r'!#!')
            if len(ls) < 5:
                print("case_file:  %s  dump line:  %s" % (case_file_path, line))
                # Skip the malformed line; indexing it below would raise
                # IndexError right after reporting it.
                continue
            hash_code, func_name, tid, status, time_stamp = ls[:5]
            param_func_stack = (case_file, func_name, tid, status, time_stamp)
            cursor.execute(insert_func_stack, param_func_stack)
            bf.add(hash_code)

    print("%s  bf_set size:  %s" % (case_file, len(bf)))
    bf_filename = get_bf_filename(case_file)
    # Close (and therefore flush) the filter file before recording it.
    with open(bf_filename, "wb") as bf_file:
        bf.tofile(bf_file)
    sql = "insert into CASE_VERSION VALUES(%s,%s,%s,%s)"
    param = (case_file, case_file_path, " ", bf_filename)
    cursor.execute(sql, param)
    return bf_filename
开发者ID:qgchenjianzi,项目名称:clang-tools,代码行数:26,代码来源:analyze.py

示例13: UidQueue

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
class UidQueue():
    """
    Uid queue, include queue and bloom filter
    """
    def __init__(self, max_count=200000, error_rate=0.001):
        """
        Start with an empty queue, a new bloom filter and a zero crawl count
        @param max_count: capacity passed to the bloom filter
        @param error_rate: error rate passed to the bloom filter
        @return: None
        """
        bloom_options = {'capacity': max_count, 'error_rate': error_rate}
        self.queue = Queue()
        self.bloom = BloomFilter(**bloom_options)
        self.crawled = 0
    
    @staticmethod
    def _remove_duplicate(list_in):
        """
        Collapse repeated items of a list
        @param list_in: list
        @return: new list with each distinct item once (order not preserved)
        """
        distinct = set(list_in)
        return list(distinct)
    
    def dump(self, path, encoding):
        """
        Save the queue (as JSON) and the bloom filter to backup files
        @param path: path prefix; '-queue.bak' and '-bloom.bak' are appended
        @param encoding: encoding used for the queue backup file
        @return: None
        """
        try:
            print("Saving ... ")
            with codecs.open(path + '-queue.bak', 'wb', encoding) as queue_out:
                pending = list(set(self.queue.queue))
                snapshot = {'queue': pending, 'count': self.crawled}
                json.dump(snapshot, queue_out)
            with codecs.open(path + '-bloom.bak', 'wb') as bloom_out:
                self.bloom.tofile(bloom_out)
        except Exception as err:
            # Failures are reported on stdout and otherwise ignored.
            print("Dump Uid Queue Failed")
            print(err)
    
    def restore(self, path, encoding):
        """
        Restore the bloom filter, queue and crawl count from backup files
        @param path: path prefix; '-bloom.bak' and '-queue.bak' are appended
        @param encoding: encoding used to read the queue backup file
        @return: None
        """
        try:
            with codecs.open(path+'-bloom.bak', 'rb') as rf:
                # fromfile() is a classmethod that RETURNS the loaded filter;
                # calling it on the instance and dropping the result left
                # self.bloom untouched, so the saved filter was never restored.
                self.bloom = BloomFilter.fromfile(rf)
            with codecs.open(path+'-queue.bak', 'rb', encoding) as rf:
                tmp = json.load(rf)
            # Plain loop: the puts are side effects, not a list to build.
            for uid in tmp['queue']:
                self.queue.put(uid)
            self.crawled = tmp['count']
            # set encoding=utf-8 is wrong, only deal with ascii

        except Exception as e:
            # Failures are reported on stdout and otherwise ignored.
            print("Restore Uid Queue Failed:  %s" % (e,))
    
    def _put_all(self, list_in, block, timeout):
        """
        Put every item of the list into the queue, in order
        @param list_in: items to enqueue
        @param block: block flag forwarded to queue.put
        @param timeout: timeout forwarded to queue.put
        @return: None
        """
        for item in list_in:
            self.queue.put(item, block, timeout)
    
    def extend(self, container, block=True, timeout=3):
        """
        Extend the uid queue with the items of container that are new
        An item is skipped when it is already in the bloom filter, already
        waiting in the queue, or a repeat of an earlier item of container
        @param container: iterable of uids
        @param block: block flag forwarded to queue.put
        @param timeout: timeout forwarded to queue.put
        @return: None
        """
        # The queue is only updated after the loop, so repeats inside one
        # container have to be tracked separately or they would all be
        # enqueued.  Assumes uids are hashable (dump() already relies on
        # that by building a set of the queue).
        seen = set()
        tmp = []
        for uid in container:
            if uid in seen:
                continue
            if uid not in self.bloom and uid not in self.queue.queue:
                seen.add(uid)
                tmp.append(uid)
        self._put_all(list_in=tmp, block=block, timeout=timeout)
        
    
    def get(self, block=True, timeout=3):
        """
        Take the next uid off the queue, record it in the bloom filter and
        count it as crawled
        @param block: block flag forwarded to queue.get
        @param timeout: timeout forwarded to queue.get
        @return: uid
        """
        fetch_options = {'block': block, 'timeout': timeout}
        next_uid = self.queue.get(**fetch_options)
        self.bloom.add(next_uid)
        self.crawled = self.crawled + 1
        return next_uid
    
    def __len__(self):
#.........这里部分代码省略.........
开发者ID:ezirmusitua,项目名称:weibo-crawler,代码行数:103,代码来源:uid_queue.py

示例14: GadgetBox

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
class GadgetBox( object ):
  """Named store made of a bloom filter, a key list and a data file.

  Files live in ``dataDir`` and are named ``<name>.bloom``, ``<name>.meta``
  and ``<name>.data``.
  """

  def __init__( self, name, size, dataDir="" ):
    """Create the filter and load the key list from ``<name>.meta``."""
    self.dataDir = dataDir
    self.filter = BloomFilter( capacity=size, error_rate=0.1 )
    self.memory = None
    self.keys = None
    self.name = name
    self.size = size
    self.loadGadgets()

  def persist( self, keys, memory ):
    """Write the bloom filter and the given keys to disk; return self.

    ``keys`` is iterated as pairs: element 0 is stored as the key and
    element 1 is copied in as the gadget.  ``memory`` is not used here.
    """
    with open( os.path.join( self.dataDir, "%s.bloom" % self.name ), "wb" ) as f:
      self.filter.tofile( f )

    with open( os.path.join( self.dataDir, "%s.meta" % self.name ), "wb" ) as f:
      keyList = pb_pb2.KeyList()

      for k in keys:
        kk = pb_pb2.Key()
        kk.key = k[0]
        kk.gadget.CopyFrom( k[1] )
        keyList.keys.add().CopyFrom( kk )

      f.write( keyList.SerializeToString() )

    return self

  def get( self, key ):
    """Return ``key.size`` bytes read at ``key.offset`` of ``<name>.data``."""
    # os.path.join, like persist()/loadGadgets(): the old "%s/%s.data"
    # formatting resolved to "/<name>.data" for the default dataDir="".
    with open( os.path.join( self.dataDir, "%s.data" % self.name ), "rb" ) as f:
      f.seek( key.offset )
      ret = f.read( key.size )

    return ret

  def putGadget( self, key, gadget ):
    """Map ``key`` to ``gadget`` in the in-memory key table."""
    self.keys[key] = gadget

  def put( self, data ):
    """Store ``data`` in the MemoryStore, creating it on first use."""
    if self.memory is None:
      self.memory = MemoryStore( self.name, size=self.size, dataDir=self.dataDir )
    return self.memory.put( data )

  def close( self ):
    """Close and drop the MemoryStore, if one was created."""
    if self.memory is not None:
      self.memory.close()
    self.memory = None

  def loadGadgets( self ):
    """Load the key table from ``<name>.meta`` once and return it.

    An IOError while opening or reading the file (for example when it does
    not exist) yields an empty table.
    """
    if self.keys is None:
      try:
        with open( os.path.join( self.dataDir, "%s.meta" % self.name ), "rb" ) as f:
          # One read() replaces the manual os.read()/StringIO chunk loop.
          keyList = pb_pb2.KeyList.FromString( f.read() )
        self.keys = {}
        for key in keyList.keys:
          self.keys[key.key] = key.gadget
      except IOError:
        self.keys = {}

    return self.keys
开发者ID:streed,项目名称:gadgetStore,代码行数:70,代码来源:gadget.py

示例15: open

# 需要导入模块: from pybloom import BloomFilter [as 别名]
# 或者: from pybloom.BloomFilter import tofile [as 别名]
import sys
from pybloom import BloomFilter
from unittest import TestSuite
import os  # used below for listdir/path.join; it was never imported


def _last_field(line):
    """Return the last tab-separated field of the stripped ``line``."""
    return line.strip().split("\t")[-1]


def _iter_log_lines(log_dir):
    """Yield each line of every file in ``log_dir`` whose name contains ".log"."""
    for name in os.listdir(log_dir):
        if ".log" not in name:
            continue
        # Context manager so each log file is closed once it has been read.
        with open(os.path.join(log_dir, name)) as log:
            for line in log:
                yield line


def _check_logs(log_dir, filter_path="oom"):
    """Print every last field of the logs that is missing from the saved filter."""
    with open(filter_path, "rb") as ser:
        oom = BloomFilter.fromfile(ser)
    for line in _iter_log_lines(log_dir):
        value = _last_field(line)
        if value not in oom:
            print(value)


def _build_filter(log_dir, filter_path="oom"):
    """Build the filter from the last field of every log line and save it.

    This was unreachable code after a module-level ``sys.exit(0)``; it is
    kept as a helper that the script does not call.  The former membership
    test straight after ``add`` is dropped: a bloom filter reports no false
    negatives, so it could never print anything.
    """
    oom = BloomFilter(capacity=1000*1000*200, error_rate=0.0001)
    for line in _iter_log_lines(log_dir):
        oom.add(_last_field(line))
    with open(filter_path, "wb") as ser:
        oom.tofile(ser)


if __name__ == "__main__":
    _check_logs(sys.argv[1])
    # Exit only when run as a script; the old module-level sys.exit(0) also
    # terminated any program that merely imported this module.
    sys.exit(0)
开发者ID:fikgol,项目名称:vulcan,代码行数:32,代码来源:urlset.py


注:本文中的pybloom.BloomFilter.tofile方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。