当前位置: 首页>>代码示例>>Python>>正文


Python BloomFilter.add方法代码示例

本文整理汇总了Python中pybloomfilter.BloomFilter.add方法的典型用法代码示例。如果您正苦于以下问题:Python BloomFilter.add方法的具体用法是什么?Python BloomFilter.add怎么用?有哪些Python BloomFilter.add的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloomfilter.BloomFilter的用法示例。


在下文中一共展示了BloomFilter.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: LinkFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Per-domain link de-duplicator backed by two on-disk Bloom filters.

    One filter tracks URLs handed to ``index_filter`` and the other tracks
    URLs handed to ``html_filter``. They live in ``<domain>_index.bf`` and
    ``<domain>_html.bf`` and are reopened when those files already exist.
    """

    def __init__(self, domain):
        """Open (or create) both filters for *domain*."""
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')

        self.bf_index = self._load_or_create(self.file_index)
        self.bf_html = self._load_or_create(self.file_html)

    @staticmethod
    def _load_or_create(path):
        """Reopen the filter stored at *path*, or create a new one there."""
        if not os.path.exists(path):
            return BloomFilter(100000000, 0.001, path)
        return BloomFilter.open(path)

    @staticmethod
    def _unseen(bloom, links):
        """Record every link's URL in *bloom* and return the new links."""
        # ``add`` is truthy when the URL was already in the filter, so the
        # links kept here are exactly the ones seen for the first time.
        return [candidate for candidate in links
                if not bloom.add(candidate.url)]

    def index_filter(self, links):
        """Return the links whose URL was not yet in the index filter."""
        return self._unseen(self.bf_index, links)

    def html_filter(self, links):
        """Return the links whose URL was not yet in the HTML filter."""
        return self._unseen(self.bf_html, links)
开发者ID:wangjie1991,项目名称:crawler,代码行数:32,代码来源:linkfilter.py

示例2: main

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def main():
    """Convert a trace file into ``converted-<trace file>``.

    Expects exactly one command-line argument, the trace file. Every line
    that starts with ``W,`` or ``R,`` is turned into ``<op>,<number>``,
    where ``<op>`` is the line's first character and ``<number>`` is a
    hash of the rest of the line (reduced modulo 10**10, then multiplied
    by 10000). All other lines are ignored. Output is appended to the
    destination file.

    Exits with status 1 after printing a usage message when the argument
    count is wrong.
    """
    # Check for command line arguments
    if len(sys.argv) != 2:
        print('Usage: %s [trace file]' % os.path.basename(sys.argv[0]))
        sys.exit(1)

    # Read arguments from command line
    inFile = sys.argv[1]

    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')

    outputFileName = "converted-" + inFile

    # Open the input first so a missing trace file does not leave an empty
    # output file behind; ``with`` closes both files even on error.
    with open(inFile, 'r') as trace, open(outputFileName, "a") as out:
        for line in trace:
            if not line.startswith(("W,", "R,")):
                continue

            payload = line[2:]
            # hashlib needs bytes; on Python 2 ``str`` already is bytes.
            if not isinstance(payload, bytes):
                payload = payload.encode('utf-8')

            hash1 = int(hashlib.sha1(payload).hexdigest(), 16) % (10 ** 10)
            hash2 = int(hashlib.md5(payload).hexdigest(), 16) % (10 ** 10)

            # NOTE(review): because of ``and`` short-circuiting, hash2 is
            # only recorded in bf2 when hash1 was already in bf1. Kept as
            # in the original; confirm this is intended.
            if bf1.add(hash1) and bf2.add(hash2):
                out.write('%s,%d\n' % (line[0], hash1 * 10000))
            else:
                out.write('%s,%d\n' % (line[0], hash2 * 10000))
开发者ID:theopengroup,项目名称:EAD,代码行数:33,代码来源:convert.py

示例3: URIBloomFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class URIBloomFilter(BaseDupeFilter):
    """Duplicate-request filter that keeps request fingerprints in a Bloom filter.

    The filter is sized from the ``DUPEFILTER_CAPACITY`` setting and backed
    by the file named in ``DUPEFILTER_FILENAME``; the error rate is fixed
    at 0.01.
    """

    def __init__(self, settings, debug=False):
        """Create the Bloom filter described by *settings*.

        Args:
            settings: object providing ``getint`` and ``get``.
            debug: when true, ``log`` reports every filtered duplicate.
        """
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        # ``log`` reads and clears this flag; without initialising it the
        # first non-debug call to ``log`` raised AttributeError.
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from *settings*, honouring ``DUPEFILTER_DEBUG``."""
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        """Return True if *request* was seen before; otherwise record it.

        Returns:
            True for a fingerprint already in the filter, False for a new
            one (the original returned None here, which is equally falsy).
        """
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        self.insert(fp)
        return False

    ###-------todo-------##
    def request_fingerprint(self, request):
        """Return the fingerprint of *request* via the module-level helper."""
        return request_fingerprint(request)

    def check(self, request):
        """Return whether *request* is in the filter.

        ``request_seen`` passes a fingerprint here, despite the parameter
        name.
        """
        return request in self.bloom_filter_

    def insert(self, request):
        """Add *request* to the filter (``request_seen`` passes a fingerprint)."""
        self.bloom_filter_.add(request)

    def reset(self):
        """Clear every entry from the filter."""
        self.bloom_filter_.clear_all()

    def save(self):
        """No-op placeholder."""
        pass

    def load(self):
        """Flush the current filter, then replace it with the one in ``bloom.dump``.

        ``open`` returns a new filter object; the original discarded it, so
        nothing was ever loaded.
        """
        self.bloom_filter_.sync()
        # NOTE(review): the path is hard-coded and may differ from
        # self.filename -- confirm which file is meant to be loaded.
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        """Log a filtered duplicate and bump the ``dupefilter/filtered`` stat.

        In debug mode every duplicate is logged. Otherwise only the first
        one is logged, and ``logdupes`` is then switched off.
        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
开发者ID:wuwenjunwwj,项目名称:inst_spider,代码行数:59,代码来源:bloom_filter.py

示例4: create_bf

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def create_bf():
    """Build the ``filter_base.bloom`` Bloom filter from the key-digest file.

    Reads up to ``count`` records of ``keyDigestLen`` characters each from
    ``keyDigestFile`` and adds every record to a new filter created with
    ``count`` and ``error_rate`` (all four are module-level names).

    Returns:
        The populated filter. The original returned nothing, so existing
        callers are unaffected.
    """
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')

    # ``with`` closes the file even if reading or adding fails.
    with open(keyDigestFile, 'r') as digest_file:
        for _ in range(count):
            keyDigest = digest_file.read(keyDigestLen)
            if not keyDigest:
                # End of file reached early: stop instead of inserting
                # empty strings for the remaining iterations.
                break
            bf.add(keyDigest)

    return bf
开发者ID:enzocxt,项目名称:bloomfilter,代码行数:15,代码来源:bloomfilter.py

示例5: process

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def process(files):
    """Print each distinct input line once, remembering lines across runs.

    Lines are read with ``fileinput.input(files)``, stripped, and printed
    only when the Bloom filter stored in ``bloomfile`` has not seen them.
    The filter is reopened if that file exists and is otherwise created
    from the module-level ``MAXUNIQUES`` and ``ACCUACY`` values.

    Args:
        files: file names to read. As with ``fileinput.input``, an empty
            list falls back to ``sys.argv[1:]`` and then ``sys.stdin``,
            and ``'-'`` stands for ``sys.stdin``.
    """
    if os.path.isfile(bloomfile):
        seen = BloomFilter.open(bloomfile)
    else:
        seen = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    try:
        for record in fileinput.input(files):
            record = record.strip()
            # ``add`` is truthy when the record was already present, so one
            # call both tests and records it.
            if not seen.add(record):
                print(record)
    finally:
        # Persist what was recorded even if reading fails part-way.
        seen.sync()
        seen.close()
开发者ID:235,项目名称:data-utils,代码行数:17,代码来源:uniq.py

示例6: MongoDBPipeline

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class MongoDBPipeline(object):
    """Item pipeline that stores new items in MongoDB and a search index.

    Items are de-duplicated on their ``link`` field with a Bloom filter
    backed by the ``filter.bloom`` file.
    """

    def __init__(self):
        """Connect to MongoDB, create the filter and start the search index."""
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Validate, de-duplicate and store *item*.

        Returns:
            The item, once it has been upserted and indexed.

        Raises:
            DropItem: if any field of the item is empty, or if the item's
                link is already in the filter.
        """
        # Validate first, so an incomplete item is not recorded in the
        # filter and a later complete copy can still get through. The
        # original looped over the item's keys, which are never empty, so
        # this check could not fire.
        for field in item:
            if not item[field]:
                raise DropItem("Missing data!")

        # ``add`` is truthy when the link was already in the filter.
        if self.bf.add(item['link']):
            raise DropItem("Duplicate item found: %s" % item)

        # NOTE(review): Collection.update is deprecated in newer pymongo
        # releases; kept to stay compatible with the version in use.
        self.collection.update({'link': item['link']}, dict(item), upsert=True)
        log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
        self.si.AddIndex(item)
        return item

    def __del__(self):
        """Finish the search index, if it was ever created."""
        # __init__ may have failed before ``si`` was assigned.
        si = getattr(self, 'si', None)
        if si is not None:
            si.IndexDone()
开发者ID:pianer,项目名称:SearchLaw,代码行数:29,代码来源:pipelines.py

示例7: URLBloomFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class URLBloomFilter:
    """Stores previously unseen URLs in a database, de-duplicated by a Bloom filter.

    Call ``initdb``, ``initfilter`` and ``initsql`` before ``add``.
    """

    dbconn = None  # database connection wrapper, set by initdb()
    cur = None     # cursor obtained from dbconn, set by initdb()
    urlbf = None   # Bloom filter of seen URLs, set by initfilter()
    sql = None     # statement executed for each new URL, set by initsql()

    def initdb(self, host = 'localhost', user = 'muye', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        """Connect to the database and open a cursor.

        NOTE(review): ``port`` and ``charset`` are accepted but never
        passed to ``connect`` -- confirm whether the wrapper supports them.
        """
        self.dbconn = MySQLConnection.MySQLConn()
        self.dbconn.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = self.dbconn.cursor()

    def initfilter(self, filename = './url.filter'):
        """Reopen the filter stored in *filename*, or create it there."""
        if os.path.isfile(filename):
            self.urlbf = BloomFilter.open(filename)
        else:
            self.urlbf = BloomFilter(10000000, 0.001, filename)

    def initsql(self, m_sql):
        """Set the statement that ``add`` executes with the URL as its parameter."""
        self.sql = m_sql

    def add(self, url):
        """Store *url* if it has not been seen before.

        Returns:
            True if the URL was new and the statement was executed, False
            if the filter already contained it.
        """
        if url in self.urlbf:
            return False
        # Parameters are passed as a sequence, as DB-API drivers expect; a
        # bare string may be treated as a sequence of characters.
        self.cur.execute(self.sql, (url,))
        # Record the URL only after the statement succeeded, so a database
        # error does not mark an unsaved URL as seen.
        self.urlbf.add(url)
        return True

    def close(self):
        """Flush the filter to disk and close the database connection."""
        try:
            if self.urlbf is not None:
                self.urlbf.sync()
        finally:
            if self.dbconn is not None:
                self.dbconn.close()
开发者ID:muye5,项目名称:muye5code,代码行数:31,代码来源:URLFilter.py

示例8: DuplicatesPipeline

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class DuplicatesPipeline(object):
    """Item pipeline that drops already-visited URLs and records new ones.

    Seen URLs are tracked in a Bloom filter backed by ``filter.bloom``.
    Each new item is appended to the ``visitedsites`` file as
    ``<url>\\t<title>\\n`` and handed to the search index.
    """

    def __init__(self):
        """Create the filter, open the output file and start the search index."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Binary mode: save_to_file writes UTF-8 encoded bytes, which
        # behaves the same on Python 2 and Python 3.
        self.f_write = open('visitedsites', 'wb')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Record and return *item* unless its URL was seen before.

        Raises:
            DropItem: if the item's URL is already in the filter.
        """
        print('************%d pages visited!*****************' % len(self.bf))
        # ``add`` is truthy when the URL was already in the filter.
        if self.bf.add(item['url']):
            raise DropItem("Duplicate item found: %s" % item)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        return item

    @staticmethod
    def _as_bytes(text):
        """Return *text* as UTF-8 bytes, leaving byte strings untouched."""
        if isinstance(text, bytes):
            return text
        return text.encode('utf-8')

    def save_to_file(self, url, utitle):
        """Append one ``url<TAB>title`` line to the output file."""
        # The original called ``utitle.encode`` unconditionally, which
        # fails for byte strings holding non-ASCII data on Python 2.
        record = b'\t'.join([self._as_bytes(url), self._as_bytes(utitle)])
        self.f_write.write(record + b'\n')

    def __del__(self):
        """Close the output file and finish the search index."""
        # __init__ may have failed before either attribute was assigned.
        f_write = getattr(self, 'f_write', None)
        if f_write is not None:
            f_write.close()
        si = getattr(self, 'si', None)
        if si is not None:
            si.IndexDone()
开发者ID:PeinYu,项目名称:SearchEngine,代码行数:30,代码来源:pipelines.py

示例9: SpamCheck

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class SpamCheck (object):
    """Spam lookup backed by a Bloom filter stored in ``filter.bloom``.

    Known bad entries are also kept, one per line, in ``bad_numbers.txt``,
    which is used to rebuild the filter when it cannot be opened.
    """

    def __init__(self):
        """Set up logging, then load the filter or build it from scratch."""
        # Setup the logging
        self.ilog = logging.getLogger('prog')
        self.ilog.setLevel(logging.INFO)
        self.console = logging.StreamHandler(sys.stderr)
        self.console.setLevel(logging.INFO)
        self.console.setFormatter(logging.Formatter('%(message)s'))
        self.ilog.addHandler(self.console)

        # Try loading the filter. The original logged through an undefined
        # name (``ilog``) inside the ``try``; the resulting NameError was
        # swallowed by a bare ``except``, so the filter was rebuilt on
        # every start even when loading had succeeded.
        try:
            self.__loadFilter__()
        except Exception:
            # Create the filter if it cannot be opened. The fallback is
            # deliberately broad, but it no longer hides KeyboardInterrupt
            # or SystemExit.
            self.ilog.debug("Exception in loading ....")
            self.__create__()
            self.ilog.debug("Creating the file ... ")
        else:
            self.ilog.debug("loading filter..")

    def __loadFilter__(self):
        """Open the existing ``filter.bloom`` file."""
        self.bf = BloomFilter.open('filter.bloom')

    def __create__(self):
        """Create a new filter and fill it from ``bad_numbers.txt``."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Seed with a dummy entry first. spam() opens the list in append
        # mode, so the file exists before it is read below.
        self.spam("000")
        # Generate the filter from a file
        with open("bad_numbers.txt") as f:
            for nums in f:
                self.bf.add(nums.rstrip())
                self.ilog.debug(".")

    def spam(self, bad_entity):
        """Append *bad_entity* to ``bad_numbers.txt`` and add it to the filter."""
        with open("bad_numbers.txt", "a+") as f:
            f.write(bad_entity)
            f.write("\n")
            self.ilog.info("Added bad entry to file")
        self.bf.add(bad_entity)

    def isSpam(self, entity):
        """Return whether *entity* is in the filter."""
        return entity in self.bf
开发者ID:skihero,项目名称:commandos,代码行数:47,代码来源:KillSpam.py

示例10: BLOOMDupeFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class BLOOMDupeFilter(BaseDupeFilter):
    """Duplicate filter that remembers request URLs in a Bloom filter."""

    def __init__(self, path=None):
        """Create the filter; *path* is accepted but not used.

        The filter is always backed by the ``bloomTemp`` file.
        """
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')

    @classmethod
    def from_settings(cls, settings):
        """Build the filter, passing the job directory derived from *settings*."""
        job_directory = job_dir(settings)
        return cls(job_directory)

    def request_seen(self, request):
        """Return True for an already-seen URL; otherwise record it.

        A new URL is added to the filter and None is returned.
        """
        url = request.url
        if url not in self.fingerprints:
            self.fingerprints.add(url)
            return None
        return True

    def close(self, reason):
        """Drop the reference to the filter; *reason* is ignored."""
        self.fingerprints = None
开发者ID:yidun55,项目名称:specialworker,代码行数:21,代码来源:MyBloomFilter.py

示例11: dedup

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def dedup(fname, out_fname='deduped.tsv', column=5):
    """Copy a TSV file, dropping lines whose key column was seen before.

    A line is written to the output only the first time the MD5 digest of
    its key column is added to an in-memory Bloom filter.

    Args:
        fname: path of the tab-separated input file.
        out_fname: path of the output file (default ``'deduped.tsv'``, the
            name that used to be hard-coded).
        column: zero-based index of the column to de-duplicate on
            (default 5, the index that used to be hard-coded).

    Raises:
        IndexError: if a line has no column at index *column*.
    """
    # Imported here because this file has no top-level import block.
    # hashlib replaces the long-deprecated ``md5`` module, which no longer
    # exists on Python 3.
    import hashlib

    # Integer capacity: the original passed the float 1E8.
    bf = BloomFilter(100000000, 0.01)

    with open(fname, 'r') as fin, open(out_fname, 'w') as fout:
        for line in fin:
            description = line.split('\t')[column]
            # hashlib needs bytes; on Python 2 ``str`` already is bytes.
            if not isinstance(description, bytes):
                description = description.encode('utf-8')
            # ``add`` is truthy when the digest was already in the filter.
            if not bf.add(hashlib.md5(description).digest()):
                fout.write(line)
开发者ID:jisaacso,项目名称:team-thorn,代码行数:14,代码来源:deduper.py

示例12: LinkFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Link de-duplicator backed by the on-disk ``bloomfilter`` file."""

    def __init__(self):
        """Reopen the ``bloomfilter`` file if it exists, else create it."""
        if not os.path.exists('bloomfilter'):
            self.bloomfilter = BloomFilter(1000000, 0.01, 'bloomfilter')
        else:
            self.bloomfilter = BloomFilter.open('bloomfilter')

    def process(self, links):
        """Record every link's URL and return the links that were new."""
        # ``add`` is truthy when the URL was already in the filter.
        return [candidate for candidate in links
                if not self.bloomfilter.add(candidate.url)]
开发者ID:wangjie1991,项目名称:freesound,代码行数:15,代码来源:freesound_spider.py

示例13: DuplicatedFlowFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class DuplicatedFlowFilter(object):
    """Bloom-filter set of flows, keyed by each flow's method and URL."""

    def __init__(self):
        """Create the filter backed by the ``filter.bloom`` file."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    @staticmethod
    def _flow_key(flow):
        """Return the ``(method, url)`` pair that identifies *flow*."""
        return (flow[METHOD], flow[URL])

    def add(self, flow):
        """
        Add a flow to the filter.

        :param flow: the flow dict received from Proxy.
        :return: whatever the underlying filter's ``add`` returns for the
            flow's key (whether the flow was already in the filter).
        """
        return self.bf.add(self._flow_key(flow))

    def __contains__(self, flow):
        """Return whether the flow's key is in the filter."""
        return self.bf.__contains__(self._flow_key(flow))
开发者ID:MatthewShao,项目名称:Centaur,代码行数:18,代码来源:filter.py

示例14: LinkFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Link de-duplicator backed by a Bloom filter file named ``<name>.bf``."""

    def __init__(self, name):
        """Create the filter file for *name*."""
        self.name = name + ".bf"
        # The filter is always created here; reopening an existing file
        # was disabled in the original code.
        self.bf = BloomFilter(100000000, 0.01, self.name)

    def link_filter(self, links):
        """Record every link's URL and return the links that were new."""
        # ``add`` is truthy when the URL was already in the filter.
        return [candidate for candidate in links
                if not self.bf.add(candidate.url)]
开发者ID:wangjie1991,项目名称:data_collect,代码行数:20,代码来源:linkfilter.py

示例15: createBloomFilter

# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def createBloomFilter(contentFile, filterFilename, error_rate=1e-7):
    """Build a Bloom filter file from a list of domains, one per line.

    Each line of *contentFile* is stripped of trailing whitespace and added
    to a new filter stored in *filterFilename*. Newly added domains are
    printed as they are inserted, followed by a three-line summary.

    Args:
        contentFile: path of the text file holding one domain per line.
        filterFilename: path of the Bloom filter file to create.
        error_rate: target false-positive rate. The original hard-coded
            0.9999999, which looks like an accuracy figure and leaves the
            filter with almost no bits, so nearly every lookup is a false
            positive. NOTE(review): confirm the intended rate.
    """
    bf = BloomFilter(10000000, error_rate, filterFilename)
    total = 0
    added = 0
    conflicted = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()

            # ``add`` is truthy when the item was ALREADY in the filter.
            # The original counted that case as "Added", so its two
            # counters were swapped.
            if bf.add(d):
                conflicted += 1
            else:
                added += 1
                print(d)

    # Single-argument print() produces the same text as the original
    # Python 2 statements and also parses on Python 3.
    print("Total  %d" % total)
    print("Added  %d" % added)
    print("Conflicted %d" % conflicted)
开发者ID:LouisVN,项目名称:LR-Data,代码行数:21,代码来源:bloom.py


注:本文中的pybloomfilter.BloomFilter.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。