本文整理汇总了Python中pybloomfilter.BloomFilter.add方法的典型用法代码示例。如果您正苦于以下问题：Python BloomFilter.add方法的具体用法？Python BloomFilter.add怎么用？Python BloomFilter.add使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pybloomfilter.BloomFilter的用法示例。
在下文中一共展示了BloomFilter.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: LinkFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Drop links whose URLs were already recorded for a domain.

    Two file-backed Bloom filters are kept per domain: one consulted by
    ``index_filter`` and one consulted by ``html_filter``.
    """

    def __init__(self, domain):
        """Open both filter files for ``domain``, creating any that is missing."""
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')
        # Reuse an existing filter file when present so state survives restarts.
        self.bf_index = (
            BloomFilter.open(self.file_index)
            if os.path.exists(self.file_index)
            else BloomFilter(100000000, 0.001, self.file_index)
        )
        self.bf_html = (
            BloomFilter.open(self.file_html)
            if os.path.exists(self.file_html)
            else BloomFilter(100000000, 0.001, self.file_html)
        )

    def index_filter(self, links):
        """Return the links not yet seen by the index filter, recording them.

        ``add`` is called for every link; a falsy result means the URL was
        not in the filter before the call, so the link is kept.
        """
        return [entry for entry in links if not self.bf_index.add(entry.url)]

    def html_filter(self, links):
        """Return the links not yet seen by the html filter, recording them."""
        return [entry for entry in links if not self.bf_html.add(entry.url)]
示例2: main
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def main():
    """Convert the trace file named on the command line.

    Every line starting with ``W,`` or ``R,`` has the text after that
    prefix hashed with SHA-1 and MD5 (each reduced modulo 10**10). One
    ``<op>,<hash * 10000>`` line is appended to ``converted-<trace file>``:
    the SHA-1 based value when both ``bf1.add(hash1)`` and
    ``bf2.add(hash2)`` are truthy, the MD5 based value otherwise.
    All other lines are ignored.

    Exits with status 1 after printing a usage message unless exactly one
    argument is supplied.
    """
    # Check for command line arguments
    if len(sys.argv) != 2:
        print('Usage: %s [trace file]' % os.path.basename(sys.argv[0]))
        sys.exit(1)

    # Read arguments from command line
    inFile = sys.argv[1]
    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')
    outputFileName = "converted-" + inFile

    # Context managers close both files even when hashing or writing fails;
    # previously the input handle was never closed explicitly and the output
    # handle leaked on any exception.
    with open(outputFileName, "a") as out, open(inFile, 'r') as trace:
        for line in trace:
            # File iteration never yields an empty string, so the former
            # ``line == ''`` early exit was unreachable and has been removed.
            if not line.startswith(("W,", "R,")):
                continue
            payload = line[2:]
            hash1 = int(hashlib.sha1(payload).hexdigest(), 16) % (10 ** 10)
            hash2 = int(hashlib.md5(payload).hexdigest(), 16) % (10 ** 10)
            # ``and`` short-circuits: bf2 is only updated when bf1.add()
            # returned a truthy value.
            if bf1.add(hash1) and bf2.add(hash2):
                out.write('%s,%d\n' % (line[0], hash1 * 10000))
            else:
                out.write('%s,%d\n' % (line[0], hash2 * 10000))
示例3: URIBloomFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class URIBloomFilter(BaseDupeFilter):
    """Duplicate-request filter that stores request fingerprints in a Bloom filter."""

    def __init__(self, settings, debug = False):
        """Create the Bloom filter from the DUPEFILTER_* settings.

        :param settings: object providing ``getint``/``get`` for
            ``DUPEFILTER_CAPACITY`` and ``DUPEFILTER_FILENAME``.
        :param debug: when true, ``log`` reports every filtered request.
        """
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        # ``log`` reads and clears this flag; it was never initialised, so the
        # first non-debug call to ``log`` raised AttributeError.
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor reading ``DUPEFILTER_DEBUG`` from ``settings``."""
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        """Return True if the request's fingerprint is already stored.

        Otherwise store the fingerprint and return False (previously an
        implicit ``None``, which is equally falsy for callers).
        """
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        self.insert(fp)
        return False

    # TODO: left over from the original source ("todo" marker, no details given).
    def request_fingerprint(self, request):
        """Delegate to the module-level ``request_fingerprint`` helper."""
        return request_fingerprint(request)

    def check(self, request):
        """Return whether ``request`` (a fingerprint) is in the Bloom filter."""
        return request in self.bloom_filter_

    def insert(self, request):
        """Add ``request`` (a fingerprint) to the Bloom filter."""
        self.bloom_filter_.add(request)

    def reset(self):
        """Clear every entry from the Bloom filter."""
        self.bloom_filter_.clear_all()

    def save(self):
        """No-op placeholder."""
        pass

    def load(self):
        """Flush the current filter, then switch to the one in ``bloom.dump``."""
        self.bloom_filter_.sync()
        # ``open`` is a classmethod that returns a new filter; the original
        # code discarded that return value, so loading had no effect.
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        """Log a filtered duplicate and bump the ``dupefilter/filtered`` stat.

        In debug mode every duplicate is logged; otherwise only the first
        one is, after which ``logdupes`` is switched off.
        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
示例4: create_bf
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def create_bf():
    """Populate ``filter_base.bloom`` with the digests read from ``keyDigestFile``.

    Reads ``count`` consecutive chunks of ``keyDigestLen`` characters from
    the file and adds each chunk to a new Bloom filter created with
    ``count`` and ``error_rate``. All four names come from module scope.
    """
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    # ``with`` closes the digest file even if a read raises; the original
    # only closed it on the success path.
    with open(keyDigestFile, 'r') as digest_file:
        keyDigest_list = [digest_file.read(keyDigestLen) for _ in range(count)]
    for publicKeyID in keyDigest_list:
        bf.add(publicKeyID)
示例5: process
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def process(files):
    """Print each stripped input record the first time it is encountered.

    Records are read with ``fileinput.input(files)``. Seen records are
    tracked in the Bloom filter stored at ``bloomfile``, which is opened
    when the file exists and otherwise created with ``MAXUNIQUES`` and
    ``ACCUACY`` (all three are module-level names). The filter is synced
    and closed once the input is exhausted.
    """
    seen = (
        BloomFilter.open(bloomfile)
        if os.path.isfile(bloomfile)
        else BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)
    )
    for raw in fileinput.input(files):
        text = str(raw).strip()
        if text in seen:
            continue
        seen.add(text)
        print(text)
    seen.sync()
    seen.close()
示例6: MongoDBPipeline
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class MongoDBPipeline(object):
    """Item pipeline that de-duplicates by link, stores items in MongoDB and indexes them."""

    def __init__(self):
        """Connect to MongoDB using ``settings`` and set up the filter and search index."""
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Store and index ``item`` unless it is a duplicate or has an empty field.

        :raises DropItem: when the link is already in the Bloom filter, or
            when any field of the item holds a falsy value.
        :return: the item, after it was upserted and indexed.
        """
        if self.bf.add(item['link']):  # True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        # Iterating an item yields its field names, which are never empty, so
        # the original ``if not data`` check could not fire. Test the values.
        for field in item:
            if not item[field]:
                raise DropItem("Missing data!")
        self.collection.update({'link': item['link']}, dict(item), upsert=True)
        log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
        self.si.AddIndex(item)
        return item

    def __del__(self):
        """Finalise the search index if it was created."""
        # ``si`` is missing when __init__ failed before reaching it; guard so
        # the finaliser does not raise a secondary AttributeError.
        si = getattr(self, 'si', None)
        if si is not None:
            si.IndexDone()
示例7: initdb
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class URLBloomFilter:
    """Record new URLs in a database, using a Bloom filter to skip known ones.

    Call ``initdb``, ``initfilter`` and ``initsql`` before ``add``.
    """

    # Connection wrapper and cursor, set by initdb().
    dbconn = None
    cur = None
    # Bloom filter of URLs, set by initfilter().
    urlbf = None
    # Statement executed for each new URL, set by initsql().
    sql = None

    def initdb(self, host = 'localhost', user = 'muye', passwd = 'muye', db = 'muye', port = 3306, charset = 'utf8'):
        """Connect to the database and keep a cursor.

        ``port`` and ``charset`` are accepted but not forwarded to
        ``connect``.
        """
        self.dbconn = link = MySQLConnection.MySQLConn()
        link.connect(m_host = host, m_user = user, m_passwd = passwd, m_db = db)
        self.cur = link.cursor()

    def initfilter(self, filename = './url.filter'):
        """Open the filter stored at ``filename``, or create it when absent."""
        self.urlbf = (
            BloomFilter.open(filename)
            if os.path.isfile(filename)
            else BloomFilter(10000000, 0.001, filename)
        )

    def initsql(self, m_sql):
        """Remember the statement that ``add`` executes for each new URL."""
        self.sql = m_sql

    def add(self, url):
        """Record ``url``; return True if it was new, False otherwise.

        For a new URL the stored statement is executed with ``url`` as its
        argument.
        """
        already_known = self.urlbf.add(url)
        if already_known:
            return False
        self.cur.execute(self.sql, url)
        return True

    def close(self):
        """Close the database connection."""
        self.dbconn.close()
示例8: DuplicatesPipeline
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class DuplicatesPipeline(object):
    """Item pipeline that drops already-visited URLs and logs new ones to a file."""

    def __init__(self):
        """Create the Bloom filter, open ``visitedsites`` for writing, start the index."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        """Pass ``item`` through unless its URL is already in the filter.

        Prints the current filter size, then either raises ``DropItem`` for
        a duplicate URL or writes the URL/title to the log file, indexes
        the item and returns it.
        """
        visited = len(self.bf)
        print('************%d pages visited!*****************' % visited)
        url = item['url']
        if self.bf.add(url):  # True if item in the BF
            raise DropItem("Duplicate item found: %s" % item)
        self.save_to_file(url, item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self,url,utitle):
        """Append ``url<TAB>title<NEWLINE>`` to the log, title encoded as UTF-8."""
        emit = self.f_write.write
        # Pieces are written one by one, never concatenated, so the raw URL
        # and the encoded title are not mixed in a single string.
        emit(url)
        emit('\t')
        emit(utitle.encode('utf-8'))
        emit('\n')

    def __del__(self):
        """Close the log file and finalise the search index."""
        self.f_write.close()
        self.si.IndexDone()
示例9: SpamCheck
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class SpamCheck (object):
    """Spam lookup backed by ``filter.bloom`` and the ``bad_numbers.txt`` list."""

    def __init__(self):
        """Configure logging, then load the filter or build it if loading fails."""
        # Setup the logging
        self.ilog = logging.getLogger('prog')
        self.ilog.setLevel(logging.INFO)
        self.console = logging.StreamHandler(sys.stderr)
        self.console.setLevel(logging.INFO)
        self.console.setFormatter(logging.Formatter('%(message)s'))
        self.ilog.addHandler(self.console)
        # Try loading the filter
        try:
            self.__loadFilter__()
        # Create the filter if not present
        except Exception:
            self.ilog.debug("Exception in loading ....")
            self.__create__()
            self.ilog.debug("Creating the file ... ")
        else:
            # The original called the undefined name ``ilog`` inside the try
            # block; the NameError was swallowed by a bare ``except`` and the
            # filter was rebuilt even after a successful load.
            self.ilog.debug("loading filter..")

    def __loadFilter__(self):
        """Open the existing ``filter.bloom`` file."""
        self.bf = BloomFilter.open('filter.bloom')

    def __create__(self):
        """Create ``filter.bloom`` and fill it from ``bad_numbers.txt``."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        # Hack kept from the original: registering "000" first ensures
        # bad_numbers.txt exists before it is opened for reading below.
        self.spam("000")
        # Generate the filter from a file
        with open("bad_numbers.txt") as f:
            for nums in f:
                self.bf.add(nums.rstrip())
                self.ilog.debug(".")

    def spam(self, bad_entity):
        """Append ``bad_entity`` to ``bad_numbers.txt`` and add it to the filter."""
        with open("bad_numbers.txt", "a+") as f:
            f.write(bad_entity)
            f.write("\n")
            self.ilog.info("Added bad entry to file")
            self.bf.add(bad_entity)

    def isSpam(self, entity):
        """Return whether ``entity`` is in the filter."""
        return entity in self.bf
示例10: BLOOMDupeFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class BLOOMDupeFilter(BaseDupeFilter):
    """Duplicate filter keyed on request URLs, stored in a Bloom filter."""

    def __init__(self, path=None):
        """Create the ``bloomTemp`` filter; ``path`` is accepted but not used."""
        self.file = None
        self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor passing ``job_dir(settings)`` as ``path``."""
        return cls(job_dir(settings))

    def request_seen(self, request):
        """Return True for a known URL; otherwise record it and return None."""
        url = request.url
        if url not in self.fingerprints:
            self.fingerprints.add(url)
            return None
        return True

    def close(self, reason):
        """Drop the reference to the Bloom filter."""
        self.fingerprints = None
示例11: dedup
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def dedup(fname):
    """Copy ``fname`` to ``deduped.tsv``, dropping rows with a repeated description.

    Each line is split on tabs and the sixth column is taken as the
    description. A line is written only when the MD5 digest of its
    description was not yet in the Bloom filter.

    :param fname: path of the tab-separated input file.
    :raises IndexError: if a line has fewer than six tab-separated columns.
    """
    # Function-scope import: ``hashlib.md5`` replaces the deprecated ``md5``
    # module (removed in Python 3) and produces the same digest.
    import hashlib

    # The capacity is passed as an int rather than the float literal 1E8.
    bf = BloomFilter(int(1E8), 0.01)
    with open(fname, 'r') as fin, open('deduped.tsv', 'w') as fout:
        for line in fin:
            description = line.split('\t')[5]
            # add() is truthy when the digest was already present.
            if not bf.add(hashlib.md5(description).digest()):
                fout.write(line)
示例12: LinkFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Filter out links whose URLs were already recorded in ``bloomfilter``."""

    def __init__(self):
        """Open the ``bloomfilter`` file, creating it when it does not exist."""
        self.bloomfilter = (
            BloomFilter.open('bloomfilter')
            if os.path.exists('bloomfilter')
            else BloomFilter(1000000, 0.01, 'bloomfilter')
        )

    def process(self, links):
        """Return the links not seen before, recording each URL as a side effect."""
        return [entry for entry in links if not self.bloomfilter.add(entry.url)]
示例13: DuplicatedFlowFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class DuplicatedFlowFilter(object):
    """Bloom filter over flows, keyed by each flow's method and URL."""

    def __init__(self):
        """Create the ``filter.bloom`` Bloom filter."""
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')

    def _key(self, flow):
        """Build the ``(method, url)`` tuple used as the filter key."""
        return (flow[METHOD], flow[URL])

    def add(self, flow):
        """
        :param flow: the flow dict received from Proxy.
        :return: if the flow already in the filter.
        """
        return self.bf.add(self._key(flow))

    def __contains__(self, flow):
        """Return whether the flow's key is in the filter."""
        return self.bf.__contains__(self._key(flow))
示例14: LinkFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
class LinkFilter():
    """Filter out links whose URLs were already recorded in ``<name>.bf``."""

    def __init__(self, name):
        """Create a new Bloom filter in ``<name>.bf``.

        The filter is always created; an existing file is not reopened.
        """
        self.name = name + ".bf"
        self.bf = BloomFilter(100000000, 0.01, self.name)

    def link_filter(self, links):
        """Return the links not seen before, recording each URL as a side effect."""
        return [entry for entry in links if not self.bf.add(entry.url)]
示例15: createBloomFilter
# 需要导入模块: from pybloomfilter import BloomFilter [as 别名]
# 或者: from pybloomfilter.BloomFilter import add [as 别名]
def createBloomFilter(contentFile, filterFilename):
    """Fill a new Bloom filter with the lines of ``contentFile`` and print totals.

    Each line is right-stripped and added to the filter stored at
    ``filterFilename``. Entries that were new to the filter are printed
    and counted as "Added"; entries the filter reported as already
    present are counted as "Conflicted".

    :param contentFile: path of the text file with one entry per line.
    :param filterFilename: path of the Bloom filter file to create.
    """
    # NOTE(review): an error rate of 0.9999999 makes nearly every lookup a
    # false positive; confirm this value is intentional.
    bf = BloomFilter(10000000, 0.9999999, filterFilename)
    total = 0
    count = 0
    failed = 0
    with open(contentFile, "r") as f:
        for domain in f:
            total += 1
            d = domain.rstrip()
            # add() returns True when the entry was ALREADY present. The
            # original treated True as "added", which swapped the counters.
            if bf.add(d):
                failed += 1
            else:
                count += 1
                print(d)
    # Single-argument print keeps the original spacing of the summary lines.
    print("Total  %d" % total)
    print("Added  %d" % count)
    print("Conflicted %d" % failed)