本文整理汇总了Python中pybloomfilter.BloomFilter类的典型用法代码示例。如果您正苦于以下问题:Python BloomFilter类的具体用法?Python BloomFilter怎么用?Python BloomFilter使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BloomFilter类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, path=FILTER_PATH, debug=False):
    """Open an existing persisted bloom filter at *path*, or create one.

    Args:
        path: filesystem location of the bloom filter file
            (defaults to the module-level FILTER_PATH).
        debug: passed through to the parent dupe-filter's __init__.
    """
    # BUG FIX: the original tested and opened the FILTER_PATH constant
    # even when a different `path` argument was supplied; honour the
    # parameter everywhere.
    if os.path.exists(path):
        self.url_filter = BloomFilter.open(path)
    else:
        print("created a new bloom filter. ")
        self.url_filter = BloomFilter(100000, 0.00001, path)
    super(DuplicateFilter, self).__init__(path, debug)
示例2: create_ref_bloom_filter
def create_ref_bloom_filter(reference_file, error_rate, bf_file,
                            format="fasta", read_len=109):
    """From a given FASTA/FASTQ reference sequence create a bloom filter
    file from fixed-length windows ("reads") of the first record.

    Args:
        reference_file: path to the reference sequence file.
        error_rate: desired false-positive rate of the bloom filter.
        bf_file: path where the bloom filter is persisted.
        format: "fasta" or "fastq".
        read_len: window length used to chop the reference (previously a
            hard-coded local with the same default of 109).

    Raises:
        ValueError: if *format* is neither "fasta" nor "fastq".
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)
    else:
        # BUG FIX: an unknown format previously fell through and crashed
        # later with UnboundLocalError; fail fast with a clear message.
        raise ValueError("unsupported format: %r" % (format,))
    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        read_it = record(file_it(handle))
        bf = BloomFilter(capacity, error_rate, bf_file)
        # next() instead of py2-only .next() (works on both versions).
        sequence = next(read_it)
        i = 0
        while i < len(sequence):
            # BUG FIX: the original sliced `i:i + read_len - 1`, silently
            # dropping the last base of every window while still stepping
            # by read_len.
            read = sequence[i:i + read_len]
            i += read_len
            print(read)
            # NOTE(review): BloomFilter.update() adds each element of the
            # iterable, i.e. individual characters of `read` — confirm
            # bf.add(read) (whole window) was not the intent.
            bf.update(read)
        bf.close()
示例3: LinkFilter
class LinkFilter():
    """Deduplicates crawled links using two persistent bloom filters,
    one for index pages and one for html pages (files are per-domain).
    """

    def __init__(self, domain):
        self.file_index = '%s_%s' % (domain, 'index.bf')
        self.file_html = '%s_%s' % (domain, 'html.bf')
        self.bf_index = self._open_or_create(self.file_index)
        self.bf_html = self._open_or_create(self.file_html)

    @staticmethod
    def _open_or_create(path):
        # Reuse a previously persisted filter when the file exists,
        # otherwise start a fresh one with the same fixed parameters.
        if os.path.exists(path):
            return BloomFilter.open(path)
        return BloomFilter(100000000, 0.001, path)

    def index_filter(self, links):
        # add() returns True when the url was already present, so only
        # first-seen links survive.
        return [link for link in links if not self.bf_index.add(link.url)]

    def html_filter(self, links):
        #log.msg('This is a link : %s' % link, level=log.WARNING)
        return [link for link in links if not self.bf_html.add(link.url)]
示例4: main
def main():
    """Convert a trace file of R/W records into hashed form.

    Usage: prog [trace file].  Each line starting with "R," or "W," is
    hashed; first-seen payloads emit their sha1-derived hash, repeats
    emit their md5-derived hash (tracked via two bloom filters).
    """
    # Check for command line arguments
    if len(sys.argv) != 2:
        print('Usage: %s [trace file]' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    # Read arguments from command line
    inFile = sys.argv[1]
    bf1 = BloomFilter(100000000, 0.001, 'bf1')
    bf2 = BloomFilter(100000000, 0.001, 'bf2')
    outputFileName = "converted-" + sys.argv[1]
    # BUG FIX: both files were opened without a guaranteed close;
    # `with` closes them even if processing a line raises.
    with open(inFile, 'r') as fin, open(outputFileName, "a") as f:
        for line in fin:
            if line[0:2] == "W," or line[0:2] == "R,":
                payload = line[2:]
                hash1 = int(hashlib.sha1(payload).hexdigest(), 16) % (10 ** 10)
                hash2 = int(hashlib.md5(payload).hexdigest(), 16) % (10 ** 10)
                # NOTE(review): `and` short-circuits, so bf2 is only
                # updated when bf1 already contained hash1 — confirm this
                # asymmetry is intended before relying on bf2's contents.
                if bf1.add(hash1) and bf2.add(hash2):
                    f.write('%s,%d\n' % (line[0], hash1 * 10000))
                else:
                    f.write('%s,%d\n' % (line[0], hash2 * 10000))
            elif line == '':
                # Never true while iterating a file object (lines keep
                # their '\n'); kept for parity with the original intent.
                break
示例5: __init__
def __init__(self, seeds, done_que, run_que):
    """Set up crawler state: counters, host black/allow lists, the
    gevent download pool, a persistent bloom filter of finished sites,
    and the href-extraction regex.

    Args:
        seeds: iterable of seed urls; each is expected to contain
            "http://" (the scheme is stripped before queueing).
        done_que: queue that receives finished downloads.
        run_que: queue of hosts still waiting to be crawled.
    """
    self.showpercounts = 10   # progress-report interval (pages)
    self.timeout = 5          # per-request timeout, seconds
    self.starttime = time.time()
    self.oldtime = 0
    self.quit = 0
    self.https_enable = 0     # 0 => only follow plain http:// links
    self.run_que = run_que
    self.done_que = done_que
    self.tasks = []
    self.done = 1
    self.errdone = set()
    self.err = Error()
    # Restore any previously saved crawler state before seeding.
    self.loadstate()
    # Hosts never crawled (substring match presumed — TODO confirm
    # against the code that consults this set).
    self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
        'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' ))
    # Top-level domains the crawler is willing to follow.
    self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))
    self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl
    self.poolsize = 60        # gevent pool size for concurrent downloads
    self.poolmaxfree = 20     # refill threshold: spawn when this many slots are free
    self.freecount = 0
    self.down_pool = Pool(size=self.poolsize)
    self.totalnettime = 0
    self.cbcputime = 0
    self.totaldownsize = 0
    self.curspeed = 0
    self.debugnosave = 1
    self.tt = 1
    self.done_sites_fname='done_sites.bin'
    # Persistent record of already-crawled sites: re-open the saved
    # filter, or create a new one (2**23 capacity, 1e-5 error rate).
    try:
        self.bfdone = BloomFilter.open(self.done_sites_fname)
    except:
        self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M
    if self.run_que.qsize() == 0:
        # Fresh start: seed the run queue with scheme-stripped hosts.
        for seed in seeds:
            self.run_que.put( seed.split("http://")[1] )
    # Extract the host part of href targets; the https variant is used
    # only when https_enable is set.
    if self.https_enable == 0:
        self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
    else:
        self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)
示例6: __init__
def __init__(self):
    """Wire up the spider's helpers: a MySQL handle, module shortcuts,
    and a persistent bloom filter for url de-duplication."""
    self.mysql = mysql.Mysql()
    # Frequently used modules kept as attributes for convenient access.
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # Deduplicate with a bloom filter persisted on disk; reuse the saved
    # file when present, otherwise create a fresh filter.
    bloom_path = "new_filter.bloom"
    self.bf = (BloomFilter.open(bloom_path)
               if os.path.isfile(bloom_path)
               else BloomFilter(10000000, 0.01, bloom_path))
示例7: __init__
def __init__(self, node_n, seen_persist, Q_logs=None):
    """Initialise one crawler node's queues and seen-url bloom filter.

    Args:
        node_n: this node's index among all crawler nodes.
        seen_persist: when True, try to re-open the persisted seen-url
            bloom filter instead of starting with an empty one.
        Q_logs: optional queue for log messages (may be None).
    """
    self.node_n = node_n
    self.Q_logs = Q_logs
    self.total_crawled = 0
    self.payloads_dropped = 0
    # single variable for tracking whether node should be active or not
    self.active = True
    # crawl task Queue
    # Priority Queue ~ [ (next_pull_time, host_addr, url, parent_page_stats, seed_dist, parent_url) ]
    self.Q_crawl_tasks = Queue.PriorityQueue()
    # host queue dict
    # { host_addr: [(url, ref_page_stats, seed_dist, parent_url), ...] }
    self.hqs = {}
    # seen url check
    # Bloom Filter ~ [ url ]
    if seen_persist:
        try:
            self.seen = BloomFilter.open(BF_FILENAME)
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; catch Exception instead.
        except Exception:
            # BUG FIX: Q_logs defaults to None, which previously crashed
            # this recovery path with AttributeError.
            if self.Q_logs is not None:
                self.Q_logs.put('Error opening bloom filter, creating new one')
            self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    else:
        self.seen = BloomFilter(BF_CAPACITY, BF_ERROR_RATE, BF_FILENAME)
    # DNS Cache
    # { netloc: (host_addr, time_last_checked) }
    self.DNScache = {}
    # overflow url Queue
    # Queue ~ [ (host_addr, url, ref_page_stats, seen_dist, parent_url) ]
    self.Q_overflow_urls = Queue.Queue()
    # host queue cleanup Queue
    # Priority Queue ~ [ (time_to_delete, host_addr) ]
    self.Q_hq_cleanup = Queue.PriorityQueue()
    # active url count queue- for counting/tracking active
    # Queue ~ [ True ]
    self.Q_active_count = Queue.Queue()
    # thread active url dict- a dict of active urls by thread using, for restart dump
    # { thread_name: active_url }
    # NOTE: note that there are problems with this methodology, but that errors will only lead
    # to data redundancy (as opposed to omission)...
    self.thread_active = {}
    # Queue of messages to be sent to other nodes
    # Queue ~ [ (node_num_to, url, seed_dist, parent_page_stats) ]
    self.Q_to_other_nodes = Queue.Queue()
示例8: dedup
def dedup(fname):
    """Copy *fname* to deduped.tsv, dropping lines whose 6th
    tab-separated field (the description) was already seen.

    Args:
        fname: path of the tab-separated input file.
    """
    # BUG FIX: the capacity must be an integer; 1E8 is a float literal.
    bf = BloomFilter(int(1e8), 0.01)
    with open(fname, 'r') as fin:
        with open('deduped.tsv', 'w') as fout:
            for line in fin:
                description = line.split('\t')[5]
                # add() returns True when the digest was already present;
                # write only first occurrences.
                if not bf.add(md5.new(description).digest()):
                    fout.write(line)
示例9: create_bf
def create_bf():
    """Build filter_base.bloom from the first *count* key digests stored
    as fixed-width (*keyDigestLen*-byte) records in *keyDigestFile*.

    Relies on module-level names: count, error_rate, keyDigestFile,
    keyDigestLen.
    """
    bf = BloomFilter(count, error_rate, 'filter_base.bloom')
    # BUG FIX: the input file was opened without a guaranteed close and
    # all digests were buffered in a throwaway list; stream them instead
    # (same add order as before).  Also avoids shadowing any FILE name.
    with open(keyDigestFile, 'r') as digest_file:
        for _ in range(count):
            bf.add(digest_file.read(keyDigestLen))
示例10: __init__
def __init__(self, domain):
    """Create or re-open the per-domain bloom filters for index and
    html pages."""
    def _open_or_create(path):
        # Reuse the persisted filter when its file already exists.
        if os.path.exists(path):
            return BloomFilter.open(path)
        return BloomFilter(100000000, 0.001, path)

    self.file_index = '%s_%s' % (domain, 'index.bf')
    self.file_html = '%s_%s' % (domain, 'html.bf')
    self.bf_index = _open_or_create(self.file_index)
    self.bf_html = _open_or_create(self.file_html)
示例11: __init__
def __init__(self, start_url, basic_url):
    """Store crawl entry points and shared helpers, and open the
    persistent bloom filter used for url de-duplication."""
    self.basic_url = basic_url
    self.start_url = start_url
    self.mysql = mysql.Mysql()
    # Module shortcuts kept on the instance for convenience.
    self.re = re
    self.time = time
    self.datetime = datetime
    self.requests = requests
    # Deduplicate via a bloom filter persisted in filter.bloom; the
    # on-disk state is reloaded on every start when it exists.
    if not os.path.isfile('filter.bloom'):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    else:
        self.bf = BloomFilter.open('filter.bloom')
示例12: __init__
def __init__(self):
    """Open (or create) the bloom filter described by the application's
    bloom-filter configuration."""
    conf = config.get_boolmfilter_config()
    bin_path = conf['bin_path']
    # Re-open the persisted filter when the file exists; otherwise
    # create it with the configured capacity and error rate.
    if os.path.exists(bin_path):
        self.bloomfilter = BloomFilter.open(bin_path)
    else:
        self.bloomfilter = BloomFilter(
            conf['capacity'], conf['wrong_rate'], bin_path)
示例13: __init__
def __init__(self, settings, debug = False):
    """Dupe-filter backed by a file-persisted bloom filter.

    Args:
        settings: settings object providing DUPEFILTER_CAPACITY (int)
            and DUPEFILTER_FILENAME (str).
        debug: verbose-logging flag stored for later use.
    """
    self.debug = debug
    self.error_rate = 0.01  # fixed false-positive rate
    self.logger = logging.getLogger(__name__)
    self.capacity = settings.getint("DUPEFILTER_CAPACITY")
    self.filename = settings.get("DUPEFILTER_FILENAME")
    self.bloom_filter_ = BloomFilter(
        self.capacity, self.error_rate, self.filename)
示例14: __init__
def __init__(self):
    """Set up the pipeline: a bloom filter for visited urls, the
    visited-sites log file, the search index, and a freshly rebuilt
    POPULAR table in MySQL."""
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    self.f_write = open('visitedsites','w')
    self.si = SearchIndex()
    self.si.SearchInit()
    self.count_num = 0
    # NOTE(review): database credentials are hard-coded (root, empty
    # password) — consider moving to configuration.
    self.db = MySQLdb.connect("localhost","root","","storecount")
    self.cursor = self.db.cursor()
    # The POPULAR table is dropped and rebuilt from scratch on every start.
    self.cursor.execute("DROP TABLE IF EXISTS POPULAR")
    sql1 = """CREATE TABLE POPULAR(URL text(512),COUNT_MARK INT);"""
    try:
        self.cursor.execute(sql1)
        self.db.commit()
        # print "cao create"
    except:
        # Table creation failed; log the traceback and undo the transaction.
        traceback.print_exc()
        self.db.rollback()
    # self.dbpool = adbapi.ConnectionPool('MySQLdb',
    #                host = '127.0.0.1',
    #                db = 'storecount',
    #                user = 'root',
    #                passwd = '',
    #                cursorclass = MySQLdb.cursors.DictCursor,
    #                charset = 'utf8',
    #                use_unicode = True)
    self.mark = 0
示例15: DuplicatesPipeline
class DuplicatesPipeline(object):
    """Pipeline that drops items whose url has already been seen.

    Seen urls live in a persistent bloom filter; first-seen items are
    appended to the 'visitedsites' log and pushed to the search index.
    """

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print('************%d pages visited!*****************' %len(self.bf))
        # add() returns True when the url was already in the filter.
        if not self.bf.add(item['url']):
            #print '%d pages visited!'% len(self.url_seen)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            return item
        raise DropItem("Duplicate item found: %s" % item)

    def save_to_file(self,url,utitle):
        """Append one 'url<TAB>title' line to the visited-sites log."""
        for piece in (url, '\t', utitle.encode('utf-8'), '\n'):
            self.f_write.write(piece)

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()