This article collects and summarizes typical usage examples of the pybloomfilter.BloomFilter.sync method in Python. If you have been wondering what exactly Python's BloomFilter.sync does, how to use it, or where to find examples of it in practice, then congratulations: the curated method examples below may be of help. You can also explore further usage examples of the containing class, pybloomfilter.BloomFilter.
Below, 3 code examples of the BloomFilter.sync method are presented, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
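As background for the examples, pybloomfilter's BloomFilter is backed by a memory-mapped file, and sync() flushes those pages to disk so the filter can be reopened later. Here is a minimal sketch of that lifecycle; the filename, capacity, and error rate are illustrative values, not taken from the examples below.
# Minimal sketch: create a file-backed filter, add an item, persist it.
from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01, 'demo.bloom')  # capacity, error rate, backing file
bf.add('http://example.com/')                 # insert an element
assert 'http://example.com/' in bf            # probabilistic membership test
bf.sync()                                     # flush the mmap'd pages to disk
reopened = BloomFilter.open('demo.bloom')     # classmethod: reopen the persisted filter
assert 'http://example.com/' in reopened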
Example 1: URIBloomFilter
# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import sync [as alias]
import logging

from pybloomfilter import BloomFilter
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class URIBloomFilter(BaseDupeFilter):
    """Scrapy dupefilter backed by a file-based Bloom filter."""

    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if self.check(fp):
            return True
        self.insert(fp)
        return False

    # ------- todo -------
    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def check(self, fp):
        return fp in self.bloom_filter_

    def insert(self, fp):
        self.bloom_filter_.add(fp)
        # Debug introspection, e.g.:
        # print(len(self.bloom_filter_))
        # print(self.bloom_filter_.hash_seeds)
        # print(self.bloom_filter_.num_bits)
        # print(self.bloom_filter_.num_hashes)

    def reset(self):
        self.bloom_filter_.clear_all()

    def save(self):
        pass

    def load(self):
        # sync() flushes the current filter to disk before reopening;
        # BloomFilter.open is a classmethod, so its result must be bound.
        self.bloom_filter_.sync()
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
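For context, a dupefilter like this is typically wired into Scrapy through its settings: DUPEFILTER_CLASS is Scrapy's standard hook, while DUPEFILTER_CAPACITY and DUPEFILTER_FILENAME are the custom keys this example reads. The module path and values below are illustrative assumptions, not part of the original example.
# settings.py (illustrative values; the module path is hypothetical)
DUPEFILTER_CLASS = 'myproject.dupefilters.URIBloomFilter'
DUPEFILTER_CAPACITY = 10000000         # expected number of distinct requests
DUPEFILTER_FILENAME = 'requests.bloom' # backing file for the Bloom filter
DUPEFILTER_DEBUG = False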
Example 2: process
# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import sync [as alias]
import fileinput
import os

from pybloomfilter import BloomFilter

# Module-level settings the snippet assumes (values illustrative):
bloomfile = 'uniques.bloom'  # backing file for the filter
MAXUNIQUES = 10000000        # filter capacity
ACCUACY = 0.01               # error rate (identifier kept as in the original)

def process(files):
    # Iterate over the lines of all files in `files` (e.g. sys.argv[1:]),
    # defaulting to sys.stdin if the list is empty; a filename of '-' is
    # also replaced by sys.stdin.
    # Reuse the on-disk filter if it exists, otherwise create it.
    if os.path.isfile(bloomfile):
        UNIQUES = BloomFilter.open(bloomfile)
    else:
        UNIQUES = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)
    for record in fileinput.input(files):
        record = record.strip()
        if record not in UNIQUES:
            UNIQUES.add(record)
            print(record)
    UNIQUES.sync()   # flush the mmap'd filter to disk
    UNIQUES.close()
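A minimal way to invoke this function, following the sys.argv convention described in its comment (this driver is an assumption, not part of the original example):
import sys

if __name__ == '__main__':
    process(sys.argv[1:])  # e.g. `python dedupe.py a.log b.log`, or pipe lines via stdin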
Example 3: __init__
# Required import: from pybloomfilter import BloomFilter [as alias]
# Or: from pybloomfilter.BloomFilter import sync [as alias]
#......... part of the code omitted here .........
        # (continuation of an initialization loop; earlier code omitted)
        else:
            self.hqs[i] = []
            self.Q_hq_cleanup.put((now, i))

        # if there are urls left over, add them to the appropriate queues
        for url in urls:
            self._init_add_url(url)

    # Subroutine for adding a url to an hq, assuming only one thread is
    # running (initialization).
    def _init_add_url(self, url_in):
        # basic cleaning operations on the url
        url = re.sub(r'/$', '', url_in)

        # assume unseen: record in the seen filter
        self.seen.add(url)

        # BLOCK certain urls based on the manual block regex
        if re.search(BLOCK_URL_RGX, url) is not None:
            return False

        # get the host IP address of the url
        url_parts = urlparse.urlsplit(url)
        host_addr = self._get_and_log_addr(url_parts.netloc)

        # if the page is not of a safe type, log and do not proceed
        if re.search(SAFE_PATH_RGX, url_parts.path) is None:
            if DEBUG_MODE:
                self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url,))
            return False

        # if DNS resolution failed, the error was already reported; stop here
        if host_addr is None:
            return False

        # if the page belongs to another node, pass it to the message-sending service
        if DISTR_ON_FULL_URL:
            url_node = hash(url) % NUMBER_OF_NODES
        else:
            url_node = hash(host_addr) % NUMBER_OF_NODES
        if url_node != self.node_n:
            self.Q_to_other_nodes.put((url_node, url, None, 0, None))
            return False

        # add to an existing hq, create a new one (logging a new crawl task),
        # or add to the overflow queue
        self.Q_active_count.put(True)
        self.total_crawled += 1
        if DEBUG_MODE:
            self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())
        if host_addr in self.hqs:
            self.hqs[host_addr].append((url, None, 0, None))
        elif len(self.hqs) < HQ_TO_THREAD_RATIO * NUMBER_OF_CTHREADS:
            self.hqs[host_addr] = []
            self.Q_crawl_tasks.put((datetime.datetime.now(), host_addr, url, None, 0, None))
        else:
            self.Q_overflow_urls.put((host_addr, url, None, 0, None))

    # Routine called on abort (user interrupt or MAX_CRAWLED being reached):
    # save the current contents of all queues to disk and flush the seen
    # filter so the crawl can restart.
    def dump_for_restart(self):
        # ensure the url frontier is deactivated
        self.active = False

        # collect all urls from Q_crawl_tasks, the hqs, and Q_overflow_urls;
        # only the urls are kept, since they will be re-injected through the
        # initialize method of the url frontier
        with open(RESTART_DUMP, 'w') as f:
            for thread_name, url in self.thread_active.items():
                if url is not None:
                    f.write(url + '\n')
            while not self.Q_crawl_tasks.empty():
                try:
                    r = self.Q_crawl_tasks.get(True, 1)
                    f.write(r[2] + '\n')
                except Exception:
                    continue
            for host_addr, paths in self.hqs.items():
                for path in paths:
                    f.write(path[0] + '\n')
            while not self.Q_to_other_nodes.empty():
                try:
                    r = self.Q_to_other_nodes.get(True, 1)
                    f.write(r[1] + '\n')
                except Exception:
                    continue
            while not self.Q_overflow_urls.empty():
                try:
                    r = self.Q_overflow_urls.get(True, 1)
                    f.write(r[1] + '\n')
                except Exception:
                    continue

        # ensure the seen-filter file is synced to disk
        self.seen.sync()
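dump_for_restart relies on sync() persisting the memory-mapped seen filter, so that a restarted crawler skips urls it already visited. A small sketch of that restart round-trip, with assumed names and values (seen.bloom, the capacity, and the urls are illustrative):
from pybloomfilter import BloomFilter

# Before shutdown: record visited urls, then flush the filter to disk.
seen = BloomFilter(1000000, 0.01, 'seen.bloom')
seen.add('http://example.com/a')
seen.sync()

# After restart: reopen the persisted filter and skip anything already seen.
seen = BloomFilter.open('seen.bloom')
for url in ('http://example.com/a', 'http://example.com/b'):
    if url not in seen:
        print('crawl:', url)  # only /b would be crawled again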