

Python BloomFilter.sync Method Code Examples

This article collects typical usage examples of the pybloomfilter.BloomFilter.sync method in Python. If you are wondering what BloomFilter.sync does, how to call it, or what real-world uses look like, the selected examples below may help. You can also explore further usage examples of the containing class, pybloomfilter.BloomFilter.


Three code examples of the BloomFilter.sync method are shown below, taken from open-source projects and ordered by popularity.
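
Before looking at the project examples, here is a minimal, self-contained sketch of the typical workflow; the filename and the capacity/error-rate values are illustrative rather than taken from any of the projects below. A file-backed filter is created, items are added, sync() flushes the memory-mapped data to its backing file, and BloomFilter.open() later reopens that file.

from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01, "seen.bloom")  # capacity, error rate, backing file
bf.add("https://example.com/")
bf.sync()  # flush the memory-mapped filter to disk

# later, e.g. in another process or after a restart, reopen the synced file
bf2 = BloomFilter.open("seen.bloom")
assert "https://example.com/" in bf2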

Example 1: URIBloomFilter

# Required import: from pybloomfilter import BloomFilter
# Method demonstrated: BloomFilter.sync
import logging

from pybloomfilter import BloomFilter
# Scrapy base class and request-fingerprint helper (assumed import paths) used below
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class URIBloomFilter(BaseDupeFilter):
    def __init__(self, settings, debug=False):
        self.capacity = settings.getint("DUPEFILTER_CAPACITY")
        self.filename = settings.get("DUPEFILTER_FILENAME")
        self.debug = debug
        self.logdupes = True
        self.error_rate = 0.01
        self.logger = logging.getLogger(__name__)
        # file-backed Bloom filter: capacity, error rate, mmap backing file
        self.bloom_filter_ = BloomFilter(self.capacity, self.error_rate, self.filename)

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(settings, debug)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # returning True tells Scrapy to drop the request as a duplicate
        if self.check(fp):
            return True
        self.insert(fp)
        return False

    # TODO: plug in a custom fingerprint scheme if needed; currently delegates to Scrapy's helper
    def request_fingerprint(self, request):
        return request_fingerprint(request)
    
    def check(self, fp):
        # membership test: a Bloom filter may report false positives, never false negatives
        return fp in self.bloom_filter_

    def insert(self, fp):
        self.bloom_filter_.add(fp)
        # (debug) the filter also exposes hash_seeds, num_bits and num_hashes attributes
    
    def reset(self):
        self.bloom_filter_.clear_all()

    def save(self):
        pass

    def load(self):
        # flush the current filter to disk, then reopen the previously dumped filter file
        self.bloom_filter_.sync()
        self.bloom_filter_ = BloomFilter.open("bloom.dump")

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
Developer: wuwenjunwwj, Project: inst_spider, Source: bloom_filter.py
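
To put this dupe filter to work in a crawl it has to be registered through the crawler settings. The sketch below is an assumption, not part of the original project: it uses Scrapy's standard DUPEFILTER_CLASS / DUPEFILTER_DEBUG settings plus the custom DUPEFILTER_CAPACITY and DUPEFILTER_FILENAME keys that __init__ reads, and the module path is hypothetical.

# settings.py sketch (module path "inst_spider.bloom_filter" is assumed)
DUPEFILTER_CLASS = "inst_spider.bloom_filter.URIBloomFilter"
DUPEFILTER_DEBUG = False                # read by from_settings()
DUPEFILTER_CAPACITY = 10000000          # custom key read by __init__()
DUPEFILTER_FILENAME = "requests.bloom"  # backing file for the mmap'ed filter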

Example 2: process

# Required import: from pybloomfilter import BloomFilter
# Method demonstrated: BloomFilter.sync
import fileinput
import os

from pybloomfilter import BloomFilter

# bloomfile, MAXUNIQUES and ACCUACY are module-level constants defined elsewhere in uniq.py
def process(files):
    # Iterate over the lines of all files listed in `files`, defaulting to sys.stdin
    # if the list is empty. If a filename is '-', it is also replaced by sys.stdin.
    # Reuse the on-disk filter if it already exists, otherwise create a new one.
    if os.path.isfile(bloomfile):
        UNIQUES = BloomFilter.open(bloomfile)
    else:
        UNIQUES = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    for record in fileinput.input(files):
        record = str(record).strip()
        if record not in UNIQUES:
            UNIQUES.add(record)
            print(record)  # emit each record the first time it is seen
    UNIQUES.sync()   # flush the memory-mapped filter to disk
    UNIQUES.close()
Developer: 235, Project: data-utils, Source: uniq.py
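
The function above relies on three module-level names (bloomfile, MAXUNIQUES and ACCUACY) defined elsewhere in uniq.py. A hypothetical set of definitions and a call that de-duplicates two log files plus stdin could look like this (all values are illustrative):

bloomfile = "uniques.bloom"  # persistent backing file for the filter
MAXUNIQUES = 10000000        # filter capacity
ACCUACY = 0.001              # target false-positive rate

process(["a.log", "b.log", "-"])  # '-' makes fileinput read from stdin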

Example 3: __init__

# Required import: from pybloomfilter import BloomFilter
# Method demonstrated: BloomFilter.sync

# ......... part of the code is omitted here .........
      else:
        self.hqs[i] = []
        self.Q_hq_cleanup.put((now, i))

    # if there are urls left over, add to appropriate queues
    for url in urls:
      self._init_add_url(url)

  
  # subroutine for adding url to hq, assuming only one thread running (initialization)
  def _init_add_url(self, url_in):

    # basic cleaning operations on url
    url = re.sub(r'/$', '', url_in)

    # assume unseen and input to seen list, add to active count
    self.seen.add(url)

    # BLOCK certain urls based on manual block rgx
    if re.search(BLOCK_URL_RGX, url) is not None:
      return False

    # get host IP address of url
    url_parts = urlparse.urlsplit(url)
    host_addr = self._get_and_log_addr(url_parts.netloc)

    # if the page is not of a safe type log and do not proceed
    if re.search(SAFE_PATH_RGX, url_parts.path) is None:
      if DEBUG_MODE:
        self.Q_logs.put("*UN-SAFE PAGE TYPE SKIPPED: %s" % (url,))
      return False

    # if DNS resolution failed, the error was already reported; do not proceed any further
    if host_addr is None:
      return False

    # if the page belongs to another node, pass to message sending service
    if DISTR_ON_FULL_URL:
      url_node = hash(url) % NUMBER_OF_NODES
    else:
      url_node = hash(host_addr) % NUMBER_OF_NODES
    if url_node != self.node_n:
      self.Q_to_other_nodes.put((url_node, url, None, 0, None))
      return False

    # add to an existing hq, or create new one & log new crawl task, or add to overflow
    self.Q_active_count.put(True)
    self.total_crawled += 1
    if DEBUG_MODE:
      self.Q_logs.put("Active count: %s" % self.Q_active_count.qsize())
    if host_addr in self.hqs:
      self.hqs[host_addr].append((url, None, 0, None))
    elif len(self.hqs) < HQ_TO_THREAD_RATIO*NUMBER_OF_CTHREADS:
      self.hqs[host_addr] = []
      self.Q_crawl_tasks.put((datetime.datetime.now(), host_addr, url, None, 0, None))
    else:
      self.Q_overflow_urls.put((host_addr, url, None, 0, None))


  # routine called on abort (by user interrupt or when the MAX_CRAWLED count is reached) to
  # save the current contents of all queues to disk and flush the seen filter for restart
  def dump_for_restart(self):

    # ensure url frontier deactivated
    self.active = False
    
    # get all urls in Q_crawl_tasks, hqs, or Q_overflow_urls
    # only get urls as these will be re-injected through the initialize method of uf
    with open(RESTART_DUMP, 'w') as f:
      for thread_name, url in self.thread_active.iteritems():
        if url is not None:
          f.write(url + '\n')

      while not self.Q_crawl_tasks.empty():
        try:
          r = self.Q_crawl_tasks.get(True, 1)
          f.write(r[2] + '\n')
        except:
          continue

      for host_addr, paths in self.hqs.iteritems():
        for path in paths:
          f.write(path[0] + '\n')

      while not self.Q_to_other_nodes.empty():
        try:
          r = self.Q_to_other_nodes.get(True, 1)
          f.write(r[1] + '\n')
        except:
          continue

      while not self.Q_overflow_urls.empty():
        try:
          r = self.Q_overflow_urls.get(True, 1)
          f.write(r[1] + '\n')
        except:
          continue

    # ensure seen filter file is synced
    self.seen.sync()
Developer: abresler, Project: RL-crawler, Source: urlFrontier.py
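
The sync() call at the end of dump_for_restart() only pays off if a later run reopens the same backing file instead of starting with an empty filter. That restart path is not part of the excerpt above; a hypothetical counterpart (the filename is illustrative) would simply be:

from pybloomfilter import BloomFilter

# on restart, reload the seen-URL filter that was synced on abort
seen = BloomFilter.open("seen_urls.bloom")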


Note: The pybloomfilter.BloomFilter.sync examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs, and the snippets are selected from projects contributed by open-source developers. Copyright of the code remains with the original authors; refer to each project's license before distributing or reusing it, and do not reproduce this article without permission.