Python Scheduler.print_queue method code examples

This article collects typical usage examples of the Python method scheduler.Scheduler.print_queue. If you have been wondering what exactly Scheduler.print_queue does, how to call it, or where to find examples of it in use, the hand-picked code samples below should help. You can also look further into other usage examples of the containing class, scheduler.Scheduler.


The section below shows 1 code example of the Scheduler.print_queue method; examples are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
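
Before diving into the full example, here is a minimal usage sketch of the Scheduler calls that the example relies on (add_link, print_queue, get_next_link, is_empty). The signatures are inferred from Example 1 below rather than from the scheduler module itself, so treat them as assumptions; the start URL is a placeholder.

# Minimal sketch (Python 2, matching the example); signatures inferred from Example 1
from scheduler import Scheduler

q = Scheduler()
q.add_link("http://example.com/start", 0)   # queue a placeholder URL at queue level 0
q.print_queue()                             # dump the current contents of the queue

while not q.is_empty():
    url, level = q.get_next_link(what_level=True)  # pop the next url and its queue level
    # ... fetch and parse `url` here ...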

Example 1: crawl_spider

# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import print_queue [as alias]
def crawl_spider(spider):
    ############### TRAIN SPIDER ##############
    if spider == "train":
        # initialize the scheduling queue
        q = Scheduler()              
        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append( getattr( pipelines, pipe )() )
            except: 
                print "Error: Unable to initialize %s pipe" % pipe
                quit()
        # initialize the spider
        # try:
        #     s = getattr(spiders, spider)()    
        # except:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        s = spiders.Train()
        #print s.__doc__
        # add all of the start links and known links to the top level of the queue
        for url in list(s.start_urls) + list(s.known_urls):
            q.add_link(url, 0)
        q.print_queue()
        # request urls while the scheduler is not empty and pass them to the spider
        # add returned links to the queue
        # send returned items down the pipeline
        visits = 0
        while not q.is_empty():
            wait_between_requests() # wait a random small amount of time so we're less detectable
            url, level = q.get_next_link(what_level=True)
            print "Visit #%i, Q level %i, Q volume %i" % (visits, level, q.queue_volume())
            response = get_request(url)
            if response: 
                items, extracted_links = s.parse(response, level=level) # items and extracted_links are both lists
                #print "extracted links:", extracted_links
                add_to_queue(q, extracted_links) # manage the returned links
                send_down_pipeline(pipeline, items, s) # manage the returned items
                if settings.ASK_BETWEEN_REQUESTS: raw_input("Press ENTER to continue?")
                visits += 1 

        if q.is_empty(): print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits

    ################ TEST SPIDER ##############
    elif spider == "test":
        print "Test case"
        q = PriorityQueue()              
        queued_links = set()
        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append( getattr( pipelines, pipe )() )
            except: 
                print "Error: Unable to initialize %s pipe" % pipe
                quit()
        # initialize the spider
        # try:
        #     s = spiders.Test()    
        # except:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        #print s.__doc__
        s = spiders.Test()    
        # add all of the start links and known links to the top level of the queue
        q.put((-.1, s.start_urls[0]))
        queued_links.add(s.start_urls[0])
        # request urls while the queue is not empty and pass them to the spider
        # add returned links to the queue
        # send returned items down the pipeline
        visits = 0
        while not q.empty():
            wait_between_requests() # wait a random small amount of time so we're less detectable
            priority, url = q.get()
            print "Q get:", -priority, url
            print "Visit #%i, Q volume %i" % (visits, q.qsize())
            response = get_request(url)
            if response: 
                items, extracted_links = s.parse(response, level=-priority) # items and extracted_links are both lists
                # print "Extracted items:", items
                # print "Extracted links:", extracted_links
                for link in extracted_links:
                    if link[1] not in queued_links:
                        # print link
                        q.put((-link[0], link[1]))
                        queued_links.add(link[1])
                    # else:
                        # print "We already queued %s" % link[1]
                send_down_pipeline(pipeline, items, s) # manage the returned items
                if settings.ASK_BETWEEN_REQUESTS: raw_input("Press ENTER to continue?")
                visits += 1 

        if q.empty(): print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits

    else:
        quit()
Developer ID: teffland, Project: FindIt, Lines of code: 102, Source file: findit.py
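
The example above also calls several project-level helpers (wait_between_requests, get_request, add_to_queue, send_down_pipeline) that are defined elsewhere in findit.py. The sketch below shows plausible stand-ins based only on how they are called in Example 1; it is not the FindIt project's actual implementation, and the pipeline method name process_item is an assumption.

# Hedged stand-ins for the helpers used above (Python 2, matching the example).
# Behaviour is inferred from the call sites in Example 1, not from the FindIt source.
import random
import time
import urllib2

def wait_between_requests():
    # sleep a small random interval so the crawl timing looks less robotic
    time.sleep(random.uniform(0.5, 2.0))

def get_request(url):
    # fetch a url and return the response body, or None on failure
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception as e:
        print "Request failed for %s: %s" % (url, e)
        return None

def add_to_queue(q, extracted_links):
    # each extracted link is assumed to be a (level, url) pair,
    # mirroring how the test-spider branch indexes link[0] and link[1]
    for level, url in extracted_links:
        q.add_link(url, level)

def send_down_pipeline(pipeline, items, spider):
    # pass each scraped item through every pipeline stage in order
    # (process_item is an assumed method name on the pipeline objects)
    for item in items:
        for pipe in pipeline:
            item = pipe.process_item(item, spider)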


Note: The scheduler.Scheduler.print_queue example in this article was compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code, and do not reproduce this article without permission.