This article collects typical usage examples of the scheduler.Scheduler.print_queue method in Python. If you are wondering what Scheduler.print_queue does, how to call it, or what real code using it looks like, the curated method examples below should help. You can also explore further usage examples of the containing class, scheduler.Scheduler.
Below, 1 code example of the Scheduler.print_queue method is shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
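For quick orientation, the typical call pattern looks like the short sketch below. This is only an assumption pieced together from Example 1: the add_link(url, level) signature and the level argument are inferred from that example and may differ in your copy of scheduler.py.
# hedged sketch, not the project's documented API
from scheduler import Scheduler

q = Scheduler()
q.add_link("http://example.com/start", 0)  # enqueue a start URL at level 0 (signature assumed from Example 1)
q.print_queue()                            # inspect what is currently queued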
Example 1: crawl_spider
# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import print_queue [as alias]
def crawl_spider(spider):
    ############### TRAIN SPIDER ##############
    if spider == "train":
        # initialize the scheduling queue
        q = Scheduler()
        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append( getattr( pipelines, pipe )() )
            except:
                print "Error: Unable to initialize %s pipe" % pipe
                quit()
        # initialize the spider
        # try:
        #     s = getattr(spiders, spider)()
        # except:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        s = spiders.Train()
        #print s.__doc__
        # add all of the start links and known links to the top level of the queue
        for url in list(s.start_urls) + list(s.known_urls):
            q.add_link(url, 0)
        q.print_queue()
        # request urls while the scheduler is not empty and pass them to the spider;
        # add returned links to the queue
        # send returned items down the pipeline
        visits = 0
        while not q.is_empty():
            wait_between_requests() # wait a random small amount of time so we're less detectable
            url, level = q.get_next_link(what_level=True)
            print "Visit #%i, Q level %i, Q volume %i" % (visits, level, q.queue_volume())
            response = get_request(url)
            if response:
                items, extracted_links = s.parse(response, level=level) # items and links are both lists
                #print "extracted links:", extracted_links
                add_to_queue(q, extracted_links) # manage the returned links
                send_down_pipeline(pipeline, items, s) # manage the returned items
            if settings.ASK_BETWEEN_REQUESTS: raw_input("Press ENTER to continue?")
            visits += 1
        if q.is_empty(): print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits
    ################ TEST SPIDER ##############
    elif spider == "test":
        print "Test case"
        q = PriorityQueue()
        queued_links = set()
        # initialize all of the pipelines
        pipeline = []
        for pipe in settings.PIPELINES:
            try:
                pipeline.append( getattr( pipelines, pipe )() )
            except:
                print "Error: Unable to initialize %s pipe" % pipe
                quit()
        # initialize the spider
        # try:
        #     s = spiders.Test()
        # except:
        #     print "Error: It's likely that the input spider does not exist in spiders.py"
        #     quit()
        #print s.__doc__
        s = spiders.Test()
        # add the first start link to the top level of the queue
        q.put((-.1, s.start_urls[0]))
        queued_links.add(s.start_urls[0])
        # request urls while the queue is not empty and pass them to the spider;
        # add returned links to the queue
        # send returned items down the pipeline
        visits = 0
        while not q.empty():
            wait_between_requests() # wait a random small amount of time so we're less detectable
            priority, url = q.get()
            print "Q get:", -priority, url
            print "Visit #%i, Q volume %i" % (visits, q.qsize())
            response = get_request(url)
            if response:
                items, extracted_links = s.parse(response, level=-priority) # items and links are both lists
                #print "extracted items:", items
                #print "extracted links:", extracted_links
                for link in extracted_links:
                    if link[1] not in queued_links:
                        q.put((-link[0], link[1]))
                        queued_links.add(link[1])
                    # else:
                    #     print "We already queued %s" % link[1]
                send_down_pipeline(pipeline, items, s) # manage the returned items
            if settings.ASK_BETWEEN_REQUESTS: raw_input("Press ENTER to continue?")
            visits += 1
        if q.empty(): print "CRAWL IS FINISHED: Queue is empty"
        #if visits >= settings.MAX_CRAWLS: print "CRAWL IS FINISHED: Crawled max number of urls (%i total)" % visits
    else:
        quit()
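The Scheduler class itself is not shown on this page. Based solely on the calls Example 1 makes (add_link, print_queue, get_next_link, is_empty, queue_volume), a minimal stand-in might look like the sketch below. This is a hypothetical illustration of the interface the example appears to assume, not the project's actual scheduler module.
# hedged stand-in for the calls used in Example 1; the real Scheduler may differ
from collections import defaultdict, deque

class Scheduler(object):
    """Hypothetical level-based crawl queue inferred from Example 1."""

    def __init__(self):
        self.levels = defaultdict(deque)   # level -> queued urls

    def add_link(self, url, level):
        self.levels[level].append(url)

    def queue_volume(self):
        return sum(len(urls) for urls in self.levels.values())

    def is_empty(self):
        return self.queue_volume() == 0

    def get_next_link(self, what_level=False):
        # pop from the shallowest non-empty level first
        for level in sorted(self.levels):
            if self.levels[level]:
                url = self.levels[level].popleft()
                return (url, level) if what_level else url

    def print_queue(self):
        # dump the queue contents by level, as q.print_queue() does in Example 1
        for level in sorted(self.levels):
            print "level %i: %i url(s)" % (level, len(self.levels[level]))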