本文整理汇总了Python中page.Page.check方法的典型用法代码示例。如果您正苦于以下问题：Python Page.check方法的具体用法？Python Page.check怎么用？Python Page.check使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类page.Page的用法示例。
在下文中一共展示了Page.check方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run_job
# 需要导入模块: from page import Page [as 别名]
# 或者: from page.Page import check [as 别名]
def run_job(self):
    """Run a crawl over the queued URLs and report the outcome in ``self.status``.

    ``self.status`` is reset to a fresh dict and filled with the keys
    ``scope``, ``msg``, ``status`` and, on failure, ``code`` (600.1).

    The job refuses to start when no query is set, when the sources
    collection is empty after ``collect_sources()``, or when the queue is
    still empty after ``send_seeds_to_queue()``.

    Returns:
        bool: ``True`` once the queue has been drained, ``False`` when one
        of the start-up checks above failed.

    NOTE(review): the original text had lost its indentation; the nesting
    of the two ``else`` branches inside the loop was reconstructed from the
    statement order -- confirm against the upstream file.
    """
    self.status = {}
    self.status["scope"] = "running crawl job"

    if self.query is None:
        self.status["msg"] = "Unable to start crawl: no query has been set."
        self.status["code"] = 600.1
        self.status["status"] = False
        return False

    query = Query(self.query)
    # The return value was never used; the call itself is kept because the
    # next check reads the sources collection.
    self.collect_sources()
    if self.db.sources.count() == 0:
        self.status["msg"] = "Unable to start crawl: no seeds have been set."
        self.status["code"] = 600.1
        self.status["status"] = False
        return False

    self.send_seeds_to_queue()
    start = datetime.now()
    # count must be *called*: comparing the bound method itself to 0 is
    # never equal, so this guard could not fire before.
    if self.db.queue.count() == 0:
        self.status["msg"] = "Error while sending urls into queue: queue is empty"
        self.status["code"] = 600.1
        self.status["status"] = False
        return False

    self.status["msg"] = "running crawl on %i sources with query '%s'" % (
        len(self.db.sources.distinct("url")), self.query)

    # Re-read the distinct URLs on every pass: processing a page may have
    # pushed new outlinks into the queue.
    while self.db.queue.count() > 0:
        for url in self.db.queue.distinct("url"):
            if url != "":
                page = Page(url)
                if page.check() and page.request() and page.control():
                    article = page.extract("article")
                    if article.status is True:
                        if article.is_relevant(query):
                            self.db.results.insert(article.repr())
                            if article.outlinks is not None and len(article.outlinks) > 0:
                                self.db.queue.insert(article.outlinks)
                    else:
                        self.db.logs.insert(article.logs)
                else:
                    self.db.logs.insert(page.status)
            # Processed or not (including the empty URL), drop it from the
            # queue so the loop makes progress.
            self.db.queue.remove({"url": url})
            if self.db.queue.count() == 0:
                break

    elapsed = datetime.now() - start
    self.status["msg"] = "%s. Crawl done sucessfully in %s s" % (self.status["msg"], str(elapsed))
    self.status["status"] = True
    return True
示例2: crawler
# 需要导入模块: from page import Page [as 别名]
# 或者: from page.Page import check [as 别名]
def crawler(docopt_args):
    """Crawl every URL waiting in the project's queue until the queue is empty.

    Args:
        docopt_args: mapping of parsed command-line arguments; the keys
            ``'<project>'`` (database name) and ``'<query>'`` are read.

    For each distinct queued URL that is not yet in ``db.results`` a
    ``Page`` is built and checked/requested/controlled/extracted. Successful
    pages are upserted into ``db.results`` and their outlinks queued;
    pages with a non-zero ``error_type`` are upserted into ``db.log``.
    Every visited URL is removed from the queue. Progress and a final
    summary are printed.

    Returns:
        None

    NOTE(review): the original text had lost its indentation; the nesting
    was reconstructed from the if/elif/else and try/except pairing --
    confirm against the upstream file.
    """
    start = datetime.datetime.now()
    db_name = docopt_args['<project>']
    query = docopt_args['<query>']
    db = Database(db_name)
    db.create_colls()
    # count must be *called*; comparing the bound method to 0 never ends the loop.
    while db.queue.count() > 0:
        # Single-argument print(...) yields the same output on Python 2 and 3.
        print("beginning crawl")
        print("Nombre de sources dans la base %s" % db.sources.count())
        print("Nombre d'url à traiter %s" % len(db.queue.distinct("url")))
        for url in db.queue.distinct("url"):
            # `url not in cursor` compared a string with documents and was
            # always true; look the URL up explicitly instead.
            if db.results.find_one({"url": url}) is None:
                p = Page(url, query)
                if p.check() and p.request() and p.control() and p.extract():
                    db.results.update(p.info, {'$push': {"date": datetime.datetime.today()}}, upsert=True)
                    if p.outlinks is not None:
                        try:
                            for n_url in p.outlinks:
                                # Queue only real URLs that no collection knows yet
                                # (the former `or` chain let everything through,
                                # including None).
                                if n_url is not None and not _crawler_url_is_known(db, n_url):
                                    db.queue.insert({"url": n_url})
                        except mongo_err:
                            db.log.update({"url":url, "error_type": "pymongo error inserting outlinks", "query": query, "status":False},{'$push': {"date": datetime.datetime.today()}}, upsert=True)
                elif p.error_type != 0:
                    # Failed page: record it in the log. A page that is merely
                    # not relevant (error_type == 0) is stored nowhere.
                    db.log.update(p.bad_status(), {'$push': {"date": datetime.datetime.today()}}, upsert=True)
            # Always dequeue the URL; skipping this for irrelevant pages kept
            # them in the queue and re-crawled them on every pass.
            db.queue.remove({"url": url})
            if db.queue.count() == 0:
                print(db.stats())
                break
        if db.queue.count() == 0:
            print(db.stats())
            break
    end = datetime.datetime.now()
    elapsed = end - start
    print("crawl finished, %i results and %i sources are stored in Mongo Database: %s in %s" % (db.results.count(), db.sources.count(), db_name, elapsed))
    return


def _crawler_url_is_known(db, url):
    """Return True when the queue, results or log collection already holds *url*."""
    for collection in (db.queue, db.results, db.log):
        if collection.find_one({"url": url}) is not None:
            return True
    return False