本文整理汇总了Python中article.Article.export方法的典型用法代码示例。如果您正苦于以下问题:Python Article.export方法的具体用法?Python Article.export怎么用?Python Article.export使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类article.Article
的用法示例。
在下文中一共展示了Article.export方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: controled_crawl
# 需要导入模块: from article import Article [as 别名]
# 或者: from article.Article import export [as 别名]
def controled_crawl(self):
    """Breadth-first controlled crawl driven by the MongoDB queue.

    Pops queued urls in ascending ``depth`` order, fetches each page,
    extracts and filters the resulting article, enqueues outgoing links
    one depth level deeper, and persists results and failure logs in
    their respective collections. Stops when the queue is empty or the
    results collection exceeds 200 000 documents.

    NOTE(review): the name keeps the original (misspelled) spelling
    ``controled_crawl`` because external callers depend on it.
    """
    while self.queue.count() > 0:
        # Shallowest urls first: this is what makes the crawl breadth-first.
        for item in self.queue.find().sort('depth', pymongo.ASCENDING):
            logging.info(item["depth"])
            #logger.info("url %s depth %d" %(item["url"], item['depth']))
            p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
            if p.fetch():
                a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
                if a.extract():
                    logging.info("extracted")
                    if a.filter(self.query, self.directory):
                        logging.info("valid")
                        if a.check_depth(a.depth):
                            a.fetch_links()
                            if len(a.links) > 0:
                                # Enqueue only urls not already queued, stored or logged.
                                for url, domain in zip(a.links, a.domains):
                                    if url not in self.queue.distinct("url") and url not in self.results.distinct("url") and url not in self.logs.distinct("url"):
                                        self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
                                logging.info("Inserted %d nexts url" %len(a.links))
                            try:
                                self.results.insert(a.export())
                            except pymongo.errors.DuplicateKeyError:
                                logging.info("Exists already")
                    else:
                        # Article rejected by the query filter: keep a trace in logs.
                        try:
                            self.logs.insert(a.log())
                        except pymongo.errors.DuplicateKeyError:
                            logging.info("Exists already")
            else:
                # Fetch failed: record the page-level error.
                try:
                    self.logs.insert(p.log())
                except pymongo.errors.DuplicateKeyError:
                    logging.info("Exists already")
            # The item has been handled (success or failure) — drop it from the queue.
            self.queue.remove(item)
            logging.info("Processing %i urls"%self.queue.count())
            if self.queue.count() == 0:
                break
        if self.queue.count() == 0:
            break
        if self.results.count() > 200000:
            # Hard cap on harvest size: discard the remaining queue and stop.
            self.queue.drop()
            break
示例2: crawler
# 需要导入模块: from article import Article [as 别名]
# 或者: from article.Article import export [as 别名]
def crawler(self):
    """Main crawl loop over the MongoDB queue.

    Loads project state (sources, queue, logs), seeds the queue, then
    drains it in ascending depth order. Items already present in the
    results or logs collections are skipped. Each remaining url is
    downloaded and its article extracted; when ``self.target`` is set,
    the article must additionally pass the query filter before its
    links are enqueued and its export stored. Extraction/download
    failures and depth overflows are recorded in the logs collection.
    """
    logging.info("Crawler activated with query filter %s" %self.target)
    # if self.sources.nb == 0:
    #     sys.exit("Error: no sources found in the project.")
    try:
        self.project.load_sources()
        self.project.load_queue()
        self.project.load_logs()
    except AttributeError:
        # Project not loaded yet — bootstrap it before crawling.
        self.load_project()
    #logging.info("Begin crawl with %i active urls"%self.sources.active_nb)
    self.push_to_queue()
    logging.info("Processing %i urls"%self.queue.count())
    #print self.queue.list
    while self.queue.count() > 0:
        # Shallowest urls first (breadth-first ordering).
        for item in self.queue.find().sort([("depth", 1)]):
            if item["url"] in self.results.distinct("url"):
                logging.info("in results")
                self.queue.remove(item)
            elif item["url"] in self.logs.distinct("url"):
                logging.info("in logs")
                self.queue.remove(item)
            else:
                #print "Treating", item["url"], item["depth"]
                try:
                    p = Page(item["url"], item["source_url"],item["depth"], item["date"], True)
                except KeyError:
                    # Older queue entries may lack a "date" field — fall back
                    # to the crawler's own date.
                    p = Page(item["url"], item["source_url"],item["depth"], self.date, True)
                if p.download():
                    a = Article(p.url,p.html, p.source_url, p.depth,p.date, True)
                    if a.extract():
                        #Targeted crawk filtering for pertinency
                        if self.target:
                            if a.filter(self.query, self.directory):
                                if a.check_depth(a.depth):
                                    a.fetch_links()
                                    if len(a.links) > 0:
                                        # Enqueue only urls not already queued or stored.
                                        for url, domain in zip(a.links, a.domains):
                                            if url not in self.queue.distinct("url") and url not in self.results.distinct("url"):
                                                self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
                                        if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
                                    try:
                                        self.results.insert(a.export())
                                    except pymongo.errors.DuplicateKeyError:
                                        #self.results.update(a.export())
                                        pass
                                else:
                                    logging.debug("depth exceeded")
                                    self.logs.insert(a.log())
                            else:
                                logging.debug("Not relevant")
                                self.logs.insert(a.log())
                        else:
                            # Untargeted crawl: no pertinency filter, enqueue everything.
                            if a.check_depth(a.depth):
                                a.fetch_links()
                                if len(a.links) > 0:
                                    for url, domain in zip(a.links, a.domains):
                                        try:
                                            self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
                                        except pymongo.errors.DuplicateKeyError:
                                            pass
                                    if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
                                try:
                                    self.results.insert(a.export())
                                except pymongo.errors.DuplicateKeyError:
                                    pass
                            else:
                                logging.debug("Depth exceeded")
                                try:
                                    self.logs.insert(a.log())
                                except pymongo.errors.DuplicateKeyError:
                                    # Already logged once — append the new message instead.
                                    self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})
                    else:
                        logging.debug("Error Extracting")
                        try:
                            self.logs.insert(a.log())
                        except pymongo.errors.DuplicateKeyError:
                            self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})
                else:
                    logging.debug("Error Downloading")
                    self.logs.insert(p.log())
                # The item has been handled — drop it from the queue.
                self.queue.remove(item)
            logging.info("Processing %i urls"%self.queue.count())
            if self.queue.nb == 0:
                break
        if self.queue.nb == 0:
            break