

Python Article.fetch_links Method Code Examples

This article collects typical usage examples of the Python method article.Article.fetch_links. If you have been wondering how to use Article.fetch_links in Python, what it does, or what calling it looks like in practice, the curated examples below may help. You can also explore further usage examples of the class it belongs to, article.Article.


Below are 2 code examples of the Article.fetch_links method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
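Both examples come from the same crawler and share one pattern: download a page, wrap it in an Article, and call fetch_links only after the article has been extracted, filtered against the query, and depth-checked, then iterate the parallel links and domains lists. The sketch below condenses that pattern; the Page and Article constructors, fetch_links, and the links/domains attributes are taken from the examples, while the Page import path and the plain-list queue are assumptions made here for illustration.

# Minimal sketch of the fetch_links pattern shared by the two examples below.
# Assumptions: Page lives in a `page` module (only `from article import Article`
# is confirmed by the import hints) and the queue is a plain list rather than a
# MongoDB collection.
from article import Article
from page import Page  # assumed import path

def expand(item, query, directory, queue):
	p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
	if not p.fetch():  # example 2 calls p.download() instead
		return
	a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
	if a.extract() and a.filter(query, directory) and a.check_depth(a.depth):
		a.fetch_links()  # fills a.links and a.domains in parallel
		for url, domain in zip(a.links, a.domains):
			queue.append({"url": url, "source_url": item["url"],
				"depth": int(item["depth"]) + 1, "domain": domain, "date": a.date})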

Example 1: controled_crawl

# Required module import: from article import Article [as alias]
# Or: from article.Article import fetch_links [as alias]
	def controled_crawl(self):
		# Process queued URLs shallowest-depth first until the queue is empty.
		while self.queue.count() > 0:
			for item in self.queue.find().sort('depth', pymongo.ASCENDING):
				logger.info(item["depth"])
				#logger.info("url %s depth %d" %(item["url"], item['depth']))
				
				p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
				
				if p.fetch():
					a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
					if a.extract(): 
						logging.info("extracted")
						if a.filter(self.query, self.directory):
							logging.info("valid")
							if a.check_depth(a.depth):
								
								a.fetch_links()
								if len(a.links) > 0:
									# Enqueue each extracted link that is not already queued, stored or logged.
									for url, domain in zip(a.links, a.domains):
										if url not in self.queue.distinct("url") and url not in self.results.distinct("url") and url not in self.logs.distinct("url"):
											self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
											
									logging.info("Inserted %d next urls" % len(a.links))
								try:
									
									self.results.insert(a.export())
								except pymongo.errors.DuplicateKeyError:
									logging.info("Exists already")
									
									
					else:
						try:
							self.logs.insert(a.log())
						except pymongo.errors.DuplicateKeyError:
							logging.info("Exists already")
							
				else:
					try:
						self.logs.insert(p.log())
					except pymongo.errors.DuplicateKeyError:
						logging.info("Exists already")
						
						
				self.queue.remove(item)
				logging.info("Processing %i urls"%self.queue.count())
				if self.queue.count() == 0:
					break
			if self.queue.count() == 0:
				break
			if self.results.count() > 200000:
				self.queue.drop()
				break
Developer: malstor, Project: crawtext, Lines: 54, Source file: crawtext.py
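In both this example and the next, inserts into the results and logs collections catch pymongo.errors.DuplicateKeyError, which is only meaningful if those collections carry a unique index on url. A minimal setup sketch follows; the collection names are taken from the code, while the database name is hypothetical.

# Assumed index setup that makes the DuplicateKeyError handling above meaningful.
# The database name "crawtext" is hypothetical; the collection names come from the examples.
import pymongo

client = pymongo.MongoClient()
db = client["crawtext"]
for name in ("results", "logs"):
	db[name].create_index("url", unique=True)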

Example 2: crawler

# Required module import: from article import Article [as alias]
# Or: from article.Article import fetch_links [as alias]
	def crawler(self):
		# Load the project collections, push the sources into the queue, then crawl it until empty.
		logging.info("Crawler activated with query filter %s" % self.target)
		# if self.sources.nb == 0:
		# 	sys.exit("Error: no sources found in the project.")
		try:
			self.project.load_sources()
			self.project.load_queue()
			self.project.load_logs()
		except AttributeError:
			self.load_project()

		#logging.info("Begin crawl with %i active urls" % self.sources.active_nb)
		self.push_to_queue()
		logging.info("Processing %i urls" % self.queue.count())

		#print self.queue.list

		while self.queue.count() > 0:
			for item in self.queue.find().sort([("depth", 1)]):
				# Skip URLs that were already stored as results or logged as failures.
				if item["url"] in self.results.distinct("url"):
					logging.info("in results")
					self.queue.remove(item)

				elif item["url"] in self.logs.distinct("url"):
					logging.info("in logs")
					self.queue.remove(item)
				else:
					#print "Treating", item["url"], item["depth"]
					try:
						p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
					except KeyError:
						p = Page(item["url"], item["source_url"], item["depth"], self.date, True)
					if p.download():
						a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
						if a.extract():
							#Targeted crawl: filter articles for relevance to the query
							if self.target:
								if a.filter(self.query, self.directory):
									if a.check_depth(a.depth):
										a.fetch_links()
										if len(a.links) > 0:
											for url, domain in zip(a.links, a.domains):
												if url not in self.queue.distinct("url") and url not in self.results.distinct("url"):
													self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
													if self.debug: logging.info("\t-inserted %d next urls" % len(a.links))
												try:
													self.results.insert(a.export())
												except pymongo.errors.DuplicateKeyError:
													#self.results.update(a.export())
													pass

									else:
										logging.debug("depth exceeded")
										self.logs.insert(a.log())
								else:
									logging.debug("Not relevant")
									self.logs.insert(a.log())
							else:
								if a.check_depth(a.depth):
									a.fetch_links()
									if len(a.links) > 0:
										for url, domain in zip(a.links, a.domains):
											try:
												self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
											except pymongo.errors.DuplicateKeyError:
												pass
												if self.debug: logging.info("\t-inserted %d next urls" % len(a.links))
											try:
												self.results.insert(a.export())
											except pymongo.errors.DuplicateKeyError:
												pass
								else:
									logging.debug("Depth exceeded")
									try:
										self.logs.insert(a.log())
									except pymongo.errors.DuplicateKeyError:
										self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})

						else:
							logging.debug("Error Extracting")
							try:
								self.logs.insert(a.log())
							except pymongo.errors.DuplicateKeyError:
								self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})
					else:
						logging.debug("Error Downloading")
						self.logs.insert(p.log())

					self.queue.remove(item)
					logging.info("Processing %i urls"%self.queue.count())
				if self.queue.nb == 0:
					break
			if self.queue.nb == 0:
				break
#.........part of the code is omitted here.........
Developer: malstor, Project: crawtext, Lines: 103, Source file: crawtext.py


Note: The article.Article.fetch_links examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and the copyright of the source code remains with the original authors. Please consult the corresponding project's license before redistributing or using the code; do not reproduce without permission.