当前位置: 首页>>代码示例>>Python>>正文


Python Extractor.getAllFromCategory方法代码示例

本文整理汇总了Python中extractor.Extractor.getAllFromCategory方法的典型用法代码示例。如果您正苦于以下问题:Python Extractor.getAllFromCategory方法的具体用法?Python Extractor.getAllFromCategory怎么用?Python Extractor.getAllFromCategory使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在extractor.Extractor的用法示例。


在下文中一共展示了Extractor.getAllFromCategory方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: spider

# 需要导入模块: from extractor import Extractor [as 别名]
# 或者: from extractor.Extractor import getAllFromCategory [as 别名]
	def spider(self, root, pages = True, subcategories = True, action = "traverse", preclean = False, depth = 1):
		if preclean: self.graphdb.clear()
		seen_key = "URL_SEEN"
		queue_key = "URL_QUEUE"
		ex = Extractor()
		batch = neo4j.WriteBatch(self.graphdb)

		queue_empty = lambda: self.fdb.scard(queue_key) == 0
		seen = lambda x: self.fdb.sismember(seen_key, x)
		visit = lambda x: self.fdb.sadd(seen_key, x)
		dequeue = lambda: self.fdb.spop(queue_key)
		enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x))

		if action == "traverse":
			enqueue(root)
			while not queue_empty():
				current = dequeue()
				print current
				if current and current.strip() and not seen(current):
					visit(current)
					result = ex.getAllFromCategory(current)
					self.updateBatch(batch, type = neo4j.Node, node = {'name': current, 'class': self.CATEGORY})
					if pages:
						for page in result['pages']:
							print "{0}\tp:{1}".format(current[:15], page)
							self.incr_rel(page, current, self.CATEGORY_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': page, 'class': self.ARTICLE})
							links = ex.getWikiLinks(page)
							for a in links:
								print "{0}\tp:{1}\t{2}".format(current[:15], page, a)
								self.incr_rel(a, page, self.SIBLING_REL)
								self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE})
					if subcategories:
						for subcat in result['categories']:
							print "{0}\tc:{1}".format(current, subcat)
							self.incr_rel(subcat, current, self.SUBCAT_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': subcat, 'class': self.CATEGORY})
							enqueue(subcat)
		elif action == "crawl":
			enqueue(root)
			while not queue_empty():
				topic = dequeue()
				if topic and topic.strip() and not seen(topic):
					visit(topic)
					result = ex.extract(topic)
					depth -= 1
					self.updateBatch(batch, type = neo4j.Node, node = {'name': topic, 'class': result['type']})
					if result['type'] == self.CATEGORY:
						pass
					elif result['type'] == self.ARTICLE:
						for a in result['links']:
							self.incr_rel(a, topic, self.SIBLING_REL)
							print "adding: ", a
							self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE})
							if depth > 0: enqueue(a)
						for c in result['categories']:
							self.incr_rel(a, topic, self.CATEGORY_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': c, 'class': self.CATEGORY})
					elif result['type'] == self.DISAMBIGUATION:
						for a in result['links']:
							self.incr_rel(a, topic, self.DISAMB_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.DISAMBIGUATION})
		print "FINISHED WITH THE NODES..."
		for k in self.fdb.smembers(self.rel_key):
			print "REL:", k
			try:
				nodes = k.split(":", 2)
				rel = nodes[0]
				n1 = self.node_index.get('name', nodes[1])[0]
				n2 = self.node_index.get('name', nodes[2])[0]
				self.updateBatch(batch, type = neo4j.Relationship, rel = {'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2})
			except Exception as e:
				print "REL EXCEPTION: ", e
		print "DONE>>>>>>>>>>>>>>>"
开发者ID:saurabhkb,项目名称:tailor,代码行数:76,代码来源:crawler.py


注:本文中的extractor.Extractor.getAllFromCategory方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。