當前位置: 首頁>>代碼示例>>Python>>正文


Python Extractor.getWikiLinks方法代碼示例

本文整理匯總了Python中extractor.Extractor.getWikiLinks方法的典型用法代碼示例。如果您正苦於以下問題:Python Extractor.getWikiLinks方法的具體用法?Python Extractor.getWikiLinks怎麼用?Python Extractor.getWikiLinks使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在extractor.Extractor的用法示例。


在下文中一共展示了Extractor.getWikiLinks方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: generateFeatures

# 需要導入模塊: from extractor import Extractor [as 別名]
# 或者: from extractor.Extractor import getWikiLinks [as 別名]
	def generateFeatures(self):
		'''
		Write one feature file per category from the pages listed for it.

		Has been hardcoded for wikipedia.
		For every name in self.categories this reads
		<CLASSES_FILE>/<name>/<LIST_FILE> and writes
		<CLASSES_FILE>/<name>/<FEATURE_FILE> (truncating it), where the three
		path parts come from self.config under self.section.

		Each input line is "<page>" or "<page>\\t<section>". The links returned
		by Extractor.getWikiLinks(page, section = section) are cleaned with
		self.clean, split on '_', de-duplicated per link, stemmed with
		self.stemmer and filtered with self.valid. Surviving units are written
		comma-terminated, one output line per input line.

		Both files are closed even if fetching or processing a page raises.
		'''
		e = Extractor()
		print(self.categories)
		for name in self.categories:
			print(name)
			classes_dir = self.config.get(self.section, "CLASSES_FILE")
			list_path = "%s/%s/%s" % (classes_dir, name, self.config.get(self.section, "LIST_FILE"))
			feature_path = "%s/%s/%s" % (classes_dir, name, self.config.get(self.section, "FEATURE_FILE"))
			# Nested `with` blocks (rather than f.close()/g.close() at the end)
			# guarantee both handles are released when a page fails mid-way.
			with open(list_path, "r") as f:
				with open(feature_path, "w") as g:
					for page in f:
						print(page)
						pagetok = page.strip().split('\t')
						# The section column is optional; 0 is the value passed
						# to getWikiLinks when it is absent.
						section = pagetok[1] if len(pagetok) > 1 else 0
						links = e.getWikiLinks(pagetok[0], section = section)
						for feature in links:
							# A set drops repeated words within a single link title.
							units = set(self.clean(feature).split('_'))
							for unit in units:
								unit = self.stemmer.stem(unit)
								if self.valid(unit):
									g.write("%s," % unit)
						# One output row per input page, even when it has no features.
						g.write("\n")
開發者ID:coep-rankweb,項目名稱:Webclassifier,代碼行數:29,代碼來源:wiki_source.py

示例2: spider

# 需要導入模塊: from extractor import Extractor [as 別名]
# 或者: from extractor.Extractor import getWikiLinks [as 別名]
	def spider(self, root, pages = True, subcategories = True, action = "traverse", preclean = False, depth = 1):
		'''
		Walk the wiki starting at root, queueing nodes and then relationships
		into a neo4j write batch via self.updateBatch.

		root          -- title the walk starts from.
		pages         -- "traverse" only: record each category's pages and the
		                 wiki links found on those pages.
		subcategories -- "traverse" only: record subcategories and enqueue them
		                 so they are walked too.
		action        -- "traverse" walks categories with
		                 Extractor.getAllFromCategory; "crawl" follows article
		                 links with Extractor.extract. Any other value skips the
		                 walk and goes straight to the relationship phase.
		preclean      -- when true, self.graphdb.clear() is called first.
		depth         -- "crawl" only: article links are enqueued only while
		                 this counter is still positive.

		The work queue and the visited set live in self.fdb under the keys
		"URL_QUEUE" and "URL_SEEN". After the walk, every member of the set
		stored at self.rel_key is parsed as "<rel>:<name1>:<name2>" and queued
		as a weight-1 relationship between the two indexed nodes.

		NOTE(review): depth is decremented once per visited topic, not once per
		link level, so it does not behave as a true depth limit for values
		above 2 -- confirm whether that is intended.
		'''
		if preclean:
			self.graphdb.clear()
		seen_key = "URL_SEEN"
		queue_key = "URL_QUEUE"
		ex = Extractor()
		batch = neo4j.WriteBatch(self.graphdb)

		# Small wrappers over the set operations on self.fdb (these look like
		# Redis set commands -- presumably a Redis client; verify). The queue
		# is a set, so duplicates collapse and pop order is unspecified.
		queue_empty = lambda: self.fdb.scard(queue_key) == 0
		seen = lambda x: self.fdb.sismember(seen_key, x)
		visit = lambda x: self.fdb.sadd(seen_key, x)
		dequeue = lambda: self.fdb.spop(queue_key)
		enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x))

		if action == "traverse":
			enqueue(root)
			while not queue_empty():
				current = dequeue()
				print(current)
				# Skip empty/blank entries and anything already expanded.
				if current and current.strip() and not seen(current):
					visit(current)
					result = ex.getAllFromCategory(current)
					self.updateBatch(batch, type = neo4j.Node, node = {'name': current, 'class': self.CATEGORY})
					if pages:
						for page in result['pages']:
							print("{0}\tp:{1}".format(current[:15], page))
							self.incr_rel(page, current, self.CATEGORY_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': page, 'class': self.ARTICLE})
							links = ex.getWikiLinks(page)
							for a in links:
								print("{0}\tp:{1}\t{2}".format(current[:15], page, a))
								self.incr_rel(a, page, self.SIBLING_REL)
								self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE})
					if subcategories:
						for subcat in result['categories']:
							print("{0}\tc:{1}".format(current, subcat))
							self.incr_rel(subcat, current, self.SUBCAT_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': subcat, 'class': self.CATEGORY})
							enqueue(subcat)
		elif action == "crawl":
			enqueue(root)
			while not queue_empty():
				topic = dequeue()
				if topic and topic.strip() and not seen(topic):
					visit(topic)
					result = ex.extract(topic)
					# Counts visited topics, not link levels (see NOTE in the docstring).
					depth -= 1
					self.updateBatch(batch, type = neo4j.Node, node = {'name': topic, 'class': result['type']})
					if result['type'] == self.CATEGORY:
						pass
					elif result['type'] == self.ARTICLE:
						for a in result['links']:
							self.incr_rel(a, topic, self.SIBLING_REL)
							# Two spaces reproduce the output of `print "adding: ", a`.
							print("adding:  %s" % (a,))
							self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE})
							if depth > 0:
								enqueue(a)
						for c in result['categories']:
							# Relate the article to category c itself, not to the
							# leftover link variable from the loop above. The
							# (article, category) order mirrors the traverse branch.
							self.incr_rel(topic, c, self.CATEGORY_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': c, 'class': self.CATEGORY})
					elif result['type'] == self.DISAMBIGUATION:
						for a in result['links']:
							self.incr_rel(a, topic, self.DISAMB_REL)
							self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.DISAMBIGUATION})
		print("FINISHED WITH THE NODES...")
		for k in self.fdb.smembers(self.rel_key):
			print("REL: %s" % (k,))
			try:
				# Key layout is "<rel>:<name1>:<name2>".
				# NOTE(review): a ':' inside name1 (e.g. a namespaced title)
				# would be split in the wrong place -- verify against incr_rel.
				nodes = k.split(":", 2)
				rel = nodes[0]
				n1 = self.node_index.get('name', nodes[1])[0]
				n2 = self.node_index.get('name', nodes[2])[0]
				self.updateBatch(batch, type = neo4j.Relationship, rel = {'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2})
			except Exception as e:
				# Best effort: a malformed key or a missing index entry is
				# reported and skipped so the remaining relationships still load.
				print("REL EXCEPTION:  %s" % (e,))
		print("DONE>>>>>>>>>>>>>>>")
開發者ID:saurabhkb,項目名稱:tailor,代碼行數:76,代碼來源:crawler.py


注:本文中的extractor.Extractor.getWikiLinks方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。