本文整理汇总了Python中extractor.Extractor.getAllFromCategory方法的典型用法代码示例。如果您正苦于以下问题：Python Extractor.getAllFromCategory方法的具体用法？Python Extractor.getAllFromCategory怎么用？Python Extractor.getAllFromCategory使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类extractor.Extractor的用法示例。
在下文中一共展示了Extractor.getAllFromCategory方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: spider
# 需要导入模块: from extractor import Extractor [as 别名]
# 或者: from extractor.Extractor import getAllFromCategory [as 别名]
def spider(self, root, pages=True, subcategories=True, action="traverse", preclean=False, depth=1):
    """Discover wiki nodes starting from ``root`` and add them to a graph batch.

    The method runs in two phases:

    1. Node discovery, driven by ``action``. A work queue and a "seen" set
       are kept in ``self.fdb`` under the keys ``"URL_QUEUE"`` and
       ``"URL_SEEN"``. Neither key is cleared here, so whatever the store
       already holds under them takes part in the run.
    2. Relationship creation. Every member of the ``self.rel_key`` set is
       parsed as ``"<rel>:<name1>:<name2>"`` and turned into a relationship
       of weight 1 between the two indexed nodes.

    Args:
        root: First name placed on the work queue.
        pages: Traverse mode only. When true, each page of a visited
            category is recorded, together with every link
            ``getWikiLinks`` returns for that page.
        subcategories: Traverse mode only. When true, each subcategory of a
            visited category is recorded and enqueued.
        action: ``"traverse"`` walks categories via
            ``Extractor.getAllFromCategory``; ``"crawl"`` follows links via
            ``Extractor.extract``. Any other value skips phase 1.
        preclean: When true, ``self.graphdb.clear()`` is called first.
        depth: Crawl mode only. Decremented once per visited topic (not per
            link level); article links are enqueued only while it is > 0.

    Returns:
        None. Results are delivered through ``self.updateBatch``,
        ``self.incr_rel`` and the progress lines printed to stdout.

    NOTE(review): nothing in this method submits ``batch``; presumably
    ``self.updateBatch`` takes care of that -- confirm.
    NOTE(review): ``self.fdb`` is used through a Redis-style set API
    (``sadd``/``spop``/``scard``/``sismember``/``smembers``). If it is Redis,
    ``spop`` removes an arbitrary member, so visiting order is not defined.
    """
    # Every print below is a call with ONE pre-formatted argument: that form
    # produces the same output under Python 2's print statement and under
    # Python 3's print function. "%s %s" reproduces the single space the
    # print statement inserted between comma-separated items.
    if preclean:
        self.graphdb.clear()

    seen_key = "URL_SEEN"
    queue_key = "URL_QUEUE"
    ex = Extractor()
    batch = neo4j.WriteBatch(self.graphdb)

    # Thin wrappers around the key/value store backing the queue and seen set.
    queue_empty = lambda: self.fdb.scard(queue_key) == 0
    seen = lambda x: self.fdb.sismember(seen_key, x)
    visit = lambda x: self.fdb.sadd(seen_key, x)
    dequeue = lambda: self.fdb.spop(queue_key)
    enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x))

    if action == "traverse":
        enqueue(root)
        while not queue_empty():
            current = dequeue()
            print(current)
            # Skip empty/blank names and anything already visited.
            if not current or not current.strip() or seen(current):
                continue
            visit(current)
            result = ex.getAllFromCategory(current)
            self.updateBatch(batch, type=neo4j.Node, node={'name': current, 'class': self.CATEGORY})
            if pages:
                for page in result['pages']:
                    print("{0}\tp:{1}".format(current[:15], page))
                    self.incr_rel(page, current, self.CATEGORY_REL)
                    self.updateBatch(batch, type=neo4j.Node, node={'name': page, 'class': self.ARTICLE})
                    links = ex.getWikiLinks(page)
                    for a in links:
                        print("{0}\tp:{1}\t{2}".format(current[:15], page, a))
                        self.incr_rel(a, page, self.SIBLING_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
            if subcategories:
                for subcat in result['categories']:
                    print("{0}\tc:{1}".format(current, subcat))
                    self.incr_rel(subcat, current, self.SUBCAT_REL)
                    self.updateBatch(batch, type=neo4j.Node, node={'name': subcat, 'class': self.CATEGORY})
                    # Only subcategories are followed; pages are leaves here.
                    enqueue(subcat)
    elif action == "crawl":
        enqueue(root)
        while not queue_empty():
            topic = dequeue()
            if not topic or not topic.strip() or seen(topic):
                continue
            visit(topic)
            result = ex.extract(topic)
            # The budget shrinks per visited topic, so depth=1 means "expand
            # the root only": its links are recorded but never enqueued.
            depth -= 1
            self.updateBatch(batch, type=neo4j.Node, node={'name': topic, 'class': result['type']})
            if result['type'] == self.CATEGORY:
                # Category topics are recorded as nodes but not expanded.
                pass
            elif result['type'] == self.ARTICLE:
                for a in result['links']:
                    self.incr_rel(a, topic, self.SIBLING_REL)
                    print("%s %s" % ("adding: ", a))
                    self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
                    if depth > 0:
                        enqueue(a)
                for c in result['categories']:
                    # Fixed: this used to pass ``a`` -- the stale variable of
                    # the links loop above -- which tied every category
                    # relationship to the last link and raised NameError for
                    # articles without links. Argument order is unchanged.
                    self.incr_rel(c, topic, self.CATEGORY_REL)
                    self.updateBatch(batch, type=neo4j.Node, node={'name': c, 'class': self.CATEGORY})
            elif result['type'] == self.DISAMBIGUATION:
                for a in result['links']:
                    self.incr_rel(a, topic, self.DISAMB_REL)
                    self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.DISAMBIGUATION})

    print("FINISHED WITH THE NODES...")
    for k in self.fdb.smembers(self.rel_key):
        print("%s %s" % ("REL:", k))
        try:
            # maxsplit=2 keeps any further ':' inside the second node name.
            nodes = k.split(":", 2)
            rel = nodes[0]
            n1 = self.node_index.get('name', nodes[1])[0]
            n2 = self.node_index.get('name', nodes[2])[0]
            self.updateBatch(batch, type=neo4j.Relationship, rel={'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2})
        except Exception as e:
            # Deliberately best-effort: a malformed key or a node missing
            # from the index is reported and skipped so the remaining
            # relationships are still processed.
            print("%s %s" % ("REL EXCEPTION: ", e))
    print("DONE>>>>>>>>>>>>>>>")