This article collects typical usage examples of the Request.meta['crawldepth'] attribute from the Python class scrapy.http.request.Request. If you are unsure what Request.meta['crawldepth'] does or how to use it, the curated code example below may help. You can also read further about the containing class, scrapy.http.request.Request.
The section below presents 1 code example of Request.meta['crawldepth'], sorted by popularity by default.
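Before the full example, a minimal sketch of the underlying pattern may help: anything stored in a Request's meta dict travels with that request and is available again as response.meta in the callback, which is how a crawl depth can be handed from one request to the next. The spider name, start URL, and the 'crawldepth' key in this sketch are illustrative assumptions, and it uses response.xpath from the current Scrapy selector API rather than the older HtmlXPathSelector seen in Example 1.

from scrapy.http.request import Request
from scrapy.spiders import Spider

class DepthExampleSpider(Spider):
    # Hypothetical spider used only to illustrate the meta['crawldepth'] pattern.
    name = 'depth_example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # The value stored under 'crawldepth' on the originating Request
        # reappears here in response.meta; default to 2 if none was attached.
        depth = response.meta.get('crawldepth', 2)
        if depth > 0:
            for href in response.xpath('//a/@href').extract():
                request = Request(response.urljoin(href), callback=self.parse)
                # Attach the reduced depth so the next callback sees it.
                request.meta['crawldepth'] = depth - 1
                yield request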
Example 1: parse_page
# Required import: from scrapy.http.request import Request [as alias]
# Or: from scrapy.http.request.Request import meta['crawldepth'] [as alias]
# Note: the snippet also relies on names not shown here (inferred from its body):
# re, urlparse, datetime, scrapy's log, HtmlResponse, HtmlXPathSelector,
# boilerpipe's Extractor, and the project's BiffleItem, keywordlist, and the
# url_bf filter used for URL de-duplication.
def parse_page(self, response):
    if 'crawldepth' in response.meta:
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)
    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return
    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)
    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()
    # Eliminate duplicates
    keywordset = set(keywordlist)
    found_list = []
    for keyword in keywordset:  # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text):
            found_list.append(keyword)
    # Parse this page
    item = BiffleItem()
    if len(found_list) > 0:
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item
    if depth > 0:
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if l in url_bf:
                pass
                #log.msg('Duplicate URL found: %s' % l, level=log.INFO)
            else:
                url_bf.add(l)
                #log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
                # Decrement depth for next layer of links
                #callback = lambda response, depth = depth: self.parse_page(response, depth)
                callback = lambda response: self.parse_page(response)
                request = Request(l, callback=callback)
                request.meta['crawldepth'] = depth
                yield request