This page collects typical usage examples of scrapy.Request.meta["urls"] in Python, i.e. storing a list of URLs under the "urls" key of a request's meta dict. If you are unsure how Request.meta["urls"] is used in practice, the curated examples below should help; you can also read further about the enclosing class, scrapy.Request.
Below are five code examples that use Request.meta["urls"], sorted by popularity by default.
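Before the examples, here is a minimal self-contained sketch of the mechanism they all rely on: anything stored in request.meta travels with the request and is available again as response.meta inside the callback. The spider name and URLs below are invented purely for illustration.

from scrapy import Request, Spider


class MetaDemoSpider(Spider):
    # Hypothetical spider, shown only to illustrate the meta round trip.
    name = "meta_demo"
    start_urls = ["https://example.org/records"]

    def parse(self, response):
        urls_in_record = ["https://example.org/a", "https://example.org/b"]
        request = Request(urls_in_record[0], callback=self.parse_detail)
        request.meta["urls"] = urls_in_record  # stash the list on the request
        yield request

    def parse_detail(self, response):
        # response.meta is a shortcut to the originating request's meta dict
        self.logger.info("carried urls: %s", response.meta["urls"])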
Example 1: parse
# Required imports: import json; from scrapy import Request
# Demonstrates: Request.meta["urls"]
def parse(self, response):
    """Parse Philpapers JSON file into a HEP record."""
    jsonresponse = json.loads(response.body_as_unicode())
    for jsonrecord in jsonresponse:
        urls_in_record = jsonrecord.get("links")
        if urls_in_record:
            link = urls_in_record[0]
            request = Request(link, callback=self.scrape_for_pdf)
            request.meta["urls"] = urls_in_record
            request.meta["jsonrecord"] = jsonrecord
            yield request
        else:
            response.meta["urls"] = []
            response.meta["jsonrecord"] = jsonrecord
            yield self.build_item(response)
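None of the examples reproduce the scrape_for_pdf callback they defer to. The sketch below shows what such a callback for Example 1 might look like; the body is an assumption for illustration, not the original project's implementation.

def scrape_for_pdf(self, response):
    # Hypothetical callback body; the real spider's logic differs.
    # Everything stored on the request in parse() comes back via response.meta.
    jsonrecord = response.meta["jsonrecord"]
    urls_in_record = response.meta["urls"]
    # ... inspect the splash page for a PDF link here ...
    return self.build_item(response)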
Example 2: parse_node
# Required import: from scrapy import Request
# Demonstrates: Request.meta["urls"]
def parse_node(self, response, node):
    """Parse Alpha web page into a HEP record."""
    authors = self.get_authors(node)
    title = node.xpath("./td[3]/span/span/text()").extract()
    date = node.xpath("./td[4]/span/span/text()").extract()
    urls = self.get_splash_links(node)
    response.meta["node"] = node
    response.meta["authors"] = authors
    response.meta["title"] = title
    response.meta["date"] = date
    if not urls:
        return self.build_item(response)
    request = Request(urls[0], callback=self.scrape_for_pdf)
    request.meta["node"] = node
    request.meta["authors"] = authors
    request.meta["urls"] = urls
    request.meta["title"] = title
    request.meta["date"] = date
    return request
Example 3: parse_node
# Required import: from scrapy import Request
# Demonstrates: Request.meta["urls"]
def parse_node(self, response, node):
    """Iterate through all the record nodes in the XML.

    With each node it checks whether a direct link exists, and sends
    a request to scrape the direct link or calls build_item() to build
    the HEPrecord.
    """
    urls_in_record = self.get_urls_in_record(node)
    direct_link = self.find_direct_links(urls_in_record)
    if not direct_link and urls_in_record:
        # Probably all links lead to the same place, so take the first
        link = urls_in_record[0]
        request = Request(link, callback=self.scrape_for_pdf)
        request.meta["urls"] = urls_in_record
        request.meta["record"] = node.extract()
        return request
    elif direct_link:
        response.meta["direct_link"] = direct_link
        response.meta["urls"] = urls_in_record
        response.meta["record"] = node.extract()
        return self.build_item(response)
Example 4: parse_node
# Required import: from scrapy import Request
# Demonstrates: Request.meta["urls"]
def parse_node(self, response, node):
    """Parse MAGIC web page into a HEP record."""
    urls = self.get_splash_links(node)
    title = node.xpath(".//a/text()").extract_first()
    author_date = node.xpath(".//br/following-sibling::text()").extract()
    try:
        date = author_date[1].strip().strip("()")
    except IndexError:
        date = ''
    if not urls:
        response.meta["title"] = title
        response.meta["date"] = date
        return self.build_item(response)
    request = Request(urls[0], callback=self.scrape_for_pdf)
    request.meta["urls"] = urls
    request.meta["title"] = title
    request.meta["date"] = date
    request.meta["handle_httpstatus_list"] = self.ERROR_CODES
    return request
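Example 4 also sets "handle_httpstatus_list", one of the meta keys that Scrapy itself interprets: responses with the listed status codes are passed to the callback instead of being filtered out by HttpErrorMiddleware. Below is a small sketch of how the callback might use that; the contents of ERROR_CODES are assumed here.

def scrape_for_pdf(self, response):
    # Hypothetical handling; assumes something like ERROR_CODES = [404, 500].
    # Because handle_httpstatus_list was set, error responses reach this
    # callback instead of being dropped by HttpErrorMiddleware.
    if response.status in self.ERROR_CODES:
        # Fall back to the metadata already carried in response.meta.
        return self.build_item(response)
    # ... normal splash-page handling for 200 responses ...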
Example 5: parse_node
# Required import: from scrapy import Request
# Demonstrates: Request.meta["urls"]
def parse_node(self, response, node):
    """Iterate through all the record nodes in the XML.

    With each node it checks whether a splash page link exists, and sends
    a request to scrape the abstract or calls `build_item` to build
    the HEPrecord.
    """
    urls_in_record = self.get_urls_in_record(node)
    direct_links, splash_links = self.find_direct_links(urls_in_record)
    if not splash_links:
        response.meta["urls"] = urls_in_record
        response.meta["record"] = node.extract()
        if direct_links:
            response.meta["direct_links"] = direct_links
        return self.build_item(response)
    link = splash_links[0]
    request = Request(link, callback=self.scrape_for_abstract)
    request.meta["urls"] = urls_in_record
    request.meta["record"] = node.extract()
    if direct_links:
        request.meta["direct_links"] = direct_links
    return request