This article collects typical usage examples of the scrapy.http.Request method in Python. If you are unsure what http.Request does or how to call it, the hand-picked code examples below should help; you can also explore further usage of the enclosing scrapy.http module.
The following sections show 15 code examples of http.Request, sorted by popularity by default.
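Before the examples, here is a minimal, hedged sketch of how scrapy.http.Request is typically constructed; the URL, callback/errback names, and meta key are placeholders rather than values taken from any example below.

# Minimal sketch (placeholder URL and callback names, not from the examples below)
from scrapy.http import Request

def start_requests(self):
    yield Request(
        url='http://example.com',       # the page to fetch
        callback=self.parse,            # called with the Response on success
        errback=self.handle_error,      # called on download/HTTP errors
        meta={'depth': 0},              # arbitrary data carried to the callback
        dont_filter=True,               # skip the duplicate-request filter
    )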
Example 1: start_requests
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def start_requests(self):
    url = 'https://www.assetstore.unity3d.com/login'
    yield Request(
        url=url,
        headers={
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.assetstore.unity3d.com',
            'Referer': 'https://www.assetstore.unity3d.com/en/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
            'X-Kharma-Version': '0',
            'X-Requested-With': 'UnityAssetStore',
            'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
        },
        meta={},
        dont_filter=True,
        callback=self.get_unity_version,
        errback=self.error_parse,
    )
Example 2: parse_1
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse_1(self, response):
    info('Parse ' + response.url)
    # sel = Selector(response)
    # v = sel.css('.gs_ggs a::attr(href)').extract()
    # import pdb; pdb.set_trace()
    x = self.parse_with_rules(response, self.list_css_rules, dict)
    items = []
    if len(x) > 0:
        items = x[0]['.gs_r']
    pp.pprint(items)
    import pdb; pdb.set_trace()
    # return self.parse_with_rules(response, self.css_rules, googlescholarItem)
    for item in items:
        if item['related-url'] == '' or item['related-type'] != '[PDF]':
            continue
        url = item['related-url']
        info('pdf-url: ' + url)
        yield Request(url, callback=self.save_pdf)
Example 3: request_to_dict
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def request_to_dict(self, request):
    '''
    Convert a Request object to a dict.
    Modified from scrapy.utils.reqser.
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be bound methods of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
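A hedged sketch of the reverse direction, rebuilding a Request from such a dict, may make the round trip clearer; the method name request_from_dict and the getattr-based callback lookup are assumptions that mirror the comment above, not code from the source.

def request_from_dict(self, req_dict, spider):
    # Sketch of the inverse of request_to_dict; assumes callback/errback
    # names refer to bound methods of the given spider.
    callback = req_dict['callback']
    errback = req_dict['errback']
    return Request(
        url=req_dict['url'],
        method=req_dict['method'],
        headers=req_dict['headers'],
        body=req_dict['body'],
        cookies=req_dict['cookies'],
        meta=req_dict['meta'],
        encoding=req_dict['_encoding'],
        priority=req_dict['priority'],
        dont_filter=req_dict['dont_filter'],
        callback=None if callback is None else getattr(spider, callback),
        errback=None if errback is None else getattr(spider, errback),
    )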
Example 4: process_spider_output
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def process_spider_output(self, response, result, spider):
    '''
    Ensures the meta data from the response is passed
    through in any Requests generated from the spider
    '''
    self.logger.debug("processing meta passthrough middleware")
    for x in result:
        # only operate on requests
        if isinstance(x, Request):
            self.logger.debug("found request")
            # pass along all known meta fields, only if
            # they were not already set in the spider's new request
            for key in list(response.meta.keys()):
                if key not in x.meta:
                    x.meta[key] = response.meta[key]
        yield x
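For a spider middleware like this one to run, it has to be registered under Scrapy's SPIDER_MIDDLEWARES setting; a minimal sketch follows, assuming the class is named MetaPassthroughMiddleware and lives in myproject/middlewares.py (both names and the priority are placeholders).

# settings.py (sketch; module path, class name, and priority are assumptions)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.MetaPassthroughMiddleware': 100,
}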
Example 5: evaluate
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')
    raw_item_count = 0
    request_count = 0
    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1
    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example 6: get_request
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def get_request(self):
    req = None
    # required
    req = Request('http://ex.com')
    req.meta['crawlid'] = "abc123"
    req.meta['appid'] = "myapp"
    req.meta['url'] = "http://ex.com"
    req.meta['spiderid'] = "link"
    req.meta["attrs"] = None
    req.meta["allowed_domains"] = None
    req.meta["allow_regex"] = None
    req.meta["deny_regex"] = None
    req.meta["deny_extensions"] = None
    req.meta['curdepth'] = 0
    req.meta["maxdepth"] = 0
    req.meta['priority'] = 0
    req.meta['retry_times'] = 0
    req.meta['expires'] = 0
    req.meta['useragent'] = None
    req.meta['cookie'] = None
    return req
Example 7: parse_ph_key
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse_ph_key(self, response):
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    # logging.info(selector)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        # logging.debug(viewkey)
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0], callback=self.parse_ph_info)
    url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
    # logging.debug(url_next)
    if url_next:
        # if self.test:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)
        # self.test = False
Example 8: init_request
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def init_request(self):
    """This function is called before crawling starts."""
    # Do not start a request on error,
    # simply return nothing and quit scrapy
    if self.abort:
        return
    logging.info('All set, start crawling with depth: ' + str(self.max_depth))
    # Do a login
    if self.config['login']['enabled']:
        # Start with login first
        logging.info('Login required')
        return Request(url=self.login_url, callback=self.login)
    else:
        # Start with the parse function
        logging.info('No login required')
        return Request(url=self.base_url, callback=self.parse)

#----------------------------------------------------------------------
Example 9: parse
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse(self, response):
    for a in response.xpath("//dd/a"):
        url = a.xpath("./@href").extract()[0]
        text = a.xpath("./text()").extract()[0]
        items = text.split(u'升级软件')
        version = items[-1].strip()
        product = items[0].strip().split(u'(')[0].split(' ')[0]
        yield Request(
            url=self.base_url.format(url),
            headers={"Referer": response.url},
            meta={
                "product": product,
                "version": version,
            },
            callback=self.parse_product)
Example 10: parse_product
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse_product(self, response):
    # Find the "Software and Firmware" tab link to get to the product-range-download page
    meta = response.meta
    meta['dont_redirect'] = True
    for link in response.css('a.tab-link'):
        href = link.xpath('@href').extract_first()
        if href.endswith(u'software-firmware-tab'):
            logging.debug("Requesting SW+FW page for %s at %s",
                          response.meta['product'], urlparse.urljoin(response.url, href))
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta=meta,
                callback=self.parse_product_sw_fw)
            break
    else:
        logging.debug("Did not find a 'Software and Firmware' tab for %s",
                      response.meta['product'])
Example 11: parse
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse(self, response):
    if not response.xpath(
            "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
        for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"category": category},
                                            callback=self.parse)
    elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
        for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"subCategory": subcategory},
                                            callback=self.parse)
    else:
        for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/us/support-product?pid=%s" % (product)),
                headers={"Referer": response.url},
                callback=self.parse_product)
Example 12: parse_json
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse_json(self, response):
    json_response = json.loads(response.body_as_unicode())
    if json_response:
        for entry in json_response:
            yield Request(
                url=urlparse.urljoin(
                    self.base_path, "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % entry["id"]),
                meta={"cid": entry["id"]},
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_json)
    else:
        yield Request(
            url=urlparse.urljoin(
                self.base_path, "phppage/down-load-model-list.html?showEndLife=false&catid={}&appPath=us".format(response.meta["cid"])),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_products)
Example 13: parse_product_version
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def parse_product_version(self, response):
    # <div class="hardware-version">
    if response.xpath("//div[@class=\"hardware-version\"]").extract():
        for i in [1, 2]:
            yield Request(
                url=response.url.replace(".html", "-V{}.html".format(i)),
                meta={"product": response.meta['product'],
                      "version": "V{}".format(int(i) + 1),
                      },
                callback=self.parse_product)
    else:  # only for v1?
        yield Request(
            url=response.url + "?again=true",
            meta={"product": response.meta['product'],
                  "version": "V1"
                  },
            callback=self.parse_product)
Example 14: start_requests
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            callback=self.parse,
            errback=self.parse_error,
            dont_filter=True,
            meta={
                "current_request_traversal_page_count": 0,
                "spider_config": self.spider_config,
                "manifest": self.manifest
            }
        )
Example 15: _build_request
# Required module: from scrapy import http [as alias]
# Or: from scrapy.http import Request [as alias]
def _build_request(self, rule, link):
    headers = {}
    user_agent_header = os.environ.get("WCP_REQUEST_HEADERS_USER_AGENT")
    if user_agent_header:
        headers = {"User-Agent": user_agent_header}
    r = Request(url=link.url, headers=headers, callback=self._response_downloaded)
    r.meta.update(rule=rule, link_text=link.text)
    return r
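Since the User-Agent override is read from the WCP_REQUEST_HEADERS_USER_AGENT environment variable, one hedged way to exercise it from a test or launcher script (the value below is a placeholder) is:

# Sketch: set the variable before the crawl starts (value is a placeholder)
import os
os.environ['WCP_REQUEST_HEADERS_USER_AGENT'] = 'Mozilla/5.0 (compatible; ExampleBot/1.0)'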