This article collects typical usage examples of the SplashRequest method from the Python scrapy_splash module. If you are wondering how exactly to use scrapy_splash.SplashRequest, what it does, or what real code that uses it looks like, the curated method examples here may help. You can also explore further usage examples from the module the method belongs to, scrapy_splash.
The text below presents 15 code examples of the scrapy_splash.SplashRequest method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
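All of the examples below assume that scrapy-splash has been wired into the project's Scrapy settings and that a Splash instance is reachable. A minimal settings.py sketch, following the scrapy-splash README (the Splash URL is an assumption; point it at your own Splash instance):

# settings.py (sketch): enabling scrapy-splash in a Scrapy project.
# The URL below is an assumption; use the address of your Splash instance.
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Make the duplicate filter and HTTP cache aware of Splash arguments.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'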
Example 1: parse
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
    # print('=========================', response.url)
    pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')
    # print('+++++++++++++++++++++++++', pattern_list.findall(response.text))
    '''
    for item_list in pattern_list.findall(response.body):
        req = Request(url=item_list, callback=self.parse_list)
        yield req
    '''
    '''
    req = Request(url='http://www.mogujie.com/book/clothing/50249/', callback=self.parse_list, meta={
        'splash': {
            'endpoint': 'render.html'
        },
        # 'dont_send_headers': True,
    })
    '''
    # response.text is the decoded body, so a str regex can be applied to it.
    for item_list in pattern_list.findall(response.text):
        # req = SplashRequest(url='http://www.mogujie.com/book/clothing/50249/', callback=self.parse_list)
        req = SplashRequest(url=item_list, callback=self.parse_list)
        yield req
Example 2: parse
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
    ls = response.xpath('//li[@data-sku]/@data-sku').extract()
    for urltail in ls:
        url = urljoin('https://item.jd.com/1665416.html', urltail) + '.html'
        # url = urljoin('https://item.jd.com', urltail)
        yield SplashRequest(url, callback=self.item_parse, args={'wait': 0.5})
    # next_pages = []
    # a = range(1, 201, 2)
    # b = range(1, 5939, 60)
    # for i in range(100):
    #     str_a = str(a[i])
    #     str_b = str(b[i])
    #     print(str_a, str_b)
    #     time.sleep(10)
    # for new_url in next_pages:
Example 3: parse_list
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
    url = response.meta['splash']['args']['url']
    pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')
    if pattern.match(url):
        page = int(pattern.split(url)[1])
        url = pattern.findall(url)[0]
        page += 1
        url = url + str(page)
    else:
        url = url + '/2'
    print('+++++++++++++++++++++++++ Next url:', url)
    req = SplashRequest(url=url, callback=self.parse_list)
    yield req
    pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
    for item_url in pattern_detail.findall(response.text):
        req = Request(url=item_url, callback=self.parse_item)
        yield req
Example 4: parse_list
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
    # print('+++++++++++++++++++++++++443', response.url)
    url = response.meta['splash']['args']['url']
    print('&&&&&&&&&&&&&&&&&&&&&&&&&', response.status, url)
    pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')
    if pattern.match(url):
        page = int(pattern.split(url)[1])
        url = pattern.findall(url)[0]
        page += 1
        url = url + str(page)
    else:
        url = url + '/2'
    print('+++++++++++++++++++++++++', url)
    req = SplashRequest(url=url, callback=self.parse_list)
    yield req
    # print('+++++++++++++++++++++++++', response.url)
    pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
    # print('==========================', len(pattern_detail.findall(response.text)))
    for item_url in pattern_detail.findall(response.text):
        req = SplashRequest(url=item_url, callback=self.parse_item)
        yield req
Example 5: media_request
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def media_request(self, url):
    kwargs = dict(
        url=url,
        priority=-2,
        meta={'download_slot': (
            '{} documents'.format(urlsplit(url).netloc)),
        },
    )
    if using_splash(self.crawler.settings):
        return SplashRequest(
            endpoint='execute',
            args={'lua_source': self.lua_source},
            slot_policy=SlotPolicy.SCRAPY_DEFAULT,
            **kwargs)
    else:
        return Request(**kwargs)
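Example 5 routes the request through Splash's 'execute' endpoint, which runs a Lua script supplied as args['lua_source']; the script itself (self.lua_source) is not part of the snippet. A minimal sketch of what such a script could look like, kept as a Python string constant (this script is an illustrative assumption, not the project's actual lua_source):

# Hypothetical Splash Lua script for the 'execute' endpoint; the real
# self.lua_source used in Example 5 is not shown in the original snippet.
LUA_SOURCE = """
function main(splash, args)
    -- Load the requested URL, give the page time to render, then
    -- hand the rendered HTML back to Scrapy as the response body.
    assert(splash:go(args.url))
    assert(splash:wait(0.5))
    return splash:html()
end
"""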
Example 6: test_autologin_request
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None
    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL')
Example 7: start_requests
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url=url, callback=self.parse, endpoint='render.html', args={'wait': 0.5})
Example 8: parse
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
    pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')
    for item_list in pattern_list.findall(response.text):
        req = SplashRequest(url=item_list, callback=self.parse_list)
        yield req
Example 9: parse_item
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_item(self, response):
    mongo = self.db.url
    # url = response.meta['splash']['args']['url']  # Used for SplashRequest
    url = response.url
    url_trim = url.split('?')[0]
    if mongo.find_one({"url": url_trim}):
        print("&&&&&&&&&&&&&&&&&&&&&&&&& This URL has been crawled &&&&&&&&&&&&&&&&&&&&&&&&&")
        return
    # Insert the new link into MongoDB
    newone = {
        "url": url_trim,
        "time": time.time(),
    }
    mongo.insert_one(newone)
    page = Selector(response)
    title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
    images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
    availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
    status = response.status
    item = FashionItem()
    item['url'] = url_trim
    item['title'] = title.encode('utf-8')
    item['images'] = images
    item['availability'] = availability.encode('utf-8')
    item['status'] = status
    return item
Example 10: start_requests
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
    yield SplashRequest('http://quotes.toscrape.com/js')
Example 11: parse
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
    for quote in response.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').extract_first(),
            'author': quote.css('span small::text').extract_first(),
            'tags': quote.css('div.tags a.tag::text').extract(),
        }
    next_page = response.css('li.next > a::attr(href)').extract_first()
    if next_page:
        yield SplashRequest(response.urljoin(next_page))
Example 12: test_dupe_filter_with_splash
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_dupe_filter_with_splash():
    dupe_filter = DupeFilter()
    url_fp = lambda url: dupe_filter.request_fingerprint(SplashRequest(url))
    assert url_fp('http://example.com') == \
        url_fp('https://example.com')
    assert url_fp('http://www.example.com/foo?a=1&b=1') == \
        url_fp('http://example.com/foo?b=1&a=1')
    assert url_fp('http://example.com/foo') != \
        url_fp('http://example.com/bar')
    assert url_fp('http://www.example.com/foo?a=1&b=1') == \
        url_fp('http://example.com/foo?b=1&a=1')
    assert url_fp('http://www.example.com/foo#a') != \
        url_fp('http://example.com/foo#b')
Example 13: make_request
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def make_request(
        self, url, callback=None, meta=None, cls=None, **kwargs):
    callback = callback or self.parse
    cls = cls or (SplashRequest if self.use_splash else Request)
    if self.use_splash:
        settings = self.settings
        splash_args = {
            'lua_source': self.lua_source,
            'js_source': self.js_source,
            'run_hh': settings.getbool('RUN_HH'),
            'return_png': settings.getbool('SCREENSHOT'),
            'images_enabled': settings.getbool('IMAGES_ENABLED'),
        }
        for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                  'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
            if self.settings.get(s):
                splash_args[s.lower()] = self.settings.getint(s)
        if self.settings.getbool('ADBLOCK'):
            splash_args['filters'] = 'fanboy-annoyance,easylist'
        if self.settings.getbool('FORCE_TOR'):
            splash_args['proxy'] = 'tor'
        kwargs.update(dict(
            args=splash_args,
            endpoint='execute',
            cache_args=['lua_source', 'js_source'],
        ))
    meta = meta or {}
    meta['avoid_dup_content'] = True
    return cls(url, callback=callback, meta=meta, **kwargs)
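A hypothetical call site for the helper in Example 13, shown only to illustrate that make_request transparently yields either a SplashRequest or a plain Request depending on use_splash; the method below and the parse_page callback name are assumptions, not part of the original code:

# Hypothetical usage of make_request from Example 13 inside the same spider;
# parse_page is an assumed callback name, not from the original snippet.
def start_requests(self):
    for url in self.start_urls:
        yield self.make_request(url, callback=self.parse_page)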
Example 14: start_requests
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url=url,
                            callback=self.parse,
                            endpoint='render.html',
                            args={'wait': 0.5})
Example 15: splash_request
# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def splash_request(*args, **kwargs):
    kwargs['endpoint'] = 'execute'
    splash_args = kwargs.setdefault('args', {})
    splash_args['lua_source'] = LUA_SOURCE
    return SplashRequest(*args, **kwargs)
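The wrapper in Example 15 forces every request through the 'execute' endpoint with a shared LUA_SOURCE script, while keeping any 'args' the caller passes in. A hypothetical way a spider might call it (the URL, callback, and extra 'wait' argument are illustrative assumptions):

# Hypothetical usage of the splash_request wrapper from Example 15; the URL,
# callback, and 'wait' value are illustrative, not from the original code.
# Caller-supplied args are preserved and merged with the injected lua_source.
def start_requests(self):
    yield splash_request('http://example.com', callback=self.parse,
                         args={'wait': 1.0})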