

Python scrapy_splash.SplashRequest Code Examples

This article collects typical usage examples of the Python scrapy_splash.SplashRequest method. If you are wondering what scrapy_splash.SplashRequest does, how to call it, or what it looks like in real projects, the curated code samples below may help. You can also explore further usage examples from the scrapy_splash package.


The following presents 15 code examples of the scrapy_splash.SplashRequest method, sorted by popularity.
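Before the examples, here is a minimal sketch of how SplashRequest is typically wired up. It assumes a Splash instance reachable at http://localhost:8050 and uses the standard scrapy-splash settings from its README; the spider name and URL are illustrative only.

# settings.py (assumed setup; adjust SPLASH_URL for your environment)
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# spider (hypothetical example)
import scrapy
from scrapy_splash import SplashRequest

class JsQuotesSpider(scrapy.Spider):
    name = 'js_quotes'  # hypothetical spider name

    def start_requests(self):
        # Ask Splash to render the page before it reaches parse()
        yield SplashRequest('http://quotes.toscrape.com/js',
                            callback=self.parse,
                            endpoint='render.html',
                            args={'wait': 0.5})

    def parse(self, response):
        # response now contains the DOM after JavaScript has executed
        for quote in response.css('span.text::text').extract():
            yield {'text': quote}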

Example 1: parse

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        #print '=========================', response.url
        pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')
        #print '+++++++++++++++++++++++++', pattern_list.findall(response.body)

        '''
        for item_list in pattern_list.findall(response.body):
            req = Request(url = item_list, callback = self.parse_list)
            yield req
        '''

        '''
        req = Request(url = 'http://www.mogujie.com/book/clothing/50249/', callback = self.parse_list, meta={
                'splash': {
                    'endpoint': 'render.html'
                },
                #'dont_send_headers': True,
        })
        '''

        for item_list in pattern_list.findall(response.body):
            #req = SplashRequest(url = 'http://www.mogujie.com/book/clothing/50249/', callback = self.parse_list)
            req = SplashRequest(url = item_list, callback = self.parse_list)
            yield req 
Developer: Xinghaoz, Project: first-crawler, Lines: 26, Source: mogujie_mac.py

Example 2: parse

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self,response):
        ls=response.xpath('//li[@data-sku]/@data-sku').extract()
        for urltail in ls:
            url=urljoin('https://item.jd.com/1665416.html',urltail)+'.html'        
            #url=urljoin('https://item.jd.com',urltail)  
            yield SplashRequest(url,callback=self.item_parse,args={'wait':0.5}) 

        # next_pages=[]
        # a=range(1,201,2)
        # b=range(1,5939,60)
         
        # for i in range(100):
        #     str_a=str(a[i])
        #     str_b=str(b[i])
        #     print str_a,str_b
        
            
           # time.sleep(10)
            
            
        # for new_url in next_pages: 
Developer: SaberAlexander, Project: multithread-crawler, Lines: 23, Source: JDSpider original.py

Example 3: parse_list

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
        url = response.meta['splash']['args']['url']
        pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')

        if (pattern.match(url)):
            page = int(pattern.split(url)[1])
            url = pattern.findall(url)[0]
            page += 1
            url = url + str(page)
        else:
            url = url + '/2'

        print '+++++++++++++++++++++++++ Next url:', url
        req = SplashRequest(url = url, callback = self.parse_list)
        yield req

        pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
        for item_url in pattern_detail.findall(response.body):
            req = Request(url = item_url, callback = self.parse_item)
            yield req 
Developer: Xinghaoz, Project: first-crawler, Lines: 22, Source: mogujie.py

Example 4: parse_list

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
        #print '+++++++++++++++++++++++++443', response.url
        url = response.meta['splash']['args']['url']
        print '&&&&&&&&&&&&&&&&&&&&&&&&&', response.status, url
        pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')

        if (pattern.match(url)):
            page = int(pattern.split(url)[1])
            url = pattern.findall(url)[0]
            page += 1
            url = url + str(page)
        else:
            url = url + '/2'

        print '+++++++++++++++++++++++++', url
        req = SplashRequest(url = url, callback = self.parse_list)
        yield req

        #print '+++++++++++++++++++++++++', response.url
        pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
        #print '==========================', len(pattern_detail.findall(response.body))
        for item_url in pattern_detail.findall(response.body):
            req = SplashRequest(url = item_url, callback = self.parse_item)
            yield req 
Developer: Xinghaoz, Project: first-crawler, Lines: 26, Source: mogujie_mac.py

Example 5: media_request

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def media_request(self, url):
        kwargs = dict(
            url=url,
            priority=-2,
            meta={'download_slot': (
                '{} documents'.format(urlsplit(url).netloc)),
            },
        )
        if using_splash(self.crawler.settings):
            return SplashRequest(
                endpoint='execute',
                args={'lua_source': self.lua_source},
                slot_policy=SlotPolicy.SCRAPY_DEFAULT,
                **kwargs)
        else:
            return Request(**kwargs) 
Developer: TeamHG-Memex, Project: undercrawler, Lines: 18, Source: media_pipeline.py

Example 6: test_autologin_request

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None

    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL') 
Developer: TeamHG-Memex, Project: autologin-middleware, Lines: 18, Source: test_integration.py

Example 7: start_requests

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, endpoint='render.html', args={'wait': 0.5}) 
Developer: JainulV, Project: myplanB, Lines: 5, Source: courses.py

Example 8: parse

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')

        for item_list in pattern_list.findall(response.body):
            req = SplashRequest(url = item_list, callback = self.parse_list)
            yield req 
Developer: Xinghaoz, Project: first-crawler, Lines: 8, Source: mogujie.py

Example 9: parse_item

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_item(self, response):
        mongo = self.db.url
        # url = response.meta['splash']['args']['url'] # Used for SplashRequest
        url = response.url
        url_trim = url.split('?')[0]
        if mongo.find_one({"url": url_trim}):
    	    print "&&&&&&&&&&&&&&&&&&&&&&&&& This URL has been crawled &&&&&&&&&&&&&&&&&&&&&&&&&"
    	    return

        # Insert the new link into MongoDB
        newone = {
            "url": url_trim,
            "time": time.time(),
        }
        mongo.insert_one(newone)

        page = Selector(response)
        title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
        images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
        availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
        status = response.status

        item = FashionItem()
        item['url'] = url_trim
        item['title'] = title.encode('utf-8')
        item['images'] = images
        item['availability'] = availability.encode('utf-8')
        item['status'] = status
        return item 
Developer: Xinghaoz, Project: first-crawler, Lines: 31, Source: mogujie.py

Example 10: start_requests

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        yield SplashRequest('http://quotes.toscrape.com/js') 
Developer: scrapinghub, Project: scrapy-training, Lines: 4, Source: quotes-js.py

Example 11: parse

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(response.urljoin(next_page)) 
Developer: scrapinghub, Project: scrapy-training, Lines: 12, Source: quotes-js.py

Example 12: test_dupe_filter_with_splash

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_dupe_filter_with_splash():
    dupe_filter = DupeFilter()
    url_fp = lambda url: dupe_filter.request_fingerprint(SplashRequest(url))
    assert url_fp('http://example.com') == \
           url_fp('https://example.com')
    assert url_fp('http://www.example.com/foo?a=1&b=1') == \
           url_fp('http://example.com/foo?b=1&a=1')
    assert url_fp('http://example.com/foo') != \
           url_fp('http://example.com/bar')
    assert url_fp('http://www.example.com/foo?a=1&b=1') == \
           url_fp('http://example.com/foo?b=1&a=1')
    assert url_fp('http://www.example.com/foo#a') != \
           url_fp('http://example.com/foo#b') 
Developer: TeamHG-Memex, Project: undercrawler, Lines: 15, Source: test_dupe_filter.py

Example 13: make_request

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def make_request(
            self, url, callback=None, meta=None, cls=None, **kwargs):
        callback = callback or self.parse
        cls = cls or (SplashRequest if self.use_splash else Request)
        if self.use_splash:
            settings = self.settings
            splash_args = {
                'lua_source': self.lua_source,
                'js_source': self.js_source,
                'run_hh': settings.getbool('RUN_HH'),
                'return_png': settings.getbool('SCREENSHOT'),
                'images_enabled': settings.getbool('IMAGES_ENABLED'),
            }
            for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                      'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
                if self.settings.get(s):
                    splash_args[s.lower()] = self.settings.getint(s)
            if self.settings.getbool('ADBLOCK'):
                splash_args['filters'] = 'fanboy-annoyance,easylist'
            if self.settings.getbool('FORCE_TOR'):
                splash_args['proxy'] = 'tor'
            kwargs.update(dict(
                args=splash_args,
                endpoint='execute',
                cache_args=['lua_source', 'js_source'],
            ))
        meta = meta or {}
        meta['avoid_dup_content'] = True
        return cls(url, callback=callback, meta=meta, **kwargs) 
Developer: TeamHG-Memex, Project: undercrawler, Lines: 31, Source: spiders.py

Example 14: start_requests

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait':0.5}) 
Developer: zseta, Project: scrapyfundamentals, Lines: 8, Source: JSScraper.py

Example 15: splash_request

# Required import: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def splash_request(*args, **kwargs):
    kwargs['endpoint'] = 'execute'
    splash_args = kwargs.setdefault('args', {})
    splash_args['lua_source'] = LUA_SOURCE
    return SplashRequest(*args, **kwargs) 
Developer: TeamHG-Memex, Project: autologin-middleware, Lines: 7, Source: splash.py


Note: The scrapy_splash.SplashRequest examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective authors, and copyright for the source code remains with them; please consult the license of the corresponding project before distributing or reusing the code. Do not reproduce this article without permission.