

Python scrapy_splash.SplashRequest Code Examples

This article collects typical usage examples of scrapy_splash.SplashRequest in Python. If you are wondering what scrapy_splash.SplashRequest does, how to use it, or what working code looks like, the curated examples below may help. You can also explore other usage examples from the scrapy_splash module, where this class is defined.


The following presents 15 code examples of scrapy_splash.SplashRequest, sorted by popularity by default.
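None of the snippets below will run until scrapy-splash is wired into the project settings. The sketch below follows the scrapy-splash README; the SPLASH_URL value is an assumption and should point at your own running Splash instance.

# settings.py -- minimal scrapy-splash wiring (sketch, not tied to any project below)
SPLASH_URL = 'http://localhost:8050'  # assumption: a Splash instance running locally

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'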

Example 1: parse

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        # Collect all category listing URLs embedded in the page.
        pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')

        '''
        # Alternative: plain Scrapy requests, without JavaScript rendering:
        for item_list in pattern_list.findall(response.text):
            req = Request(url=item_list, callback=self.parse_list)
            yield req
        '''

        '''
        # Alternative: drive Splash through request meta instead of SplashRequest:
        req = Request(url='http://www.mogujie.com/book/clothing/50249/', callback=self.parse_list, meta={
                'splash': {
                    'endpoint': 'render.html'
                },
                #'dont_send_headers': True,
        })
        '''

        for item_list in pattern_list.findall(response.text):
            req = SplashRequest(url=item_list, callback=self.parse_list)
            yield req
Author: Xinghaoz, Project: first-crawler, Lines: 26, Source: mogujie_mac.py

Example 2: parse

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        # Each product <li> exposes its SKU through the data-sku attribute.
        skus = response.xpath('//li[@data-sku]/@data-sku').extract()
        for urltail in skus:
            # Build the product detail URL from the SKU.
            url = urljoin('https://item.jd.com/1665416.html', urltail) + '.html'
            yield SplashRequest(url, callback=self.item_parse, args={'wait': 0.5})
Author: SaberAlexander, Project: multithread-crawler, Lines: 23, Source: JDSpider original.py

Example 3: parse_list

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
        url = response.meta['splash']['args']['url']
        pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')

        # If the URL already carries a page number, increment it;
        # otherwise start paginating from page 2.
        if pattern.match(url):
            page = int(pattern.split(url)[1])
            url = pattern.findall(url)[0]
            page += 1
            url = url + str(page)
        else:
            url = url + '/2'

        print('+++++++++++++++++++++++++ Next url:', url)
        req = SplashRequest(url=url, callback=self.parse_list)
        yield req

        # Queue every product detail page found on this listing page.
        pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
        for item_url in pattern_detail.findall(response.text):
            req = Request(url=item_url, callback=self.parse_item)
            yield req
Author: Xinghaoz, Project: first-crawler, Lines: 22, Source: mogujie.py

Example 4: parse_list

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_list(self, response):
        url = response.meta['splash']['args']['url']
        print('&&&&&&&&&&&&&&&&&&&&&&&&&', response.status, url)
        pattern = re.compile(r'http://www.mogujie.com/book/\w+/\d+/')

        # If the URL already carries a page number, increment it;
        # otherwise start paginating from page 2.
        if pattern.match(url):
            page = int(pattern.split(url)[1])
            url = pattern.findall(url)[0]
            page += 1
            url = url + str(page)
        else:
            url = url + '/2'

        print('+++++++++++++++++++++++++', url)
        req = SplashRequest(url=url, callback=self.parse_list)
        yield req

        # Queue every product detail page found on this listing page.
        pattern_detail = re.compile(r'http://shop.mogujie.com/detail/.{7}')
        for item_url in pattern_detail.findall(response.text):
            req = SplashRequest(url=item_url, callback=self.parse_item)
            yield req
Author: Xinghaoz, Project: first-crawler, Lines: 26, Source: mogujie_mac.py

Example 5: media_request

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def media_request(self, url):
        kwargs = dict(
            url=url,
            priority=-2,
            # Give document downloads their own per-domain download slot.
            meta={'download_slot': (
                '{} documents'.format(urlsplit(url).netloc)),
            },
        )
        if using_splash(self.crawler.settings):
            return SplashRequest(
                endpoint='execute',
                args={'lua_source': self.lua_source},
                slot_policy=SlotPolicy.SCRAPY_DEFAULT,
                **kwargs)
        else:
            return Request(**kwargs)
Author: TeamHG-Memex, Project: undercrawler, Lines: 18, Source: media_pipeline.py
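using_splash here is a project-level helper, not part of scrapy_splash. A plausible sketch of what it checks (an assumption, not the undercrawler project's actual code):

def using_splash(settings):
    # Hypothetical helper: treat Splash as enabled when SPLASH_URL is configured.
    return bool(settings.get('SPLASH_URL'))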

Example 6: test_autologin_request

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None

    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL') 
Author: TeamHG-Memex, Project: autologin-middleware, Lines: 18, Source: test_integration.py

Example 7: start_requests

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, endpoint='render.html', args={'wait': 0.5}) 
Author: JainulV, Project: myplanB, Lines: 5, Source: courses.py

Example 8: parse

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        pattern_list = re.compile(r'http://www.mogujie.com/book/\w+/\d+')

        for item_list in pattern_list.findall(response.text):
            req = SplashRequest(url=item_list, callback=self.parse_list)
            yield req
Author: Xinghaoz, Project: first-crawler, Lines: 8, Source: mogujie.py

Example 9: parse_item

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse_item(self, response):
        mongo = self.db.url
        # url = response.meta['splash']['args']['url']  # Used for SplashRequest
        url = response.url
        url_trim = url.split('?')[0]
        # Skip pages that have already been crawled.
        if mongo.find_one({"url": url_trim}):
            print("&&&&&&&&&&&&&&&&&&&&&&&&& This URL has been crawled &&&&&&&&&&&&&&&&&&&&&&&&&")
            return

        # Record the new link in MongoDB.
        newone = {
            "url": url_trim,
            "time": time.time(),
        }
        mongo.insert_one(newone)

        page = Selector(response)
        title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
        images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
        availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
        status = response.status

        item = FashionItem()
        item['url'] = url_trim
        item['title'] = title
        item['images'] = images
        item['availability'] = availability
        item['status'] = status
        return item
Author: Xinghaoz, Project: first-crawler, Lines: 31, Source: mogujie.py

Example 10: start_requests

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        yield SplashRequest('http://quotes.toscrape.com/js') 
Author: scrapinghub, Project: scrapy-training, Lines: 4, Source: quotes-js.py

Example 11: parse

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(response.urljoin(next_page)) 
Author: scrapinghub, Project: scrapy-training, Lines: 12, Source: quotes-js.py
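Examples 10 and 11 are two fragments of the same spider. A self-contained sketch combining them (the spider class and name are assumptions, and the scrapy-splash settings from the top of this article are taken as given):

import scrapy
from scrapy_splash import SplashRequest


class QuotesJSSpider(scrapy.Spider):
    # Hypothetical spider class combining Examples 10 and 11.
    name = 'quotes-js'

    def start_requests(self):
        # The quotes on this page are rendered by JavaScript,
        # so the request goes through Splash.
        yield SplashRequest('http://quotes.toscrape.com/js')

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(response.urljoin(next_page))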

Example 12: test_dupe_filter_with_splash

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def test_dupe_filter_with_splash():
    dupe_filter = DupeFilter()
    url_fp = lambda url: dupe_filter.request_fingerprint(SplashRequest(url))
    assert url_fp('http://example.com') == \
           url_fp('https://example.com')
    assert url_fp('http://www.example.com/foo?a=1&b=1') == \
           url_fp('http://example.com/foo?b=1&a=1')
    assert url_fp('http://example.com/foo') != \
           url_fp('http://example.com/bar')
    assert url_fp('http://www.example.com/foo#a') != \
           url_fp('http://example.com/foo#b') 
Author: TeamHG-Memex, Project: undercrawler, Lines: 15, Source: test_dupe_filter.py

Example 13: make_request

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def make_request(
            self, url, callback=None, meta=None, cls=None, **kwargs):
        callback = callback or self.parse
        # Fall back to a plain Request when Splash is disabled.
        cls = cls or (SplashRequest if self.use_splash else Request)
        if self.use_splash:
            settings = self.settings
            splash_args = {
                'lua_source': self.lua_source,
                'js_source': self.js_source,
                'run_hh': settings.getbool('RUN_HH'),
                'return_png': settings.getbool('SCREENSHOT'),
                'images_enabled': settings.getbool('IMAGES_ENABLED'),
            }
            for s in ['VIEWPORT_WIDTH', 'VIEWPORT_HEIGHT',
                      'SCREENSHOT_WIDTH', 'SCREENSHOT_HEIGHT']:
                if self.settings.get(s):
                    splash_args[s.lower()] = self.settings.getint(s)
            if self.settings.getbool('ADBLOCK'):
                splash_args['filters'] = 'fanboy-annoyance,easylist'
            if self.settings.getbool('FORCE_TOR'):
                splash_args['proxy'] = 'tor'
            kwargs.update(dict(
                args=splash_args,
                endpoint='execute',
                # cache_args lets Splash cache the large script bodies
                # instead of receiving them with every request.
                cache_args=['lua_source', 'js_source'],
            ))
        meta = meta or {}
        meta['avoid_dup_content'] = True
        return cls(url, callback=callback, meta=meta, **kwargs)
Author: TeamHG-Memex, Project: undercrawler, Lines: 31, Source: spiders.py
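A hypothetical caller for this helper, to make the pattern concrete (the selector and callback body are assumptions, not part of the undercrawler project):

def parse(self, response):
    # make_request transparently picks SplashRequest or Request
    # depending on self.use_splash.
    for href in response.css('a::attr(href)').extract():
        yield self.make_request(response.urljoin(href))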

Example 14: start_requests

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 0.5})
Author: zseta, Project: scrapyfundamentals, Lines: 8, Source: JSScraper.py

Example 15: splash_request

# Required module: import scrapy_splash [as alias]
# Or: from scrapy_splash import SplashRequest [as alias]
def splash_request(*args, **kwargs):
    kwargs['endpoint'] = 'execute'
    splash_args = kwargs.setdefault('args', {})
    splash_args['lua_source'] = LUA_SOURCE
    return SplashRequest(*args, **kwargs) 
Author: TeamHG-Memex, Project: autologin-middleware, Lines: 7, Source: splash.py
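LUA_SOURCE is loaded elsewhere in that project. As an illustration only (this is not the project's actual script), a minimal Lua script of the kind Splash's execute endpoint expects could be:

# Hypothetical stand-in for LUA_SOURCE; the real project ships its own script.
LUA_SOURCE = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(0.5))
    return {
        html = splash:html(),
        cookies = splash:get_cookies(),
    }
end
"""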


Note: The scrapy_splash.SplashRequest examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. Please consult each project's license before distributing or using the code; do not reproduce without permission.