

Python scrapy.Request Method Code Examples

This article collects typical usage examples of the scrapy.Request method in Python. If you are wondering what scrapy.Request does, how to call it, or what real-world uses look like, the hand-picked code examples below may help. You can also explore further usage examples from the scrapy module that provides this method.


A total of 15 code examples of the scrapy.Request method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
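Before diving into the examples, here is a minimal, self-contained sketch of the typical scrapy.Request pattern inside a spider: yield a Request with a URL and a callback, and pass extra state to the callback through the meta dictionary. The spider name, start URL, selectors, and the 'referer' meta key below are illustrative assumptions, not taken from any of the projects quoted later.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'  # hypothetical spider name
    start_urls = ['https://example.com/']  # hypothetical start URL

    def parse(self, response):
        # Follow every link on the page; each target is handled by parse_item.
        for href in response.css('a::attr(href)').extract():
            url = response.urljoin(href)
            # meta carries extra data to the callback via response.meta
            yield scrapy.Request(url, callback=self.parse_item, meta={'referer': response.url})

    def parse_item(self, response):
        yield {'url': response.url, 'referer': response.meta.get('referer')}

Most of the examples below repeat this pattern: build the next URL (a pagination link, detail page, or API endpoint), then yield scrapy.Request (or the directly imported Request) with the appropriate callback, headers, or meta.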

Example 1: retry_middleware_response

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def retry_middleware_response(request):
    """
    Fixture to simplify creating a crawler
    with an activated middleware and going through
    the request-response cycle.

    Executes process_response() method of the middleware.
    """
    settings, status = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')
    rsp = Response(req.url, body=b'', status=status)

    yield mw.process_response(req, rsp, spider) 
Developer: alecxe, Project: scrapy-fake-useragent, Lines of code: 20, Source file: test_retry_middleware.py

Example 2: retry_middleware_exception

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def retry_middleware_exception(request):
    """
    Fixture to simplify creating a crawler
    with an activated retry middleware and going through
    the request-response cycle.

    Executes process_exception() method of the middleware.
    """
    settings, exception = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')

    yield mw.process_exception(req, exception, spider) 
Developer: alecxe, Project: scrapy-fake-useragent, Lines of code: 19, Source file: test_retry_middleware.py

Example 3: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        datas = json.loads(response.body)
        item = DoubanMovieItem()
        if datas:
            for data in datas:
                item['ranking'] = data['rank']
                item['movie_name'] = data['title']
                item['score'] = data['score']
                item['score_num'] = data['vote_count']
                yield item

            # since datas is non-empty, go on to crawl the next page as well
            page_num = re.search(r'start=(\d+)', response.url).group(1)
            page_num = 'start=' + str(int(page_num)+20)
            next_url = re.sub(r'start=\d+', page_num, response.url)
            yield Request(next_url, headers=self.headers) 
Developer: Wooden-Robot, Project: scrapy-tutorial, Lines of code: 18, Source file: douban_ajax_spider.py

Example 4: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        item = DoubanMovieItem()
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span/text()').re(ur'(\d+)人评价')[0]
            yield item

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url, headers=self.headers) 
Developer: Wooden-Robot, Project: scrapy-tutorial, Lines of code: 21, Source file: douban_spider.py

Example 5: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        # if self.counter > 2:
        #     return
        # else:
        #     self.counter += 1

        for book in response.css('article.product_pod'):
            try:
                bname = book.xpath('./h3/a/@title').extract_first()
                price = book.css('p.price_color::text').extract()[0]
                # yield {'name': bname, 'price': price}

                bookit = BooksItem()
                bookit['name'] = bname
                bookit['price'] = price
                yield bookit

            except Exception as e:
                print(e)

        # follow pagination to the next listing page
        next_url = response.css('li.next a::attr(href)').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse) 
Developer: makelove, Project: Python_Master_Courses, Lines of code: 27, Source file: books.py

Example 6: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        for tr in response.xpath('//tbody/tr'):
            try:
                ip = tr.xpath('td[@data-title="IP"]/text()').extract()[0]
                port = tr.xpath('td[@data-title="PORT"]/text()').extract()[0]
                http_type = tr.xpath('td[@data-title="类型"]/text()').extract()[0].lower()
                # print(http_type,ip,port)
            except Exception as e:
                # print(e)
                continue

            # build a probe URL and a proxy URI for this table row
            url = '%s://httpbin.org/ip' % http_type
            proxy = '%s://%s:%s' % (http_type, ip, port)

            meta = {
                'proxy': proxy,
                'dont_retry': True,
                'download_timeout': 10,
                # extra fields intended for the check_available callback
                '_proxy_scheme': http_type,
                '_proxy_ip': ip,
                'port': port
            }
            yield Request(url, callback=self.check_available, meta=meta, dont_filter=True) 
Developer: makelove, Project: Python_Master_Courses, Lines of code: 27, Source file: ip_proxy.py

Example 7: start_requests

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def start_requests(self):
    tracks_list = [
      { 'title': 'Algorithms', 'name': 'algorithms' },
      { 'title': 'Data Structures', 'name': 'data-structures' },
      { 'title': 'Mathematics', 'name': 'mathematics' },
      ]
    for i, track in enumerate(tracks_list):
      tracks.append({
        'title': track['title'],
        'name': track['name'],
        'chapters': [],
        })
      url = 'https://www.hackerrank.com/rest/contests/master/tracks/' + track['name'] + '/chapters'
      yield scrapy.Request(url=url, callback=functools.partial(self.parse_chapters, d={
        'track-id': i,
        })) 
Developer: yznpku, Project: HackerRank, Lines of code: 18, Source file: update-challenge-list.py

Example 8: parse_chapters

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse_chapters(self, response, d):
    json_object = json.loads(response.text)
    for i, chapter in enumerate(json_object['models']):
      tracks[d['track-id']]['chapters'].append({
        'title': chapter['name'],
        'name': chapter['slug'],
        'challenges': [None] * chapter['challenges_count'],
        })
      for offset in range(0, chapter['challenges_count'], 10):
        url = 'https://www.hackerrank.com/rest/contests/master/categories/' \
              + tracks[d['track-id']]['name'] + '%7C' + chapter['slug'] \
              + '/challenges?offset=' + str(offset) + '&limit=10'
        yield scrapy.Request(url=url, callback=functools.partial(self.parse_page, d={
          'track-id': d['track-id'],
          'chapter-id': i,
          'offset': offset,
          })) 
Developer: yznpku, Project: HackerRank, Lines of code: 19, Source file: update-challenge-list.py

Example 9: main_list_parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def main_list_parse(self, response):
		for sel in response.xpath('//div[@class="wrapper-piclist"]/ul/li'):
			item = AlbumItem()
			item['level'] = 1
			item['title'] = sel.xpath('div[2]/div[1]/p/a/text()').extract_first()
			item['img_url'] = sel.xpath('div[1]/a/img/@src').extract_first()
			item['main_url'] = sel.xpath('div[2]/div[1]/p/a/@href').extract_first()
			item['type_id'] = 0
			update_status = sel.xpath('div[1]/a/div/div/p/span/text()').extract_first().strip()
			item['status'] = 1 if update_status[0] == u'共' else 0

			if item['title'] is not None and item['main_url'] is not None:
				yield item
				yield scrapy.Request(response.urljoin(item['main_url']), callback=self.video_list_parse, errback=self.errback_httpbin)
		
		no_page = response.xpath('//span[@class="curPage"]/following-sibling::span[@class="noPage"]').extract_first()
		# to crawl next page
		if no_page is None:
			next_page_url = response.xpath('//div[@class="mod-page"]/a[last()]/@href').extract_first()
			print('visit next page url: ', next_page_url)
			yield scrapy.Request(response.urljoin(next_page_url), callback=self.main_list_parse, errback=self.errback_httpbin) 
Developer: czs0x55aa, Project: video_url_crawler_demo, Lines of code: 23, Source file: aiqiyi_spider.py

Example 10: collection

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def collection(self, response):
        self.update_process(response, ".column-label .count-badge::text", 'Crawling collections...')
        image_items = response.css('._image-items.js-legacy-mark-unmark-list li.image-item')
        all_collection_urls = []

        for image_item in image_items:
            # Deleted artworks may still appear in image_items but have no extractable bookmark count,
            # which would raise when converting to int and stop the crawl here; fav_num is checked again in image_page.
            item_url = image_item.css('a.work._work::attr(href)').extract_first('')
            pid = item_url.split('illust_id=')[-1]
            if pid in self.collection_set:
                continue
            img_bookmark = image_item.css('ul li a.bookmark-count._ui-tooltip::text').extract_first('')
            if img_bookmark and int(img_bookmark) >= self.MIN_FAV:
                all_collection_urls.append(item_url)
        all_collection_urls = [parse.urljoin(response.url, url) for url in all_collection_urls]
        next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
        # follow the next page of this collection before queueing individual artwork pages
        if self.tryNextPage(next_page_url):
            next_page_url = parse.urljoin(response.url, next_page_url)
            yield scrapy.Request(next_page_url, headers=self.header, callback=self.collection)
        for url in all_collection_urls:
            yield scrapy.Request(url, headers=self.header, callback=self.image_page) 
Developer: vicety, Project: Pixiv-Crawler, Lines of code: 25, Source file: pixiv-beta.py

Example 11: search

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def search(self, response):
        # for debug
        if self.process > self.maxsize:
            return
        js_text = response.css("div.layout-body div._unit input#js-mount-point-search-result-list::attr(data-items)").extract_first('Not Found')
        if js_text == "Not Found":
            print("json接口变动,烦请issue")
        js = json.loads(js_text)
        self.update_process(response, '._unit .column-header span.count-badge::text', 'Searching {0}'.format(cf.get('SRH', 'TAGS')))
        all_works_url = []
        for image_item in js:
            if image_item["bookmarkCount"] >= self.MIN_FAV:
                all_works_url.append(('https://www.pixiv.net/member_illust.php?mode=medium&illust_id={0}'.format(image_item["illustId"]),
                                      image_item['bookmarkCount']))
        next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
        if self.tryNextPage(next_page_url):
            next_page_url = parse.urljoin(response.url, next_page_url)
            yield scrapy.Request(next_page_url, headers=self.header, callback=self.search)
        for url, bookmarkCount in all_works_url:
            request = scrapy.Request(url, headers=self.header, callback=self.image_page)  # change here if you want to extract the data at this point instead
            request.meta['collection'] = bookmarkCount
            yield request 
Developer: vicety, Project: Pixiv-Crawler, Lines of code: 24, Source file: pixiv-beta.py

Example 12: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        news_in_page = response.css('.listRight li h2 a')
        if not news_in_page:
            return

        for news in news_in_page:
            url = news.css('a::attr(href)').extract_first()
            if ROOT_URL not in url:
                url = ROOT_URL + url
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_news)
        if 'next_page' in response.meta:
            meta = {'next_page': response.meta['next_page'] + 1}
        else:
            meta = {'next_page': 2}
        next_url = PAGE_URL + '?page=' + str(meta['next_page'])
        yield scrapy.Request(next_url, callback=self.parse, meta=meta) 
Developer: TaiwanStat, Project: Taiwan-news-crawlers, Lines of code: 19, Source file: china_spider.py

Example 13: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        section = response.css('section.nclnbx.slvl.clearmen, article.nclns')
        for part in section:
            if part.css('header.schh h1::text'):
                category = part.css('header.schh h1::text').extract_first()
                category = category.strip()
            else:
                meta = {'category': category}
                for news in part.css('ul.fillup li'):
                    if 'eat-travel' in news.css(
                            "a::attr(href)").extract_first():
                        continue
                    elif 'entertainment.appledaily' in news.css(
                            "a::attr(href)").extract_first():
                        url = news.css("a::attr(href)").extract_first()
                    elif 'http' in news.css("a::attr(href)").extract_first():
                        url = news.css("a::attr(href)").extract_first()
                    else:
                        url = "http://www.appledaily.com.tw{}".format(
                            news.css("a::attr(href)").extract_first())
                    if url:
                        url = response.urljoin(url)
                        yield scrapy.Request(
                            url, callback=self.parse_news, meta=meta) 
Developer: TaiwanStat, Project: Taiwan-news-crawlers, Lines of code: 26, Source file: apple_spider.py

Example 14: start_requests

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def start_requests(self):
        urls = [
            'http://news.ltn.com.tw/list/newspaper/focus/',
            'http://news.ltn.com.tw/list/newspaper/politics/',
            'http://news.ltn.com.tw/list/newspaper/society/',
            'http://news.ltn.com.tw/list/newspaper/local/',
            'http://news.ltn.com.tw/list/newspaper/life/',
            'http://news.ltn.com.tw/list/newspaper/opinion/',
            'http://news.ltn.com.tw/list/newspaper/world/',
            'http://news.ltn.com.tw/list/newspaper/business/',
            'http://news.ltn.com.tw/list/newspaper/sports/',
            'http://news.ltn.com.tw/list/newspaper/entertainment/',
            'http://news.ltn.com.tw/list/newspaper/consumer/',
            'http://news.ltn.com.tw/list/newspaper/supplement/'
        ]

        day = datetime.timedelta(days=1)
        current_time = NEWS_DATE_BEGIN

        while current_time <= TODAY:
            date = current_time.strftime('%Y%m%d')
            for url in urls:
                target = url + date
                yield scrapy.Request(target, callback=self.parse_news_list)
            current_time += day 
Developer: TaiwanStat, Project: Taiwan-news-crawlers, Lines of code: 27, Source file: liberty_tag_spider.py

Example 15: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
        current_page_index = int(
            response.css('.pagination li.current a::text').extract_first())

        newses_time_str = response.css('.article_list li span::text').extract()
        newses_time = [
            datetime.strptime(i, '%Y/%m/%d %H:%M').date()
            for i in newses_time_str
        ]
        is_over_today = False

        for t in newses_time:
            if t < TODAY:
                is_over_today = True

        if not is_over_today:
            next_url = 'http://www.cna.com.tw/list/aall-' + str(
                current_page_index + 1) + '.aspx'
            yield scrapy.Request(next_url, callback=self.parse)

        for news in response.css('div.article_list li a'):
            url = response.urljoin(news.css('a::attr(href)').extract_first())
            yield scrapy.Request(url, callback=self.parse_news) 
Developer: TaiwanStat, Project: Taiwan-news-crawlers, Lines of code: 25, Source file: cna_spider.py


Note: The scrapy.Request method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, who retain copyright over the source code; refer to the corresponding project's license before distributing or using it, and do not reproduce this article without permission.