This article collects typical usage examples of the scrapy.Request method in Python. If you have been asking yourself how exactly scrapy.Request is used, how to call it, or what it looks like in real code, the hand-picked method examples below should help. You can also explore further usage examples from the containing module, scrapy.
A total of 15 code examples of the scrapy.Request method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
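Before the collected examples, here is a minimal, self-contained sketch of the typical pattern (the spider name, target site, and selectors are illustrative, not taken from the examples below): a spider yields scrapy.Request objects with a URL, a callback, and optional headers/meta, and Scrapy feeds the responses back into the callbacks.

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        # Follow pagination with an explicit scrapy.Request; meta carries
        # extra data across the request/response cycle.
        next_href = response.css('li.next a::attr(href)').get()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                callback=self.parse,
                meta={'page_from': response.url},
            )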
Example 1: retry_middleware_response
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def retry_middleware_response(request):
    """
    Fixture to simplify creating a crawler
    with an activated middleware and going through
    the request-response cycle.
    Executes the process_response() method of the middleware.
    """
    settings, status = request.param
    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)
    req = Request('http://www.scrapytest.org/')
    rsp = Response(req.url, body=b'', status=status)
    yield mw.process_response(req, rsp, spider)
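Fixtures like this are normally driven by pytest's indirect parametrization, which is what puts the (settings, status) pair into request.param. A hypothetical test built on the fixture above could look like this; the settings values and the expected outcome are assumptions for illustration, not part of the original code.

import pytest
from scrapy import Request

@pytest.mark.parametrize(
    'retry_middleware_response',
    [({'RETRY_ENABLED': True, 'RETRY_HTTP_CODES': [500]}, 500)],
    indirect=True,
)
def test_retry_on_server_error(retry_middleware_response):
    # With a retryable status code, process_response() is expected to return
    # a fresh retry Request rather than the original Response.
    assert isinstance(retry_middleware_response, Request)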
Example 2: retry_middleware_exception
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def retry_middleware_exception(request):
    """
    Fixture to simplify creating a crawler
    with an activated retry middleware and going through
    the request-response cycle.
    Executes the process_exception() method of the middleware.
    """
    settings, exception = request.param
    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)
    req = Request('http://www.scrapytest.org/')
    yield mw.process_exception(req, exception, spider)
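The exception-side fixture can be exercised the same way; the concrete exception and settings below are illustrative assumptions.

import pytest
from scrapy import Request
from twisted.internet.error import ConnectionRefusedError

@pytest.mark.parametrize(
    'retry_middleware_exception',
    [({'RETRY_ENABLED': True, 'RETRY_TIMES': 2}, ConnectionRefusedError())],
    indirect=True,
)
def test_retry_on_connection_error(retry_middleware_exception):
    # A refused connection is normally a retryable exception, so the
    # middleware should hand back a retry Request.
    assert isinstance(retry_middleware_exception, Request)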
Example 3: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    datas = json.loads(response.body)
    item = DoubanMovieItem()
    if datas:
        for data in datas:
            item['ranking'] = data['rank']
            item['movie_name'] = data['title']
            item['score'] = data['score']
            item['score_num'] = data['vote_count']
            yield item

        # If datas is non-empty, also crawl the next page
        page_num = re.search(r'start=(\d+)', response.url).group(1)
        page_num = 'start=' + str(int(page_num) + 20)
        next_url = re.sub(r'start=\d+', page_num, response.url)
        yield Request(next_url, headers=self.headers)
Example 4: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    item = DoubanMovieItem()
    movies = response.xpath('//ol[@class="grid_view"]/li')
    for movie in movies:
        item['ranking'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item['movie_name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['score_num'] = movie.xpath(
            './/div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
        yield item

    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url, headers=self.headers)
Example 5: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    # if self.counter > 2:
    #     return
    # else:
    #     self.counter += 1
    for book in response.css('article.product_pod'):
        try:
            bname = book.xpath('./h3/a/@title').extract_first()
            price = book.css('p.price_color::text').extract()[0]
            # yield {'name': bname, 'price': price}
            bookit = BooksItem()
            bookit['name'] = bname
            bookit['price'] = price
            yield bookit
        except Exception as e:
            print(e)

    next_url = response.css('li.next a::attr(href)').extract_first()
    if next_url:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
Example 6: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    for tr in response.xpath('//tbody/tr'):
        try:
            ip = tr.xpath('td[@data-title="IP"]/text()').extract()[0]
            port = tr.xpath('td[@data-title="PORT"]/text()').extract()[0]
            http_type = tr.xpath('td[@data-title="类型"]/text()').extract()[0].lower()
            # print(http_type, ip, port)
        except Exception as e:
            # print(e)
            continue

        url = '%s://httpbin.org/ip' % http_type
        proxy = '%s://%s:%s' % (http_type, ip, port)
        meta = {
            'proxy': proxy,
            'dont_retry': True,
            'download_timeout': 10,
            # extra fields for the check_available callback
            '_proxy_scheme': http_type,
            '_proxy_ip': ip,
            'port': port,
        }
        yield Request(url, callback=self.check_available, meta=meta, dont_filter=True)
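The check_available callback referenced above is not part of this excerpt. As a hypothetical sketch (the yielded item fields are assumptions), it would verify that httpbin.org/ip echoes the proxy's address, which indicates the request really went out through the candidate proxy:

def check_available(self, response):
    # The request was routed through the candidate proxy (set via meta['proxy']);
    # httpbin.org/ip reports the IP the request actually originated from.
    proxy_ip = response.meta['_proxy_ip']
    if proxy_ip in response.text:
        yield {
            'scheme': response.meta['_proxy_scheme'],
            'ip': proxy_ip,
            'port': response.meta['port'],
        }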
Example 7: start_requests
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def start_requests(self):
    tracks_list = [
        {'title': 'Algorithms', 'name': 'algorithms'},
        {'title': 'Data Structures', 'name': 'data-structures'},
        {'title': 'Mathematics', 'name': 'mathematics'},
    ]
    for i, track in enumerate(tracks_list):
        tracks.append({
            'title': track['title'],
            'name': track['name'],
            'chapters': [],
        })
        url = 'https://www.hackerrank.com/rest/contests/master/tracks/' + track['name'] + '/chapters'
        yield scrapy.Request(url=url, callback=functools.partial(self.parse_chapters, d={
            'track-id': i,
        }))
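This example uses functools.partial to pass extra data into the callback. On Scrapy 1.7 and later the same effect is available through the cb_kwargs argument of scrapy.Request; an equivalent sketch of this request would be:

yield scrapy.Request(
    url=url,
    callback=self.parse_chapters,
    # Each cb_kwargs entry is passed to the callback as a keyword argument,
    # e.g. def parse_chapters(self, response, d): ...
    cb_kwargs={'d': {'track-id': i}},
)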
Example 8: parse_chapters
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse_chapters(self, response, d):
    json_object = json.loads(response.text)
    for i, chapter in enumerate(json_object['models']):
        tracks[d['track-id']]['chapters'].append({
            'title': chapter['name'],
            'name': chapter['slug'],
            'challenges': [None] * chapter['challenges_count'],
        })
        for offset in range(0, chapter['challenges_count'], 10):
            url = 'https://www.hackerrank.com/rest/contests/master/categories/' \
                + tracks[d['track-id']]['name'] + '%7C' + chapter['slug'] \
                + '/challenges?offset=' + str(offset) + '&limit=10'
            yield scrapy.Request(url=url, callback=functools.partial(self.parse_page, d={
                'track-id': d['track-id'],
                'chapter-id': i,
                'offset': offset,
            }))
Example 9: main_list_parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def main_list_parse(self, response):
    for sel in response.xpath('//div[@class="wrapper-piclist"]/ul/li'):
        item = AlbumItem()
        item['level'] = 1
        item['title'] = sel.xpath('div[2]/div[1]/p/a/text()').extract_first()
        item['img_url'] = sel.xpath('div[1]/a/img/@src').extract_first()
        item['main_url'] = sel.xpath('div[2]/div[1]/p/a/@href').extract_first()
        item['type_id'] = 0
        update_status = sel.xpath('div[1]/a/div/div/p/span/text()').extract_first().strip()
        item['status'] = 1 if update_status[0] == u'共' else 0
        if item['title'] is not None and item['main_url'] is not None:
            yield item
            yield scrapy.Request(response.urljoin(item['main_url']), callback=self.video_list_parse, errback=self.errback_httpbin)
    # to crawl the next page
    no_page = response.xpath('//span[@class="curPage"]/following-sibling::span[@class="noPage"]').extract_first()
    if no_page is None:
        next_page_url = response.xpath('//div[@class="mod-page"]/a[last()]/@href').extract_first()
        print('visit next page url: ', next_page_url)
        yield scrapy.Request(response.urljoin(next_page_url), callback=self.main_list_parse, errback=self.errback_httpbin)
Example 10: collection
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def collection(self, response):
    self.update_process(response, ".column-label .count-badge::text", 'Crawling collections...')
    image_items = response.css('._image-items.js-legacy-mark-unmark-list li.image-item')
    all_collection_urls = []
    for image_item in image_items:
        # Deleted works may still show up in image_items but have no extractable
        # bookmark count, so converting it to int would raise and stop the crawl;
        # fav_num is therefore checked again in image_page.
        item_url = image_item.css('a.work._work::attr(href)').extract_first('')
        pid = item_url.split('illust_id=')[-1]
        if pid in self.collection_set:
            continue
        img_bookmark = image_item.css('ul li a.bookmark-count._ui-tooltip::text').extract_first('')
        if img_bookmark and int(img_bookmark) >= self.MIN_FAV:
            all_collection_urls.append(item_url)
    all_collection_urls = [parse.urljoin(response.url, url) for url in all_collection_urls]
    next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
    if self.tryNextPage(next_page_url):
        next_page_url = parse.urljoin(response.url, next_page_url)
        yield scrapy.Request(next_page_url, headers=self.header, callback=self.collection)
    for url in all_collection_urls:
        yield scrapy.Request(url, headers=self.header, callback=self.image_page)
Example 11: search
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def search(self, response):
    # for debug
    if self.process > self.maxsize:
        return
    js_text = response.css("div.layout-body div._unit input#js-mount-point-search-result-list::attr(data-items)").extract_first('Not Found')
    if js_text == "Not Found":
        print("The JSON interface has changed; please open an issue")
    js = json.loads(js_text)
    self.update_process(response, '._unit .column-header span.count-badge::text', 'Searching {0}'.format(cf.get('SRH', 'TAGS')))
    all_works_url = []
    for image_item in js:
        if image_item["bookmarkCount"] >= self.MIN_FAV:
            all_works_url.append(('https://www.pixiv.net/member_illust.php?mode=medium&illust_id={0}'.format(image_item["illustId"]),
                                  image_item['bookmarkCount']))
    next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
    if self.tryNextPage(next_page_url):
        next_page_url = parse.urljoin(response.url, next_page_url)
        yield scrapy.Request(next_page_url, headers=self.header, callback=self.search)
    for url, bookmarkCount in all_works_url:
        # the actual data extraction happens in image_page
        request = scrapy.Request(url, headers=self.header, callback=self.image_page)
        request.meta['collection'] = bookmarkCount
        yield request
Example 12: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    news_in_page = response.css('.listRight li h2 a')
    if not news_in_page:
        return
    for news in news_in_page:
        url = news.css('a::attr(href)').extract_first()
        if ROOT_URL not in url:
            url = ROOT_URL + url
        url = response.urljoin(url)
        yield scrapy.Request(url, callback=self.parse_news)
    if 'next_page' in response.meta:
        meta = {'next_page': response.meta['next_page'] + 1}
    else:
        meta = {'next_page': 2}
    next_url = PAGE_URL + '?page=' + str(meta['next_page'])
    yield scrapy.Request(next_url, callback=self.parse, meta=meta)
Example 13: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    section = response.css('section.nclnbx.slvl.clearmen, article.nclns')
    for part in section:
        if part.css('header.schh h1::text'):
            category = part.css('header.schh h1::text').extract_first()
            category = category.strip()
        else:
            meta = {'category': category}
            for news in part.css('ul.fillup li'):
                if 'eat-travel' in news.css(
                        "a::attr(href)").extract_first():
                    continue
                elif 'entertainment.appledaily' in news.css(
                        "a::attr(href)").extract_first():
                    url = news.css("a::attr(href)").extract_first()
                elif 'http' in news.css("a::attr(href)").extract_first():
                    url = news.css("a::attr(href)").extract_first()
                else:
                    url = "http://www.appledaily.com.tw{}".format(
                        news.css("a::attr(href)").extract_first())
                if url:
                    url = response.urljoin(url)
                    yield scrapy.Request(
                        url, callback=self.parse_news, meta=meta)
Example 14: start_requests
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def start_requests(self):
    urls = [
        'http://news.ltn.com.tw/list/newspaper/focus/',
        'http://news.ltn.com.tw/list/newspaper/politics/',
        'http://news.ltn.com.tw/list/newspaper/society/',
        'http://news.ltn.com.tw/list/newspaper/local/',
        'http://news.ltn.com.tw/list/newspaper/life/',
        'http://news.ltn.com.tw/list/newspaper/opinion/',
        'http://news.ltn.com.tw/list/newspaper/world/',
        'http://news.ltn.com.tw/list/newspaper/business/',
        'http://news.ltn.com.tw/list/newspaper/sports/',
        'http://news.ltn.com.tw/list/newspaper/entertainment/',
        'http://news.ltn.com.tw/list/newspaper/consumer/',
        'http://news.ltn.com.tw/list/newspaper/supplement/'
    ]
    day = datetime.timedelta(days=1)
    current_time = NEWS_DATE_BEGIN
    while current_time <= TODAY:
        date = current_time.strftime('%Y%m%d')
        for url in urls:
            target = url + date
            yield scrapy.Request(target, callback=self.parse_news_list)
        current_time += day
Example 15: parse
# Required imports: import scrapy [as alias]
# Or: from scrapy import Request [as alias]
def parse(self, response):
    current_page_index = int(
        response.css('.pagination li.current a::text').extract_first())
    newses_time_str = response.css('.article_list li span::text').extract()
    newses_time = [
        datetime.strptime(i, '%Y/%m/%d %H:%M').date()
        for i in newses_time_str
    ]
    is_over_today = False
    for t in newses_time:
        if t < TODAY:
            is_over_today = True
    if not is_over_today:
        next_url = 'http://www.cna.com.tw/list/aall-' + str(
            current_page_index + 1) + '.aspx'
        yield scrapy.Request(next_url, callback=self.parse)
    for news in response.css('div.article_list li a'):
        url = response.urljoin(news.css('a::attr(href)').extract_first())
        yield scrapy.Request(url, callback=self.parse_news)