This article collects typical Python usage examples of scrapy.Request.meta['url']. If you have been wondering what Request.meta['url'] does, how to use it, or what real-world code that uses it looks like, the curated examples below may help. (Strictly speaking, meta is a plain dict attribute of scrapy.Request rather than a method, and 'url' is simply a user-chosen key in it.) You can also read further about the enclosing class, scrapy.Request.
Below are 10 code examples of Request.meta['url'], sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
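Before the examples, a quick orientation: whatever you store in a request's meta dict travels with the request and comes back on response.meta in the callback. A minimal sketch, where 'url' and 'keyword' are user-defined keys chosen for illustration:

from scrapy import Request

def make_request(url, keyword):
    # Passing meta= to the constructor is equivalent to assigning
    # request.meta['url'] = url after construction, which is what
    # the examples below do.
    return Request(url=url, meta={'url': url, 'keyword': keyword})

def parse(response):
    # Whatever was stored in request.meta comes back on response.meta.
    original_url = response.meta['url']
    keyword = response.meta['keyword']
    return original_url, keyword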
Example 1: initStartRequests
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def initStartRequests(self, keyword):
    """
    Initialize the start request.
    :param keyword: the keyword to search for
    :return: the start Request
    """
    keyword = self.getUnicode(keyword)
    url = self.baseURL
    url[1] = keyword
    request = Request(url=''.join(url))
    request.meta['keyword'] = keyword
    request.meta['url'] = ''.join(url)
    self.keywordsAndPages[keyword] = 1  # every keyword starts crawling from page one
    return request
Example 2: createNextPageRequest
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def createNextPageRequest(self, keyword, pn):
    """
    For each keyword, build a request that advances the paging.
    :param keyword: the keyword to search for
    :param pn: 0 means page one, 10 page two, 20 page three, and so on
    :return: a Request
    """
    tem = self.baseURL
    tem[1] = keyword
    url = ''.join(tem) + '&pn=' + str(pn)
    request = Request(url=url)
    request.meta['keyword'] = keyword
    request.meta['url'] = url
    return request
Example 3: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        # The body is a JSONP payload of the form jsonp_search({...}); strip the
        # wrapper and parse it with json.loads instead of eval() (requires
        # import json, re at module level).
        tem = re.findall(r"^jsonp_search\((.*?)\)$", response.body)
        results = json.loads(tem[0])['data']
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"天天动听"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.url
            if 'audition_list' in result:
                item['targetUrl'] = result['audition_list'][0]['url']
            else:
                item['targetUrl'] = ''
            # item['program'] = result['song_name']
            item['album'] = ''
            item['author'] = result['singer_name']
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            item['program'] = self.program
            if item['targetUrl'] not in self.songsURLS:  # deduplicate
                if self.filter(targetTitle=item['program'], author=item['author']):  # apply filters
                    self.songsURLS.add(item['targetUrl'])
                    yield item
        logging.info(u'=== This page has %s entries ===' % len(results))
        if len(results) == 50:  # a full page means there may be a next page
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 1
            pageNum = self.keywordsAndPages[keyword]
            keyword = self.getUnicode(keyword)
            nextURL = self.baseURL
            nextURL[1] = keyword
            nextURL[3] = str(pageNum)
            nextURL = ''.join(nextURL)
            if pageNum < (self.limit + 1):
                logging.info(u"=== Current keyword: %s ===", keyword)
                logging.info(u"=== Current page num for this keyword: %s ===", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                yield request
    else:
        logging.info(response.status)
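The original version of Example 3 parsed the JSONP body with eval() and two helper bindings (true = True, false = False) so that JavaScript literals would evaluate; the version above swaps that for json.loads after stripping the jsonp_search(...) wrapper. A standalone sketch of that pattern, with an illustrative payload:

import json
import re

def strip_jsonp(body, callback='jsonp_search'):
    # Remove the JSONP callback wrapper and parse the JSON payload safely.
    match = re.match(r'^%s\((.*)\)$' % re.escape(callback), body, re.S)
    return json.loads(match.group(1))

data = strip_jsonp('jsonp_search({"data": [{"song_name": "demo"}]})')
print(data['data'][0]['song_name'])  # -> demo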
Example 4: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        results = json.loads(response.body)['result']['songs']
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"网易音乐"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.url
            item['targetUrl'] = ''
            item['program'] = self.program
            # item['program'] = result['filename']
            item['targetTitle'] = result['name']
            item['album'] = result['album']['name']
            item['author'] = result['artists'][0]['name']
            item['unique_code'] = item['program'] + item['author']
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            if item['unique_code'] not in self.unique_codes:  # deduplicate
                if self.filter(targetTitle=item['targetTitle'], author=item['author']):  # apply filters
                    self.unique_codes.add(item['unique_code'])
                    yield item
        logging.info(u'=== This page has %s entries ===' % len(results))
        if len(results) == 100:  # a full page means there may be a next page
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 100
            pageNum = self.keywordsAndPages[keyword]
            keyword = self.getUnicode(keyword)
            nextURL = self.baseURL
            nextURL[1] = keyword
            nextURL[4] = str(pageNum)
            nextURL = ''.join(nextURL)
            if pageNum < self.limit * 100:
                logging.info(u"=== Current keyword: %s ===", keyword)
                logging.info(u"=== Current page num for this keyword: %s ===", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                request.headers.appendlist("Referer", 'http://music.163.com')
                request.headers.appendlist("Cookie", 'appver=2.0.2')  # the HTTP request header is 'Cookie', not 'Cookies'
                request.method = "POST"
                yield request
    else:
        logging.info(response.status)
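A side note on Examples 4 and 6: they set request.method and append headers after constructing the Request, which works but is unusual; Scrapy also accepts method, headers, and meta directly in the constructor. A minimal sketch of the same next-page request built that way (nextURL and keyword as in the example above):

from scrapy import Request

def build_netease_request(nextURL, keyword):
    # Equivalent, more conventional construction: method, headers, and meta
    # are passed to the constructor instead of being set afterwards.
    return Request(
        url=nextURL,
        method='POST',
        headers={'Referer': 'http://music.163.com', 'Cookie': 'appver=2.0.2'},
        meta={'keyword': keyword, 'url': nextURL},
    )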
Example 5: createNextPageRequest
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def createNextPageRequest(self, keyword, pn):
    """
    For each keyword, build a request that advances the paging.
    :param keyword: the keyword to search for
    :param pn: 1 means page one, 2 page two, 3 page three, and so on
    :return: a Request
    """
    tem = self.baseURL
    tem[1] = keyword
    tem[3] = str(pn)
    url = ''.join(tem)
    request = Request(url=url)
    request.meta['keyword'] = keyword
    request.meta['url'] = url
    logging.info(u"=== Moving to the next page ===")
    return request
Example 6: initStartRequests
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def initStartRequests(self, keyword):
    """
    Initialize the start request.
    :param keyword: the keyword to search for
    :return: the start Request
    """
    keyword = self.getUnicode(keyword)
    url = self.baseURL
    url[1] = keyword
    request = Request(url=''.join(url))
    request.meta['keyword'] = keyword
    request.meta['url'] = ''.join(url)
    request.headers.appendlist("Referer", 'http://music.163.com')
    request.headers.appendlist("Cookie", 'appver=2.0.2')  # the HTTP request header is 'Cookie', not 'Cookies'
    request.method = "POST"
    self.keywordsAndPages[keyword] = 0
    return request
Example 7: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        results = response.css("div.list ul li.clearfix")
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"酷我音乐"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.url
            item['targetUrl'] = self.getUnicode(
                ''.join(result.xpath("./p[@class='m_name']/a[@title]/@href").extract())).strip()
            # item['program'] = self.getUnicode(
            #     ''.join(result.xpath("./p[@class='m_name']/a[@title]/@title").extract())).strip()
            item['album'] = self.getUnicode(
                ''.join(result.xpath("./p[@class='a_name']/a[@title]/@title").extract())).strip()
            item['author'] = self.getUnicode(
                ''.join(result.xpath("./p[@class='s_name']/a[@title]/@title").extract())).strip()
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            item['program'] = self.program
            if item['targetUrl'] not in self.songsURLS:  # deduplicate
                if self.filter(targetTitle=item['program'], author=item['author']):  # apply filters
                    self.songsURLS.add(item['targetUrl'])
                    yield item
        nextA = response.css("div.page a")[-2]
        if nextA.xpath("./text()")[0].extract().strip() == u'下一页':  # u'下一页' is the site's "next page" link text
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 1
            pageNum = self.keywordsAndPages[keyword]
            nextURL = u'http://sou.kuwo.cn' + self.getUnicode(''.join(
                nextA.xpath("./@href").extract())).strip()
            if pageNum < self.limit:
                logging.info(u"== Current keyword: %s", keyword)
                logging.info(u"== Current page num for this keyword: %s", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                yield request
    else:
        logging.info(response.status)
Example 8: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        results = response.css("tbody tr")
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"虾米音乐"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.url
            item['targetUrl'] = self.getUnicode(
                ''.join(result.xpath("./td[@class='song_name']/a[@target]/@href").extract())).strip()
            # item['program'] = self.getUnicode(
            #     ''.join(result.xpath("./td[@class='song_name']/a[@target]/@title").extract())).strip()
            item['album'] = self.getUnicode(
                ''.join(result.xpath("./td[@class='song_album']/a[@target]/@title").extract())).strip()
            item['author'] = self.getUnicode(
                ''.join(result.xpath("./td[@class='song_artist']/a[@target]/text()").extract())).strip()
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            item['program'] = self.program
            if item['targetUrl'] not in self.songsURLS:  # deduplicate
                if self.filter(targetTitle=item['program'], author=item['author']):  # apply filters
                    self.songsURLS.add(item['targetUrl'])
                    yield item
        if response.xpath("//a[@class='p_redirect_l']/@href"):  # a "next page" link exists
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 1
            pageNum = self.keywordsAndPages[keyword]
            nextURL = u'http://www.xiami.com' + self.getUnicode(''.join(
                response.xpath("//div[@class='all_page']/a[@class='p_redirect_l']/@href").extract())).strip()
            if pageNum < self.limit:
                logging.info(u"== Current keyword: %s", keyword)
                logging.info(u"== Current page num for this keyword: %s", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                yield request
    else:
        logging.info(response.status)
Example 9: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        results = response.css("div.song-item")
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"百度音乐"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.meta['url']
            item['targetUrl'] = u"http://music.baidu.com" + self.getUnicode(
                ''.join(result.xpath("./span[@class='song-title']/a[@data-songdata]/@href").extract())).strip()
            # item['program'] = self.getUnicode(
            #     ''.join(result.xpath("./span[@class='song-title']//text()").extract())).strip()
            item['album'] = self.getUnicode(
                ''.join(result.xpath("./span[@class='album-title']//text()").extract())).strip()
            item['author'] = self.getUnicode(
                ''.join(result.xpath("./span[@class='singer']//text()").extract())).strip()
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            item['program'] = self.program
            if item['targetUrl'] not in self.songsURLS:  # deduplicate
                if self.filter(targetTitle=item['program'], author=item['author']):  # apply filters
                    self.songsURLS.add(item['targetUrl'])
                    yield item
        if response.xpath("//div[@class='page-inner']/a[@class='page-navigator-next']/@href"):  # a "next page" link exists
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 1
            pageNum = self.keywordsAndPages[keyword]
            nextURL = u"http://music.baidu.com" + self.getUnicode(''.join(response.xpath(
                "//div[@class='page-inner']/a[@class='page-navigator-next']/@href").extract())).strip()
            if pageNum < self.limit:
                logging.info(u"== Current keyword: %s", keyword)
                logging.info(u"== Current page num for this keyword: %s", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                yield request
    else:
        logging.info(response.status)
Example 10: parse
# Required import: from scrapy import Request [as alias]
# Or: from scrapy.Request import meta['url'] [as alias]
def parse(self, response):
    self.num += 1
    if response.status == 200:
        # Parse the JSON body with json.loads instead of eval() (requires import json).
        results = json.loads(response.body)['data']['info']
        for result in results:
            item = MusicSearchspiderItem()
            item['platform'] = u"酷狗音乐"
            item['keyword'] = response.meta['keyword']
            item['resultUrl'] = response.url
            item['targetUrl'] = ''
            item['program'] = self.program
            # item['program'] = result['filename']
            item['album'] = result['album_name']
            item['author'] = result['singername']
            item['unique_code'] = item['program'] + item['author']
            item['createDate'] = datetime.datetime.now()
            item['status'] = 0
            item['processDate'] = datetime.datetime.now()
            item['checkStatus'] = 0
            item['searchTask'] = None if self.searchTaskId == -1 else self.searchTaskId
            item['project'] = None if self.projectId == -1 else self.projectId
            if item['unique_code'] not in self.unique_codes:  # deduplicate
                if self.filter(targetTitle=item['program'], author=item['author']):  # apply filters
                    self.unique_codes.add(item['unique_code'])
                    yield item
        logging.info(u'=== This page has %s entries ===' % len(results))
        if len(results) == 50:  # a full page means there may be a next page
            keyword = response.meta['keyword']
            self.keywordsAndPages[keyword] += 1
            pageNum = self.keywordsAndPages[keyword]
            keyword = self.getUnicode(keyword)
            nextURL = self.baseURL
            nextURL[1] = keyword
            nextURL[3] = str(pageNum)
            nextURL = ''.join(nextURL)
            if pageNum < (self.limit + 1):
                logging.info(u"=== Current keyword: %s ===", keyword)
                logging.info(u"=== Current page num for this keyword: %s ===", pageNum)
                request = Request(url=nextURL)
                request.meta['keyword'] = keyword
                request.meta['url'] = nextURL
                yield request
    else:
        logging.info(response.status)