

Python request.Request Class Code Examples

This article collects typical usage examples of Python's scrapy.http.request.Request class. If you are wondering what the Request class is for, how to use it, or what real-world Request code looks like, the curated class examples below should help.


The following presents 15 code examples of the Request class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
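Before the collected examples, here is a minimal sketch (not taken from any of the projects below) of the pattern most of them share: build a Request for a follow-up page, attach data to it through request.meta, and read that data back in the callback. The spider name, URLs, XPath expressions, and item fields are hypothetical placeholders.

from urllib.parse import urljoin

import scrapy
from scrapy.http.request import Request


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the Request + meta pattern.
    name = 'example'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        for href in response.xpath('//a[@class="detail"]/@href').extract():
            # Build a follow-up request and pass data to the callback via meta.
            request = Request(urljoin(response.url, href),
                              callback=self.parse_detail)
            request.meta['item'] = {'list_url': response.url}
            yield request

    def parse_detail(self, response):
        item = response.meta['item']  # data passed along from parse()
        item['title'] = response.xpath('//h1/text()').extract_first()
        yield item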

Example 1: parse_user

    def parse_user(self, response):
        item = MFWItem()

        item['uid'] = response.meta['uid']
        item['name'] = response.xpath(
            '//div[@class="MAvaName"]/text()').extract_first()
        item['level'] = int(response.xpath(
            '//span[@class="MAvaLevel flt1"]/a/@title').extract_first().split('.')[-1])
        if item['level'] <= 3:
            return
        item['tags'] = response.xpath(
            '//div[@class="its_tags"]//i[contains(@class, "on")]/../@title').extract()
        item['attention'] = [int(i) for i in response.xpath(
            '//div[@class="MAvaMore clearfix"]//a/text()').extract()]
        item['groups'] = response.xpath(
            '//div[@class="MGroupDetail"]//a[@class="name"]/text()').extract()
        item['dynamic'] = response.xpath(
            '//span[@class="time"]/text()').extract()
        item['download'] = []
        infos = response.xpath('//div[@class="common_block relative_info"]/p')
        for info in infos:
            if u'刚刚下载了' in ''.join(info.xpath('text()').extract()):

                item['download'].append({
                    'time': info.xpath('span[@class="time"]/text()').extract_first(),
                    'name': info.xpath('a/text()').extract()[-1],
                })

        item['note'] = {}
        item['path'] = []
        item['review'] = []
        item['together'] = []
        note = response.xpath(u'//a[@title="TA的游记"]/@href').extract_first()
        req = Request(urljoin(response.url, note), callback=self.parse_note)
        req.meta['item'] = item
        yield req
Developer: twelfthing, Project: socool, Lines of code: 34, Source file: mafengwo.py

Example 2: parse

	def parse(self, response):
		sel = Selector(response)
		item = DicksItem()		
		if "&page=" in response.url: # Extracting the Page Number and then using that to assign sort.
			pagenumber = float(response.url.split("&page=")[-1]) 
		else:
			pagenumber = 1		
		t = 0 + ((pagenumber-1)*48)
		item["Sort_Order"] = {}
		
		producturls= sel.xpath("//div[@class='prod-details']/h2/a/@href").extract()
		productnames = sel.xpath("//div[@class='prod-details']/h2/a/@title").extract()		
		
		for url,name in zip(producturls,productnames):
			item["Sort_Order"]["http://www.dickssportinggoods.com"+url] = t
			t=t+1
			
		for i in range(len(urllist)): #comparing the Category URL and assigning LYS Categorization
			if urllist[i] == response.url:
				item['Category'] = lyscat[i]
				item['id1'] = priceid[i]
				break
		
		for url,name in zip(producturls,productnames):       
			if "Fitbit" not in name:         
				request=Request("http://www.dickssportinggoods.com"+url, self.product_page)
				request.meta["item"] = item
				yield request
Developer: Diwahars, Project: scrapers, Lines of code: 28, Source file: Dicks.py

Example 3: getItem

    def getItem(self, school):
        item = SchoolItem()
        logo = school.xpath('div/div[contains(@class,"school_m_img fl")]/a/img/@src').extract()
        item["logo"] = logo[0] if logo else ""

        # name province city area under school_m_main
        school_main = school.xpath('div/div[contains(@class,"school_m_main fl")]')
        name = school_main.xpath("li/h3/a/text()").extract()
        item["name"] = name[0] if name else ""
        item["province"] = ""
        item["city"] = ""
        item["area"] = ""
        tempLocation = school_main.xpath("li[2]/b/text()").extract()
        if tempLocation:
            location = tempLocation[0].split()
            item["province"] = location[0] if len(location) > 0 else ""
            item["city"] = location[1] if len(location) > 1 else ""
            item["area"] = location[2] if len(location) > 2 else ""

        catagery = school_main.xpath("li[3]/b/text()").extract()
        schoolType = school_main.xpath("li[4]/ol[1]/b/text()").extract()
        level = school_main.xpath("li[4]/ol[2]/b/text()").extract()
        item["level"] = level[0] if level else ""
        item["catagery"] = catagery[0] if catagery else ""
        item["schoolType"] = schoolType[0] if schoolType else ""

        # address and phone under school_m_lx
        addressAndPhone = school.xpath('ul[contains(@class,"school_m_lx")]')
        address = addressAndPhone.xpath("li[1]/b/text()").extract()
        item["address"] = address[0] if address else ""
        item["phone"] = addressAndPhone.xpath("li[2]/b/text()").extract()
        schoollUrl = school_main.xpath("li/h3/a/@href").extract()[0]
        request = Request(schoollUrl, callback=self.parse_schoolIntroUrl)
        request.meta["item"] = item
        return request
Developer: lindanXmu, Project: GreatSchool, Lines of code: 35, Source file: xuexiao.py

Example 4: parse_restaurants_en

	def parse_restaurants_en(self,response):
		sel=Selector(response)
		item = response.meta['item']
		descriptionpath=sel.xpath("//*[@id='idContentScroll']")
		description=descriptionpath.xpath("span[@itemprop='description']/p//text()").extract()
		timetable=descriptionpath.xpath("span[@itemprop='description']/p[2]//text()").extract()
		timetable2=descriptionpath.xpath("span[@itemprop='description']/p[3]//text()").extract()
		categoryPath=sel.xpath("//*[@id='gastronomy-content']/section[2]/div/section[1]/section/div/ul/li[2]/p[2]")
		category=categoryPath.xpath("a/strong/text()").extract()
		if len(description)>0:
			item['description_en']=' '.join(description)
		else:
			item['description_en']=''
		if len(category)>0:
			item['category_en']=['Restaurant',category.pop()]	
		else:
			item['category_en']=['Restaurant','Others']
		if len(timetable)>0:
			if len(timetable2)>0:
				item['timetable_en']=' '.join([timetable.pop(),timetable2.pop()])
			else:
				item['timetable_en']=timetable.pop()
		else:
			item['timetable_en']=''
		link=response.url
		link=link.replace("/en/","/eu/")
		request=Request(link,callback=self.parse_restaurants_eu)
		request.meta['item']=item
		yield request		
Developer: AritzBi, Project: BilbaoTourismScraper, Lines of code: 29, Source file: restaurantesBTurismo_spider.py

Example 5: parse

    def parse(self, response):
        ''' Parse response from start urls (/channels)
            
            Channels are grouped by category. So, this spider extracts the 
            category of each channel, and constructs a request with the meta 
            information of the category (that information would not be 
            available from the channel page otherwise)
        '''
        self.logger.debug("Parse url {}".format(response.url))        

        cat_container = response.xpath('/html/body/div[1]/div/article/div')
        
        # Channels are grouped by category in containers with class '.channel-category'
        for cat in cat_container.css('.channel-category'):
            # extract the title of the category
            cat_title = cat.xpath('h2/text()').extract_first()            
            # extract the link to the channel pages
            for channel in cat.css('ul.channel-grid li'):
                link = channel.xpath('a//@href').extract_first()
                full_link = loaders.contextualize(link, base_url=response.url)
                # Construct request               
                request = Request(full_link, callback=self.parse_channel)
                request.meta['category'] = cat_title
                
                yield request
Developer: miguelcb84, Project: ewe-scrapers, Lines of code: 25, Source file: ifttt_spiders.py

Example 6: _parse_symptom_question

    def _parse_symptom_question(self, response):
        symptom_question_item = response.meta.get('symptom_questions')
        # print response.url
        if not symptom_question_item:
            symptom_question_item = SymptomQuestionItem()
            symptom_question_item['symptom_name'] = response.meta['symptom_item']['name']
            symptom_question_item['qids'] = []

        # parse
        urls = response.xpath('//div[@class="p_list_li"]/div[@class="p_list_cent"]/div[@class="p_list_centt"]/dl/dt/a/@href').extract()
        symptom_question_item['qids'] += [u.split('/')[-1].split('.')[0] for u in urls]

        # last_url = response.xpath('//div[@class="portldet-content"]/a/@href').extract()[-1]
        next_url = response.xpath('//div[@class="portlet-content"]/a[text()="下一页 >"]/@href').extract()
        if not next_url:
            # all pages have been processed
            print symptom_question_item
            yield symptom_question_item
        else:
            url = next_url[0]
            # print url
            # print symptom_question_item['qids']
            request = Request(url, dont_filter=True, callback=self._parse_symptom_question)
            request.meta['symptom_questions'] = symptom_question_item
            # print request
            yield request
Developer: whypro, Project: medical_crawler, Lines of code: 26, Source file: a120ask.py

Example 7: parse_history

 def parse_history(self,response):
     #Parse Price History Table
     house = response.meta['item']
     tax_url = house['tax_url']
     price_history = []
     pattern = r' { "html": "(.*)" }'
     html = re.search(pattern, response.body).group(1)
     html = re.sub(r'\\"', r'"', html)  # Correct escaped quotes
     html = re.sub(r'\\/', r'/', html)  # Correct escaped forward
     if (html != ""):
         soup = BeautifulSoup(html)
         table = soup.find('table')
         table_body = table.find('tbody')
         rows = table_body.find_all('tr')
         for row in rows:
             cols = row.find_all('td')
             cols = [ele for ele in cols]
             cols = cols[:3]
             if (cols[2].find('span') != None):
                 date = cols[0].get_text()
                 event = cols[1].get_text()
                 price = cols[2].find('span').get_text()
                 price_history.append([date, event, price])
         #Store history as JSON string    
         house['price_history'] = json.dumps(price_history)
     tax_request = Request(tax_url, 
                       callback=self.parse_taxes)
     tax_request.meta['item'] = house
     
     return tax_request
Developer: KaroAntonio, Project: Guru, Lines of code: 30, Source file: zillow_spider.py

Example 8: parse_page

	def parse_page(self, response):
		if response.meta.has_key('crawldepth'):
			depth = response.meta['crawldepth']
		else:
		#       Set search depth here
			depth = 1
		log.msg('Depth = %s' % str(depth), level=log.INFO)
		if not isinstance(response, HtmlResponse):
		    log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
		    return

		log.msg('Response from: %s' % response.url, level=log.INFO)
		url_bf.add(response.url)
	
		# TODO: Extract page title
	
		extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
		cleaned_text = extractor.getText()

		# Eliminate duplicates
		keywordset = set(keywordlist)

		found_list = []
		for keyword in keywordset: # TODO: Is there a more efficient way to do this?
			# Look at word boundaries to match entire words only
			if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
				found_list.append(keyword)

		# Parse this page		
		item = BiffleItem()
		if (len(found_list) > 0):
			item['url'] = response.url
			item['body'] = cleaned_text
			item['keywords'] = ', '.join(found_list)
			item['process_date'] = datetime.today()
			log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
			self.map_keyword_count(found_list)
			yield item

		if (depth > 0):	
			# Find the next requests and yield those
			hxs = HtmlXPathSelector(response)
			links = hxs.select('//a/@href').extract()
			log.msg('Links on page: %s' % len(links), level=log.INFO)
			depth -= 1
			log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
			for l in links:
				l = urlparse.urljoin(response.url, l)
				if (l in url_bf):
					pass
					#log.msg('Duplicate URL found: %s' % l, level=log.INFO)
				else:
					url_bf.add(l)
					#log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
					# Decrement depth for next layer of links
					#callback = lambda response, depth = depth: self.parse_page(response, depth)			
					callback = lambda response: self.parse_page(response)
					request = Request(l, callback=callback)
					request.meta['crawldepth'] = depth
					yield request
Developer: Akibalogh, Project: biffle-prototype, Lines of code: 60, Source file: biffle_spider.py

Example 9: parse_monuments_en

	def parse_monuments_en(self,response):
		sel=Selector(response)
		monument=sel.xpath('//div[@class="col-50 content-desc"]')
		title=monument.xpath("h2[@class='big sec-color']/text()").extract()
		summary=''.join(monument.xpath("div[@id='idContentScroll']/span/p//text()").extract())
		informationLink=monument.xpath("div[@id='idContentScroll']/span/a/@href").extract()
		item = response.meta['item']
		if len(informationLink)>0:
			item['informationLink_en']=informationLink.pop()
		else:
			item['informationLink_en']=response.url
		if len(title)>0:
			item['name_en']=title.pop()
		else:
			item['name_en']=''
		if len(summary)>0:
			item['description_en']=summary
		else:
			item['description_en']=''
		if len(informationLink)>0:
			item['informationLink']=informationLink.pop()
		else:
			item['informationLink']=response.url
		
		euLink=sel.xpath('//*[@id="eu"]/@href').extract()
		request=Request(self.BASE+str(euLink.pop()),callback=self.parse_monuments_eu)
		request.meta['item']=item
		yield request
Developer: AritzBi, Project: BilbaoTourismScraper, Lines of code: 28, Source file: patrimonioBTurismo_spider.py

Example 10: parse_disease

    def parse_disease(self, response):
        """解析【疾病】页面"""
        disease_item = DiseaseItem()
        disease_item['url'] = response.url

        _name = response.xpath('//div[@class="p_lbox1"]/div[@class="p_lboxti"]/h3')
        disease_item['name'] = _name.xpath('text()').extract()[0]
        _other_name = _name.xpath('var/text()').extract()
        if _other_name:
            begin = _other_name[0].find(':') + 1
            end = _other_name[0].rfind(')')
            disease_item['aliases'] = re.split(',|,', _other_name[0][begin:end])

        _related = response.xpath('//div[@id="yw4"]/div/div/div')
        disease_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
        disease_item['related_symptoms'] = _related.xpath('ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
        # print disease_item['related_diseases'], disease_item['related_symptoms']
        # print disease_item
        yield disease_item

        # Go on parsing details
        detail_urls = response.xpath('//div[@class="p_lbox1_ab"]/a/@href').extract()
        detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a/@href').extract()
        # print detail_urls
        for url in detail_urls:
            request = Request(url=url, dont_filter=True, callback=self._parse_disease_detail)
            request.meta['disease_item'] = disease_item
            yield request

        # Go on parsing questions
        question_url = response.xpath('//div[@class="p_lbox5"]/div[@class="p_lboxti"]/a/@href').extract()[0]
        request = Request(url=question_url, dont_filter=True, callback=self._parse_disease_question)
        request.meta['disease_item'] = disease_item
        # print request
        yield request
Developer: whypro, Project: medical_crawler, Lines of code: 35, Source file: a120ask.py

Example 11: parseJsonProduct

    def parseJsonProduct(self, response):
        item = response.meta["item"]
        # make a valid json file out of it and remove unneeded data
        prodResponse = response.body.split("$+$")[0].strip().replace("'", '"')
        prodDict = {}
        sizeWidthDict = {}
        jsonresponse = json.loads(prodResponse)
        for product, value in jsonresponse.iteritems():
            if item["sku"] not in prodDict:
                prodDict[item["sku"]] = {}
            if value["c"] not in prodDict[item["sku"]]:
                prodDict[item["sku"]][value["c"]] = {}
            if value["w"] not in prodDict[item["sku"]][value["c"]]:
                prodDict[item["sku"]][value["c"]][value["w"]] = {}
            if value["s"] not in sizeWidthDict:
                sizeWidthDict[value["s"]] = []
            if value["w"] not in sizeWidthDict[value["s"]]:
                sizeWidthDict[value["s"]].append(value["w"])
            prodDict[item["sku"]][value["c"]][value["w"]][value["s"]] = value["sku"]
        item["variant"] = prodDict
        item["size_width_list"] = sizeWidthDict
        # request first imageset
        if item["imageSetUrls"]:
            color, href = item["imageSetUrls"].popitem()
            if len(href) > 1:
                item["imageSetUrls"][color] = href[1:]
            request = Request(href[0], callback=self.parseJsonImageSet)
            request.meta["item"] = item
            return request

        self.to_csv(item)
        return item
Developer: alfonsjose, Project: scrapers, Lines of code: 32, Source file: RoadRunnerSports.py

Example 12: parse

    def parse(self, response):

        tabs = []
        tab_selector = response.xpath('//div[@id="siteDirectory"]')
        ### loop for all tabs
        for tab in tab_selector.xpath('.//div[@class="popover-grouping"]'):
            tabNameSel = tab.xpath("h2/text()").extract()

            if tabNameSel:
                tabName = tabNameSel[0]

                fobj = open(tabName + ".txt", "a+")

            cat_selector = tab.xpath(".//ul")

            ### loop for all categories
            for category in cat_selector.xpath("li"):  #'.//div[contains(@class, "ht180")]
                catNameSel = category.xpath(
                    "a/text()"
                ).extract()  # //div[contains(@class, "top-menu unit")]/ul/li/div/div/div/ul/li[@class="heading"]
                # print category.extract()
                if catNameSel:
                    catName = catNameSel[0]
                catLinkSel = category.xpath("a/@href").extract()
                if catLinkSel:
                    catLink = "http://www.amazon.in" + catLinkSel[0]

                request = Request(catLink, callback=self.parse_subcatpage)
                request.meta["fobj"] = fobj
                request.meta["tabName"] = tabName
                request.meta["catName"] = catName
                yield request

        fobj.close()
Developer: ank-26, Project: Ecomm, Lines of code: 34, Source file: amazon_spider.py

Example 13: parse

    def parse(self, response):
        """First step of Mon/gr parsing."""
        try:
            # Connect to Beanstalkd server
            self.beanstalk = beanstalkc.Connection(host=self.host_beanstalkd, port=11301)

            # See all tubes:
            self.beanstalk.tubes()

            # Switch to the default (tube):
            self.beanstalk.use("default")

            # self.makedirResults()
            self.nodes = json.loads(response.body_as_unicode())

            for node in self.nodes:
                link_node = self.domain + self.nodes[node]
                request = Request(link_node, callback=self.parseDomain)
                # Pass metadata to the next wave of parsing
                request.meta["node"] = node
                yield request
        except:
            print "Please run the beanstalkc"

        return
Developer: nbompetsis, Project: ScrapyMonRg, Lines of code: 25, Source file: myspider.py

Example 14: parse_symptom

    def parse_symptom(self, response):
        """解析【症状】页面"""
        symptom_item = SymptomItem()
        symptom_item['url'] = response.url
        symptom_item['name'] = response.xpath('//div[@id="m_1"]/div[@class="p_sibox1 p_siboxbor"]/div[@class="p_sititile"]/span/h1/text()').extract()[0]

        _related = response.xpath('//div[@id="yw3"]/div/div')
        symptom_item['related_diseases'] = _related.xpath('ul/li/a[contains(@href, "/jibing/")]/@title').extract()
        # symptom_item['related_symptoms'] = _related.xpath('ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
        # print symptom_item['related_diseases'], symptom_item['related_symptoms']
        # print symptom_item
        yield symptom_item

        # Go on parsing details
        detail_urls = response.xpath('//dl[@class="p_sibox1dl clears"]/dt/a/@href').extract()
        detail_urls += response.xpath('//ul[@class="p_sibox2ul clears"]/li/a[1]/@href').extract()
        # print detail_urls
        for url in detail_urls:
            request = Request(url=url, dont_filter=True, callback=self._parse_symptom_detail)
            request.meta['symptom_item'] = symptom_item
            yield request

        # Go on parsing questions
        question_url = response.xpath('//div[@class="p_sibox4 p_siboxbor"]/div[@class="p_sititile"]/a/@href').extract()[0]
        request = Request(url=question_url, dont_filter=True, callback=self._parse_symptom_question)
        request.meta['symptom_item'] = symptom_item
        # print request
        yield request
Developer: whypro, Project: medical_crawler, Lines of code: 28, Source file: a120ask.py

Example 15: amazon_marketplace

	def amazon_marketplace(self,response):
		
		sel = Selector(response)
		item = response.meta['item']
		try:
			sp = sel.xpath("//span[@style='text-decoration: inherit; white-space: nowrap;']/text()").extract()[0].replace(",","")
			shippingcost = sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()
			if shippingcost:
				sp = str(float(sp) + float(sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()[0].replace(",","")))	
			
			if sp>item['SP']:
				sp = item['SP']
		except:			
			try:
				flipkart_url = flipkart_urls[item['index']]
				request = Request(flipkart_url,callback = self.flipkart_scraper)
				request.meta['item'] = item
				# request.meta['proxy'] = "http://111.161.126.100:80"
				yield request
		
			except:				
				try:
					paytm_url = paytm_urls[item['index']]
					request = Request(paytm_url,callback = self.paytm_scraper)
					request.meta['item'] = item
					request.meta['proxy'] = "http://111.161.126.100:80"
					yield request
				except:
					self.to_csv(item)
Developer: Diwahars, Project: scrapers, Lines of code: 29, Source file: PopularProductsScraper.py


Note: The scrapy.http.request.Request class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers, and the source code copyright belongs to the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.