本文整理匯總了 Python 中 scrapy.http.request.Request 類的典型用法代碼示例。如果您正苦於以下問題:Python Request 類的具體用法?Python Request 怎麼用?Python Request 使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了 Request 類的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: parse_user
def parse_user(self, response):
    """Scrape a user profile page into an ``MFWItem`` and follow the notes link.

    ``uid`` is read from ``response.meta``. Nothing is yielded for users whose
    level is 3 or lower, or whose level badge cannot be found. For all other
    users the profile fields are collected and a single ``Request`` is yielded
    with ``self.parse_note`` as callback and the partially filled item stored
    in ``meta['item']``.
    """
    item = MFWItem()
    item['uid'] = response.meta['uid']
    item['name'] = response.xpath(
        '//div[@class="MAvaName"]/text()').extract_first()

    # extract_first() yields None when the badge is absent; bail out instead
    # of failing on None.split().
    level_title = response.xpath(
        '//span[@class="MAvaLevel flt1"]/a/@title').extract_first()
    if level_title is None:
        return
    # The level is the integer that follows the last '.' in the title text
    item['level'] = int(level_title.split('.')[-1])
    if item['level'] <= 3:
        return

    item['tags'] = response.xpath(
        '//div[@class="its_tags"]//i[contains(@class, "on")]/../@title').extract()
    item['attention'] = [int(i) for i in response.xpath(
        '//div[@class="MAvaMore clearfix"]//a/text()').extract()]
    item['groups'] = response.xpath(
        '//div[@class="MGroupDetail"]//a[@class="name"]/text()').extract()
    item['dynamic'] = response.xpath(
        '//span[@class="time"]/text()').extract()

    # Keep only the activity entries whose text contains the download marker
    item['download'] = []
    for info in response.xpath('//div[@class="common_block relative_info"]/p'):
        if u'剛剛下載了' in ''.join(info.xpath('text()').extract()):
            item['download'].append({
                'time': info.xpath('span[@class="time"]/text()').extract_first(),
                'name': info.xpath('a/text()').extract()[-1],
            })

    # Empty placeholders; nothing in this method fills them
    item['note'] = {}
    item['path'] = []
    item['review'] = []
    item['together'] = []

    # NOTE(review): `note` is None when the link is missing; urljoin then
    # presumably falls back to response.url, re-requesting this same page
    # with parse_note as callback — confirm that is acceptable.
    note = response.xpath(u'//a[@title="TA的遊記"]/@href').extract_first()
    req = Request(urljoin(response.url, note), callback=self.parse_note)
    req.meta['item'] = item
    yield req
示例2: parse
def parse(self, response):
    """Queue a product-page request for every non-Fitbit product of a listing page.

    A ``DicksItem`` is prepared with:

    * ``Sort_Order`` - maps each absolute product URL to its position in the
      listing, offset by 48 positions per page (page number taken from the
      ``&page=`` part of the URL, page 1 when absent);
    * ``Category`` / ``id1`` - looked up from the module-level ``lyscat`` and
      ``priceid`` lists at the index where ``urllist`` equals the response
      URL (left unset when the URL is not listed).

    One ``Request`` with ``self.product_page`` as callback is yielded per
    product whose title does not contain "Fitbit".
    """
    sel = Selector(response)
    item = DicksItem()

    # Extract the page number and use it to offset the sort positions
    if "&page=" in response.url:
        pagenumber = float(response.url.split("&page=")[-1])
    else:
        pagenumber = 1
    first_position = 0 + ((pagenumber - 1) * 48)

    producturls = sel.xpath("//div[@class='prod-details']/h2/a/@href").extract()
    productnames = sel.xpath("//div[@class='prod-details']/h2/a/@title").extract()
    # Materialised because the pairs are walked twice below
    products = list(zip(producturls, productnames))

    item["Sort_Order"] = {}
    for position, (url, _name) in enumerate(products):
        item["Sort_Order"]["http://www.dickssportinggoods.com" + url] = first_position + position

    # Compare the category URL and assign the matching categorisation.
    # urllist / lyscat / priceid are assumed to be parallel lists.
    for category_url, category, price_id in zip(urllist, lyscat, priceid):
        if category_url == response.url:
            item['Category'] = category
            item['id1'] = price_id
            break

    for url, name in products:
        if "Fitbit" not in name:
            request = Request("http://www.dickssportinggoods.com" + url, self.product_page)
            # Give every request its own (shallow) copy: a single shared item
            # would be overwritten by whichever product callback runs last.
            request.meta["item"] = item.copy()
            yield request
示例3: getItem
def getItem(self, school):
    """Turn one school listing node into a ``SchoolItem`` and return the follow-up request.

    Text fields fall back to ``""`` when their XPath matches nothing;
    ``phone`` keeps the full list of extracted strings. The returned
    ``Request`` targets the school's own page, uses
    ``self.parse_schoolIntroUrl`` as callback and carries the item in
    ``meta["item"]``. An ``IndexError`` is raised when the listing has no
    link to the school page.
    """
    def first_or_blank(values):
        # First extracted value, or "" when nothing was extracted
        return values[0] if values else ""

    record = SchoolItem()
    record["logo"] = first_or_blank(
        school.xpath('div/div[contains(@class,"school_m_img fl")]/a/img/@src').extract())

    # Name, location and classification live under school_m_main
    main = school.xpath('div/div[contains(@class,"school_m_main fl")]')
    record["name"] = first_or_blank(main.xpath("li/h3/a/text()").extract())

    # The location is whitespace separated: province, city, area.
    # Missing trailing parts are padded with "".
    raw_location = main.xpath("li[2]/b/text()").extract()
    parts = raw_location[0].split() if raw_location else []
    padded = (parts + ["", "", ""])[:3]
    record["province"], record["city"], record["area"] = padded

    record["catagery"] = first_or_blank(main.xpath("li[3]/b/text()").extract())
    record["schoolType"] = first_or_blank(main.xpath("li[4]/ol[1]/b/text()").extract())
    record["level"] = first_or_blank(main.xpath("li[4]/ol[2]/b/text()").extract())

    # Address and phone live under school_m_lx
    contact = school.xpath('ul[contains(@class,"school_m_lx")]')
    record["address"] = first_or_blank(contact.xpath("li[1]/b/text()").extract())
    record["phone"] = contact.xpath("li[2]/b/text()").extract()

    school_url = main.xpath("li/h3/a/@href").extract()[0]
    return Request(school_url, callback=self.parse_schoolIntroUrl, meta={"item": record})
示例4: parse_restaurants_en
def parse_restaurants_en(self, response):
    """Add the English description, category and timetable to the item, then request the "/eu/" page.

    The item comes from ``response.meta['item']``. Fields written:
    ``description_en`` (all description text joined with spaces, or ``''``),
    ``category_en`` (``['Restaurant', <last category found>]``, with
    ``'Others'`` as fallback) and ``timetable_en`` (last text node of the
    second description paragraph, joined with that of the third paragraph
    when present, or ``''``). The yielded request swaps "/en/" for "/eu/" in
    the current URL and is handled by ``self.parse_restaurants_eu``.
    """
    page = Selector(response)
    item = response.meta['item']

    content = page.xpath("//*[@id='idContentScroll']")
    paragraphs = content.xpath("span[@itemprop='description']/p//text()").extract()
    hours_first = content.xpath("span[@itemprop='description']/p[2]//text()").extract()
    hours_second = content.xpath("span[@itemprop='description']/p[3]//text()").extract()
    categories = page.xpath(
        "//*[@id='gastronomy-content']/section[2]/div/section[1]/section/div/ul/li[2]/p[2]"
    ).xpath("a/strong/text()").extract()

    item['description_en'] = ' '.join(paragraphs) if paragraphs else ''
    item['category_en'] = ['Restaurant', categories[-1] if categories else 'Others']

    if not hours_first:
        item['timetable_en'] = ''
    elif hours_second:
        item['timetable_en'] = ' '.join([hours_first[-1], hours_second[-1]])
    else:
        item['timetable_en'] = hours_first[-1]

    yield Request(
        response.url.replace("/en/", "/eu/"),
        callback=self.parse_restaurants_eu,
        meta={'item': item},
    )
示例5: parse
def parse(self, response):
    """Parse the channel directory returned for the start URLs (/channels).

    Channels are listed per category. The category title is only read here,
    so it is attached to each channel request as ``meta['category']`` for
    ``self.parse_channel`` to pick up.
    """
    self.logger.debug("Parse url {}".format(response.url))
    container = response.xpath('/html/body/div[1]/div/article/div')
    # Every '.channel-category' container holds the channels of one category
    for category in container.css('.channel-category'):
        title = category.xpath('h2/text()').extract_first()
        for entry in category.css('ul.channel-grid li'):
            href = entry.xpath('a//@href').extract_first()
            yield Request(
                loaders.contextualize(href, base_url=response.url),
                callback=self.parse_channel,
                meta={'category': title},
            )
示例6: _parse_symptom_question
def _parse_symptom_question(self, response):
    """Collect the question ids of a symptom across its paginated question list.

    The accumulating item travels between pages in
    ``response.meta['symptom_questions']``; on the first page it is created
    from ``response.meta['symptom_item']['name']`` with an empty ``qids``
    list. Each page appends the ids found in its question links (last path
    segment, up to the first '.'). While a "next page" link exists a request
    for it is yielded with this method as callback; on the last page the
    finished item is printed and yielded.
    """
    symptom_question_item = response.meta.get('symptom_questions')
    if not symptom_question_item:
        symptom_question_item = SymptomQuestionItem()
        symptom_question_item['symptom_name'] = response.meta['symptom_item']['name']
        symptom_question_item['qids'] = []

    urls = response.xpath('//div[@class="p_list_li"]/div[@class="p_list_cent"]/div[@class="p_list_centt"]/dl/dt/a/@href').extract()
    symptom_question_item['qids'] += [u.split('/')[-1].split('.')[0] for u in urls]

    next_url = response.xpath('//div[@class="portlet-content"]/a[text()="下一頁 >"]/@href').extract()
    if not next_url:
        # All pages have been processed
        print(symptom_question_item)
        yield symptom_question_item
    else:
        # dont_filter=True: the request must not be dropped as a duplicate
        request = Request(next_url[0], dont_filter=True, callback=self._parse_symptom_question)
        request.meta['symptom_questions'] = symptom_question_item
        yield request
示例7: parse_history
def parse_history(self, response):
    """Parse the price-history table and return the request for the tax page.

    The response body is expected to contain `` { "html": "..." }`` with an
    escaped HTML fragment. Each table row whose third cell holds a ``<span>``
    contributes ``[date, event, price]``; the list is stored on the item
    (``response.meta['item']``) as a JSON string under ``price_history``.
    A body without the wrapper, without a table, or with short rows simply
    produces fewer (possibly zero) entries instead of raising.

    Returns a ``Request`` for ``item['tax_url']`` with ``self.parse_taxes``
    as callback and the item in ``meta['item']``.
    """
    house = response.meta['item']
    tax_url = house['tax_url']
    price_history = []

    match = re.search(r' { "html": "(.*)" }', response.body)
    # No wrapper found: behave as if the fragment were empty
    html = match.group(1) if match else ""
    html = html.replace('\\"', '"')  # Correct escaped quotes
    html = html.replace('\\/', '/')  # Correct escaped forward slashes

    if html != "":
        soup = BeautifulSoup(html)
        table = soup.find('table')
        table_body = table.find('tbody') if table is not None else None
        rows = table_body.find_all('tr') if table_body is not None else []
        for row in rows:
            cols = row.find_all('td')[:3]
            if len(cols) < 3:
                # Rows without date/event/price cells carry no history entry
                continue
            price_cell = cols[2].find('span')
            if price_cell is not None:
                price_history.append(
                    [cols[0].get_text(), cols[1].get_text(), price_cell.get_text()])

    # Store history as JSON string
    house['price_history'] = json.dumps(price_history)
    tax_request = Request(tax_url, callback=self.parse_taxes)
    tax_request.meta['item'] = house
    return tax_request
示例8: parse_page
def parse_page(self, response):
    """Record keyword hits on a page and follow its links while depth remains.

    The remaining crawl depth is read from ``response.meta['crawldepth']``
    (1 when the key is absent). Non-HTML responses are logged and ignored.
    The extracted article text is searched for every entry of the
    module-level ``keywordlist`` as a whole word; when at least one matches,
    a ``BiffleItem`` is yielded and ``self.map_keyword_count`` is called with
    the matches. If depth is still positive, every link not yet present in
    the module-level ``url_bf`` is requested with this method as callback and
    the decremented depth in ``meta['crawldepth']``.
    """
    # Set search depth here: the default applies to responses without the key
    depth = response.meta.get('crawldepth', 1)
    log.msg('Depth = %s' % str(depth), level=log.INFO)
    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return

    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)

    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()

    # set() eliminates duplicate keywords; the \b anchors match entire words only
    found_list = [
        keyword for keyword in set(keywordlist)
        if re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)
    ]

    # Parse this page
    if found_list:
        keywords = ', '.join(found_list)
        item = BiffleItem()
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = keywords
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % keywords, level=log.INFO)
        self.map_keyword_count(found_list)
        yield item

    if depth > 0:
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        # Decrement depth for the next layer of links
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for link in links:
            link = urlparse.urljoin(response.url, link)
            if link in url_bf:
                # Already seen: skip
                continue
            url_bf.add(link)
            request = Request(link, callback=self.parse_page)
            request.meta['crawldepth'] = depth
            yield request
示例9: parse_monuments_en
def parse_monuments_en(self, response):
    """Fill the English fields of a monument item and request the page behind the "eu" link.

    The item is taken from ``response.meta['item']``. Fields written:
    ``informationLink_en``, ``name_en``, ``description_en`` and
    ``informationLink``; link fields fall back to ``response.url`` and text
    fields to ``''``. The yielded request targets ``self.BASE`` plus the href
    of the element with id "eu" and is handled by
    ``self.parse_monuments_eu``. An ``IndexError`` is raised when that
    element is missing.
    """
    page = Selector(response)
    block = page.xpath('//div[@class="col-50 content-desc"]')
    titles = block.xpath("h2[@class='big sec-color']/text()").extract()
    description = ''.join(
        block.xpath("div[@id='idContentScroll']/span/p//text()").extract())
    links = block.xpath("div[@id='idContentScroll']/span/a/@href").extract()
    item = response.meta['item']

    item['informationLink_en'] = links.pop() if links else response.url
    item['name_en'] = titles.pop() if titles else ''
    # An empty join is already '', so no separate fallback is required
    item['description_en'] = description
    # NOTE(review): the pop above consumed the last link, so this receives
    # the previous one, or response.url when only one link existed —
    # confirm this is intended.
    item['informationLink'] = links.pop() if links else response.url

    eu_links = page.xpath('//*[@id="eu"]/@href').extract()
    yield Request(
        self.BASE + str(eu_links.pop()),
        callback=self.parse_monuments_eu,
        meta={'item': item},
    )
示例10: parse_disease
def parse_disease(self, response):
    """Parse a disease page.

    Yields, in order: the ``DiseaseItem`` (``url``, ``name``, optional
    ``aliases``, ``related_diseases``, ``related_symptoms``); one request per
    detail link handled by ``self._parse_disease_detail``; and one request
    for the question list handled by ``self._parse_disease_question``. Every
    request carries the item in ``meta['disease_item']`` and bypasses the
    duplicate filter.
    """
    heading = response.xpath('//div[@class="p_lbox1"]/div[@class="p_lboxti"]/h3')

    disease_item = DiseaseItem()
    disease_item['url'] = response.url
    disease_item['name'] = heading.xpath('text()').extract()[0]

    alias_texts = heading.xpath('var/text()').extract()
    if alias_texts:
        raw = alias_texts[0]
        # The aliases sit between the first ':' and the last ')'.
        # NOTE(review): both alternatives of the split pattern are the same
        # character as written — confirm against the page markup.
        disease_item['aliases'] = re.split(',|,', raw[raw.find(':') + 1:raw.rfind(')')])

    related = response.xpath('//div[@id="yw4"]/div/div/div')
    disease_item['related_diseases'] = related.xpath(
        'ul/li/a[contains(@href, "/jibing/")]/@title').extract()
    disease_item['related_symptoms'] = related.xpath(
        'ul/li/a[contains(@href, "/zhengzhuang/")]/@title').extract()
    yield disease_item

    # Go on parsing details
    detail_urls = (
        response.xpath('//div[@class="p_lbox1_ab"]/a/@href').extract()
        + response.xpath('//ul[@class="p_sibox2ul clears"]/li/a/@href').extract()
    )
    for detail_url in detail_urls:
        yield Request(
            url=detail_url,
            dont_filter=True,
            callback=self._parse_disease_detail,
            meta={'disease_item': disease_item},
        )

    # Go on parsing questions
    question_links = response.xpath(
        '//div[@class="p_lbox5"]/div[@class="p_lboxti"]/a/@href').extract()
    yield Request(
        url=question_links[0],
        dont_filter=True,
        callback=self._parse_disease_question,
        meta={'disease_item': disease_item},
    )
示例11: parseJsonProduct
def parseJsonProduct(self, response):
    """Build the variant tables of an item from a product JSON response.

    The item is read from ``response.meta["item"]``. The part of the body
    before the first ``"$+$"`` is turned into JSON (single quotes replaced by
    double quotes) and is expected to map product keys to dicts with the
    keys ``c``, ``w``, ``s`` and ``sku``. Two fields are written:

    * ``variant`` - ``{item sku: {c: {w: {s: variant sku}}}}``
    * ``size_width_list`` - ``{s: [w, ...]}`` without duplicates

    Returns a ``Request`` for the next image-set URL (callback
    ``self.parseJsonImageSet``, item in ``meta["item"]``) while
    ``item["imageSetUrls"]`` still holds URLs; remaining URLs of the popped
    colour are put back. Otherwise the item is passed to ``self.to_csv`` and
    returned.
    """
    item = response.meta["item"]

    body = response.body
    if not isinstance(body, str):
        # Python 3 bytes body; UTF-8 is assumed here — TODO confirm encoding
        body = body.decode("utf-8")
    # Make valid JSON out of it and remove unneeded data
    prodResponse = body.split("$+$")[0].strip().replace("'", '"')

    sku = item["sku"]
    prodDict = {}
    sizeWidthDict = {}
    for value in json.loads(prodResponse).values():
        sizes = prodDict.setdefault(sku, {}).setdefault(value["c"], {}).setdefault(value["w"], {})
        sizes[value["s"]] = value["sku"]
        widths = sizeWidthDict.setdefault(value["s"], [])
        if value["w"] not in widths:
            widths.append(value["w"])
    item["variant"] = prodDict
    item["size_width_list"] = sizeWidthDict

    # Request the first available image set
    while item["imageSetUrls"]:
        color, href = item["imageSetUrls"].popitem()
        if not href:
            # A colour without any URL has nothing to request; try the next
            continue
        if len(href) > 1:
            item["imageSetUrls"][color] = href[1:]
        request = Request(href[0], callback=self.parseJsonImageSet)
        request.meta["item"] = item
        return request

    self.to_csv(item)
    return item
示例12: parse
def parse(self, response):
    """Walk the site-directory popover and request every category page.

    For each tab (``popover-grouping`` block) with a heading, a text file
    named ``<heading>.txt`` is opened in append mode and one ``Request`` per
    category link is yielded with ``self.parse_subcatpage`` as callback.
    Each request carries ``fobj`` (the open file), ``tabName`` and
    ``catName`` in its meta. Tabs without a heading and categories without a
    name or link are skipped.
    """
    directory = response.xpath('//div[@id="siteDirectory"]')
    # Loop over all tabs
    for tab in directory.xpath('.//div[@class="popover-grouping"]'):
        tabNameSel = tab.xpath("h2/text()").extract()
        if not tabNameSel:
            # No heading means no file name; skip rather than reuse the
            # previous tab's (or an unbound) file handle.
            continue
        tabName = tabNameSel[0]

        # NOTE(review): the handle is closed once this tab's requests have
        # been generated, which is presumably before parse_subcatpage runs
        # for them — confirm the callback does not write to meta["fobj"].
        with open(tabName + ".txt", "a+") as fobj:
            # Loop over all categories of the tab
            for category in tab.xpath(".//ul").xpath("li"):
                catNameSel = category.xpath("a/text()").extract()
                catLinkSel = category.xpath("a/@href").extract()
                if not (catNameSel and catLinkSel):
                    continue
                request = Request(
                    "http://www.amazon.in" + catLinkSel[0],
                    callback=self.parse_subcatpage,
                )
                request.meta["fobj"] = fobj
                request.meta["tabName"] = tabName
                request.meta["catName"] = catNameSel[0]
                yield request
示例13: parse
def parse(self, response):
    """First step of Mon/gr parsing.

    Connects to the Beanstalkd server at ``self.host_beanstalkd`` (port
    11301) and selects the "default" tube; if the server cannot be reached a
    hint is printed and nothing is yielded. The response body is then decoded
    as JSON into ``self.nodes`` and one ``Request`` per node is yielded for
    ``self.domain + self.nodes[node]`` with ``self.parseDomain`` as callback
    and the node key in ``meta["node"]``.
    """
    try:
        # Connect to Beanstalkd server
        self.beanstalk = beanstalkc.Connection(host=self.host_beanstalkd, port=11301)
        # See all tubes:
        self.beanstalk.tubes()
        # Switch to the default (tube):
        self.beanstalk.use("default")
    except beanstalkc.SocketError:
        # Only connection problems are reported here; errors in the parsing
        # below are no longer mislabelled as a missing server.
        print("Please run the beanstalkc")
        return

    self.nodes = json.loads(response.body_as_unicode())
    for node in self.nodes:
        link_node = self.domain + self.nodes[node]
        request = Request(link_node, callback=self.parseDomain)
        # Pass metadata to the next wave of parsing
        request.meta["node"] = node
        yield request
示例14: parse_symptom
def parse_symptom(self, response):
    """Parse a symptom page.

    Yields, in order: the ``SymptomItem`` (``url``, ``name``,
    ``related_diseases``); one request per detail link handled by
    ``self._parse_symptom_detail``; and one request for the question list
    handled by ``self._parse_symptom_question``. Every request carries the
    item in ``meta['symptom_item']`` and bypasses the duplicate filter.
    """
    title_texts = response.xpath(
        '//div[@id="m_1"]/div[@class="p_sibox1 p_siboxbor"]/div[@class="p_sititile"]/span/h1/text()'
    ).extract()

    symptom_item = SymptomItem()
    symptom_item['url'] = response.url
    symptom_item['name'] = title_texts[0]
    related = response.xpath('//div[@id="yw3"]/div/div')
    symptom_item['related_diseases'] = related.xpath(
        'ul/li/a[contains(@href, "/jibing/")]/@title').extract()
    yield symptom_item

    # Go on parsing details
    detail_urls = (
        response.xpath('//dl[@class="p_sibox1dl clears"]/dt/a/@href').extract()
        + response.xpath('//ul[@class="p_sibox2ul clears"]/li/a[1]/@href').extract()
    )
    for detail_url in detail_urls:
        yield Request(
            url=detail_url,
            dont_filter=True,
            callback=self._parse_symptom_detail,
            meta={'symptom_item': symptom_item},
        )

    # Go on parsing questions
    question_links = response.xpath(
        '//div[@class="p_sibox4 p_siboxbor"]/div[@class="p_sititile"]/a/@href').extract()
    yield Request(
        url=question_links[0],
        dont_filter=True,
        callback=self._parse_symptom_question,
        meta={'symptom_item': symptom_item},
    )
示例15: amazon_marketplace
def amazon_marketplace(self,response):
    """Read the marketplace price from the page; on any failure fall back to other stores.

    The item comes from ``response.meta['item']``. If reading the price
    raises, a request for the item's entry in the module-level
    ``flipkart_urls`` is yielded (callback ``self.flipkart_scraper``); if
    that raises too, a request for its entry in ``paytm_urls`` is yielded
    (callback ``self.paytm_scraper``, through a fixed proxy); if that also
    raises, the item is handed to ``self.to_csv``.

    NOTE(review): the listing may be truncated — on the success path the
    computed price is not stored or yielded by the visible code.
    """
    sel = Selector(response)
    item = response.meta['item']
    try:
        # First matching price text with the commas removed; raises
        # IndexError (caught below) when the page has no such element.
        sp = sel.xpath("//span[@style='text-decoration: inherit; white-space: nowrap;']/text()").extract()[0].replace(",","")
        shippingcost = sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()
        if shippingcost:
            # Add the first shipping price; the sum is turned back into a string
            sp = str(float(sp) + float(sel.xpath("//span[@class='olpShippingPrice']/span/text()").extract()[0].replace(",","")))
        # NOTE(review): `sp` is a string at this point, so this is not a
        # numeric comparison unless item['SP'] is handled elsewhere — confirm.
        if sp>item['SP']:
            sp = item['SP']
    except:
        # NOTE(review): bare `except:` clauses catch every exception,
        # including ones raised at the `yield` statements inside the
        # nested try blocks.
        try:
            flipkart_url = flipkart_urls[item['index']]
            request = Request(flipkart_url,callback = self.flipkart_scraper)
            request.meta['item'] = item
            # request.meta['proxy'] = "http://111.161.126.100:80"
            yield request
        except:
            try:
                paytm_url = paytm_urls[item['index']]
                request = Request(paytm_url,callback = self.paytm_scraper)
                request.meta['item'] = item
                request.meta['proxy'] = "http://111.161.126.100:80"
                yield request
            except:
                # Last resort: persist the item as it is
                self.to_csv(item)