This article collects typical usage examples of Python's scrapy.Request class. If you are wondering what the Request class is for, how to use it, or what real-world code that uses it looks like, the hand-picked examples below should help.
The following shows 15 code examples of the Request class, sorted by popularity by default.
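Nearly every example below follows the same pattern: construct a Request with a target url and a callback, attach extra data through request.meta, and read that data back in the callback via response.meta. The following minimal sketch illustrates that pattern in isolation; the spider name, URLs and selectors are placeholders invented here, not taken from any of the examples.

import scrapy
from scrapy import Request


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the common Request/meta pattern.
    name = "example"
    start_urls = ["https://example.com/list"]

    def parse(self, response):
        for href in response.css("a.detail::attr(href)").getall():
            request = Request(
                url=response.urljoin(href),
                callback=self.parse_detail,  # receives the downloaded Response
            )
            # Arbitrary data attached here is carried along with the request.
            request.meta["item"] = {"list_url": response.url}
            yield request

    def parse_detail(self, response):
        # The attached data is available again on the response.
        item = response.meta["item"]
        item["detail_url"] = response.url
        yield item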
Example 1: form_part_request
def form_part_request(url, callback, part):
    request = Request(
        url=url,
        callback=callback
    )
    request.meta['part'] = part
    return request
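As a usage note, a helper like this would typically be called from a spider callback, with the tagged value read back in the next callback. A hypothetical sketch, assuming the helper above is importable and these two methods belong to a Spider subclass; the part names and URLs are placeholders:

def parse(self, response):
    # Inside a hypothetical Spider subclass: schedule one tagged request per part.
    for part in ("header", "detail"):
        yield form_part_request(
            url=response.urljoin("/%s" % part),
            callback=self.parse_part,
            part=part,
        )

def parse_part(self, response):
    # The 'part' value stored on request.meta comes back via response.meta.
    self.logger.info("parsed part %r from %s", response.meta["part"], response.url)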
Example 2: parse
def parse(self, response):
    # Throttle pagination: sleep 25 seconds on every second page.
    self.pageNo += 1
    if self.pageNo % 2 == 0:
        time.sleep(25)
    select = Selector(response)
    if "shopDetail" not in response.meta:
        # Shop list page
        allNo = self.questionIdPatten.findall(response.url)
        cityId = allNo[0]  # city id
        pageNumber = allNo[-1]
        # Log the page
        self.fw.write("%s cityId:%s, pageNumber:%s\n" % (response.url, cityId, pageNumber))
        self.fw.flush()
        item = DianpingItem()
        item["city_id"] = cityId
        try:
            cityName = select.css(".city").xpath("./text()").extract()[0]
        except Exception, e:
            cityName = ""
            print e
        # self.fw.write("%s\t%s\n" % (cityId, cityName))
        # self.fw.flush()
        yieldPageFlag = False
        shop_list = select.xpath(".//div[@class='info']")
        for li in shop_list:
            yieldPageFlag = True
            item["shop_name"] = li.xpath(".//p[@class='title']/a/text()").extract()[0]
            item["shop_cityname"] = cityName  # district
            # shop_domain is used as a tag rather than the district; the crawled district refers to the area
            item["shop_domain"] = ",".join(li.xpath(".//p[@class='area-key']/span[@class='area-list']/a/text()").extract())
            key_list = ",".join(li.xpath(".//p[@class='area-key']/span[@class='key-list']/a/text()").extract())
            item["shop_tag"] = ",".join([key_list, item["shop_domain"]])  # tags include the district
            # href = '/shop/123456'
            href = li.xpath(".//p[@class='title']/a[@class='shopname']/@href").extract()[0]
            item["shop_id"] = href.split("/")[-1]
            shopUrl = "http://www.dianping.com" + href
            request = Request(shopUrl, callback=self.parse, priority=1234567)  # shop detail request
            request.meta["shopDetail"] = copy.deepcopy(item)
            yield request
        if yieldPageFlag:
            # If the current page has data, request the next page
            nextPageNumber = int(pageNumber) + 1
            url = self.pageUrl % (cityId, nextPageNumber)
            request = Request(url, callback=self.parse, priority=1234)
            yield request
Example 3: parse_hiker_info
def parse_hiker_info(self, response):
    # TODO: Somehow obtain the Hiker's direction 'dir'.
    # TODO: Somehow obtain the Hiker's trail start date 'start_date'
    # TODO: Somehow obtain the Hiker's trail estimated end date 'end_date'
    print("Response received: %s" % response)
    print("Parsing Hiker Info from response: %s" % response)
    hiker = HikerItem()
    hiker['id'] = self.extract_hiker_id(response=response)
    hiker_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[2]"
    hiker_name = Selector(response=response).xpath(hiker_name_xpath).extract()[0]
    hiker_name_start = str.find(hiker_name, "-", 0, len(hiker_name))
    hiker_name_end = str.find(hiker_name, "<", hiker_name_start, len(hiker_name))
    hiker_name = hiker_name[hiker_name_start + 1:hiker_name_end]
    hiker_name = str.strip(hiker_name, " ")
    hiker['name'] = hiker_name
    hiker_trail_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[1]/b"
    hiker_trail_name = Selector(response=response).xpath(hiker_trail_name_xpath).extract()[0]
    hiker_trail_name_start = str.find(hiker_trail_name, ">", 0, len(hiker_trail_name))
    hiker_trail_name_end = str.find(hiker_trail_name, "<", hiker_trail_name_start, len(hiker_trail_name))
    hiker_trail_name = hiker_trail_name[hiker_trail_name_start + 1:hiker_trail_name_end]
    hiker['trail_name'] = hiker_trail_name
    hiker['about_url'] = response.url
    # TODO: Verify that the 'journal_url' is the FIRST journal entry.
    hiker['journal_url'] = str.replace(response.url, "about", "entry")
    journal_parse_request = Request(hiker['journal_url'], callback=self.parse_hiker_journal)
    journal_parse_request.meta['hiker'] = hiker
    yield journal_parse_request
Example 4: test_hs_middlewares_retry
def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)
    hs_downloader_middleware.process_request(request_0, spider)
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0
    hs_downloader_middleware.process_response(request_0, response_0, spider)
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0
    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None
    hs_downloader_middleware.process_request(request_1, spider)
    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
Example 5: start_requests
def start_requests(self):
    kwargs = {
        'debug': self.settings.getbool('GIANT_DEBUG'),
        'limit': self.settings.getint('GIANT_LIMIT'),
        'opt': 'twse'
    }
    requests = []
    for stockid in TwseIdDBHandler().stock.get_ids(**kwargs):
        for mon in range(4, -1, -1):
            timestamp = datetime.utcnow() - relativedelta(months=mon)
            if mon == 0:
                if timestamp.day == 1 and timestamp.hour <= 14:
                    continue
            URL = (
                'http://www.twse.com.tw/ch/trading/exchange/' +
                'STOCK_DAY/STOCK_DAY_print.php?genpage=genpage/' +
                'Report%(year)d%(mon)02d/%(year)d%(mon)02d_F3_1_8_%(stock)s.php' +
                '&type=csv') % {
                    'year': timestamp.year,
                    'mon': timestamp.month,
                    'stock': stockid
                }
            request = Request(
                URL,
                callback=self.parse,
                dont_filter=True)
            item = TwseHisStockItem()
            item['stockid'] = stockid
            request.meta['item'] = item
            requests.append(request)
    return requests
Example 6: parse_depute
def parse_depute(self, response):
    depute = json.loads(response.body_as_unicode())
    if 'depute' in depute:
        depute = depute['depute']
    depute['photo_url'] = self.photo_url % depute['slug']
    req = None
    for ad in depute['adresses']:
        adresse = ad['adresse']
        pattern = ur'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)'
        for telm in re.finditer(pattern, adresse):
            if telm.group(1) == 'phone':
                ad['tel'] = telm.group(2)
            else:
                ad['fax'] = telm.group(2)
        lad = adresse.lower()
        if not req and not lad.startswith(u'assemblée nationale'):
            trimmed = re.sub(pattern, '', adresse)
            req = Request(url=self.get_geocode_url(adresse),
                          callback=self.parse_geocode)
            req.meta['depute'] = depute
            req.meta['adresse'] = ad
    if req is not None:
        yield req
    else:
        yield depute
Example 7: parse
def parse(self, response):
    for href in response.xpath('//div[contains(@id, "dnn_ctr430_ExbList_pnlList")]//ul//li//a/@href'):
        url = response.urljoin(href.extract())
        request = Request(url, callback=self.parse_exhibition)
        request.meta['dont_redirect'] = True
        yield request
Example 8: parse
def parse(self, response):
    'Fetch the shop detail pages'
    req = []
    plazaId = response.url.split('/')[-1]
    sel = Selector(response)
    gouwu = sel.xpath('//*[@class="hot-top fn-clear"]/div')
    i = 1
    for gouwushop in gouwu:
        shopsurl = 'http://www.dianping.com' + gouwushop.xpath('a[1]/@href').extract()[0].strip()
        shopImg = []
        shopImg = gouwushop.xpath('a[1]/img/@src').extract()
        item = PlazaShop()
        item['plazaId'] = plazaId
        if i <= 4:
            item['shopCatetory1'] = '购物'  # "shopping"
        else:
            item['shopCatetory1'] = '餐饮'  # "food and drink"
        item['shopUrl'] = shopsurl
        item['image_urls'] = shopImg
        r = Request(shopsurl, callback=self.shop_detail)
        r.meta['item'] = item
        i = i + 1
        req.append(r)
    return req
Example 9: parse_job_list_page
def parse_job_list_page(self, response):
    self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)
    feed_parser = feedparser.parse(response.body)
    for job_entry in feed_parser.entries:
        job_url = job_entry.link
        job_publication_date = datetime.fromtimestamp(mktime(job_entry.published_parsed))
        job_publication_time = mktime(job_publication_date.timetuple())
        last_job_publication_time = mktime(self._last_job_date.timetuple())
        if job_publication_time <= last_job_publication_time:
            self.get_connector().log(self.name,
                                     self.ACTION_MARKER_FOUND,
                                     "%s <= %s" % (job_publication_time, last_job_publication_time))
            return
        prepared_job = JobItem()
        request = Request(job_url, self.parse_job_page)
        request.meta['item'] = prepared_job
        prepared_job['title'] = job_entry.title
        prepared_job['description'] = job_entry.description
        prepared_job['publication_datetime'] = job_publication_date
        yield request
Example 10: parse
def parse(self, response):
    """Parse an APS record into a HEP record.

    Attempts to parse an XML JATS full text first, if available, and falls
    back to parsing JSON if such is not available.
    """
    aps_response = json.loads(response.body_as_unicode())
    for article in aps_response['data']:
        doi = get_value(article, 'identifiers.doi', default='')
        if doi:
            request = Request(url='{}/{}'.format(self.aps_base_url, doi),
                              headers={'Accept': 'text/xml'},
                              callback=self._parse_jats,
                              errback=self._parse_json_on_failure)
            request.meta['json_article'] = article
            request.meta['original_response'] = response
            yield request

    # Pagination support. Will yield until no more "next" pages are found
    if 'Link' in response.headers:
        links = link_header.parse(response.headers['Link'])
        next = links.links_by_attr_pairs([('rel', 'next')])
        if next:
            next_url = next[0].href
            yield Request(next_url)
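Almost all of these examples pass data to the callback through request.meta. As a side note, Scrapy 1.7+ also offers cb_kwargs, which delivers the data as ordinary keyword arguments of the callback. A minimal hypothetical sketch, with placeholder URL, DOI and parse_article callback:

def parse(self, response):
    # cb_kwargs passes 'article' directly into the callback's signature
    # instead of going through request.meta (requires Scrapy 1.7+).
    yield Request(
        "https://example.com/article/1",
        callback=self.parse_article,
        cb_kwargs={"article": {"doi": "10.0000/example"}},
    )

def parse_article(self, response, article):
    # 'article' arrives as a normal keyword argument.
    article["url"] = response.url
    yield article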
Example 11: parse_job_list_page
def parse_job_list_page(self, response):
    """
    Parsing of the job list
    """
    self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)
    try:
        for jobs in self._get_from_list__jobs_lists(response):
            for job in self._get_from_list__jobs(jobs):
                # First we check the url. If the job already exists, skip crawling it
                # (it means that the page has already been crawled).
                try:
                    url = self._get_from_list__url(job)
                except NotCrawlable:
                    break
                if self.get_connector().job_exist(url):
                    self.get_connector().log(self.name, self.ACTION_MARKER_FOUND, url)
                    raise StopCrawlJobList()
                request = Request(url, self.parse_job_page)
                prefilled_job_item = self._get_prefilled_job_item(job, url)
                request.meta['item'] = prefilled_job_item
                if self.is_from_page_enabled():
                    yield request
                else:
                    yield prefilled_job_item
        next_page_url = self._get_from_list__next_page(response)
        if next_page_url:
            yield Request(url=next_page_url)
    except NotFound, exc:
        self.get_connector().log(self.name, self.ACTION_CRAWL_ERROR, str(exc))
Example 12: parse_data
def parse_data(self, response):
    rows = response.selector.xpath(
        '//*[@id="container-outer"]/div[1]/div[3]/div/div/div[2]/table/tbody/tr')
    for rows in rows:
        if rows.xpath('td/p'):
            url1_temp = rows.xpath('td/p').extract()
            count = 0
            for url1_temp in url1_temp:
                item = SpiderItem()
                url_tem = rows.xpath('td/p/a/@href').extract()
                item['url'] = urljoin(response.url, url_tem[count])
                item['publishdate'] = rows.xpath('td/div/a/@title').extract()
                time_temp = rows.xpath('td/p[' + str(count + 1) + ']/text()[2]').extract()
                item['publishtime'] = process_string(time_temp[0].strip().split('[')[0])
                item['Source'] = "[House Committe on Appropriations - Subcommittee on Interior and Environment]"
                item['_type'] = "[Hearings and Markups]"
                item['ekwhere'] = "[Fed]"
                link = 'http://docs.house.gov/Committee/Calendar/' + url_tem[count]
                request = Request(link, callback=self.grab_title)
                request.meta['item'] = item
                yield request
                count = count + 1
                yield item
Example 13: parse
def parse(self, response):
    """Crawl article index pages.

    From the index page, for each article extract its topic first,
    because in this old version there is no text information about the
    topic on the article page. On index pages, it is contained in the alt
    attribute of the article/topic image, but alt is empty on the article page.
    After that, follow the "Read more" link and get the other article
    fields.
    """
    for i, a in enumerate(response.xpath(
            "//div[@class='articletrailer']/descendant::a[@class='trailer'][1]/@href")):
        article = Article()
        # If the image is not the default topic image, it will not have
        # an appropriate selector, so we use its div.
        article["category"] = response.xpath(
            "//div[@class='articleheading']/descendant::img/@alt").extract()[i]
        article_url = response.urljoin(a.extract())
        request = Request(article_url, callback=self.parse_article)
        request.meta["article"] = article
        yield request
Example 14: parse_booklink
def parse_booklink(self, response):
    sel = Selector(response)
    # XPath: select the first <div> with class="p-name"
    sites = sel.xpath('(//div[@class="p-name"])[1]')
    req = []
    for site in sites:
        # These are the hyperlinks to the book detail pages.
        # XPath: select the @href attribute (hyperlink) of each <a> element
        books = site.xpath('a/@href').extract()
        for b in books:
            # Request the page at this url; it shows details of the book, including category info.
            # Encode to keep Chinese characters from being lost.
            url = "http:" + b.encode('utf-8')
            # Build the Request for the URL; the callback is parse_category()
            r = Request(url, callback=self.parse_category, dont_filter=True)
            # The book id is stored as additional data on the request.
            r.meta['bkid'] = response.meta['id']
            req.append(r)
    return req
Example 15: parse
def parse(self, response):
    """
    """
    sel = Selector(response)
    sites = sel.xpath("//div[@class='tabs-container']//*//article//div[@class='description']")
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url))
    rub = u'\u0440\u0443\u0431.'
    items = []
    for site in sites:
        item = RealtyItem()
        price = site.xpath(".//section[@class='d-1']//p[@class='price']//span/text()").extract()[0]
        price = price.replace(rub, '').replace(u' ', '')
        item['price'] = price
        item['floor'] = site.xpath(".//section[@class='d-2 params']//p[@class='row floor']//span[@class='value corporate_red']/text()").extract()[0]
        item['space'] = site.xpath(".//section[@class='d-2 params']//p[@class='row space']//span[@class='value corporate_red']/text()").extract()[0]
        item['url'] = urljoin(domain, site.xpath(".//p[@class='title-obj']/a/@href").extract()[0])
        kitchen = site.xpath(".//section[@class='d-2 params']//p[@class='row kitchen']//span[@class='value corporate_red']/text()").extract()
        if kitchen:
            item['kitchen'] = kitchen[0]
        # item['district'] = request.meta['item']
        request = Request(item['url'], callback=self.parse_page)
        request.meta['item'] = item
        yield request
        items.append(item)