

Python scrapy.Request Class Code Examples

This page collects typical usage examples of the scrapy.Request class in Python. If you are wondering what the Request class does, how to use it, or what working examples look like, the hand-picked code samples below may help.


The 15 Request class code examples shown below are sorted by popularity by default.
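Most of the examples that follow share the same basic pattern: build a Request with a URL and a callback, attach extra data through request.meta, and read that data back inside the callback. Here is a minimal, self-contained sketch of that pattern; the spider name, start URL, CSS selectors, and item fields are illustrative assumptions rather than code taken from any of the projects below.

import scrapy
from scrapy import Request


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, used only to illustrate the Request/meta/callback pattern.
    name = 'example'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        for href in response.css('a.item::attr(href)').extract():
            # Build a follow-up request and pass partially filled data via meta,
            # so the detail callback can keep adding fields to the same record.
            request = Request(response.urljoin(href), callback=self.parse_detail)
            request.meta['item'] = {'list_url': response.url}
            yield request

    def parse_detail(self, response):
        item = response.meta['item']
        item['title'] = response.css('h1::text').extract_first()
        yield item

In recent Scrapy versions the same data can also be passed through the cb_kwargs argument instead of meta, but meta is what all of the examples on this page use.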

Example 1: form_part_request

 def form_part_request(url, callback, part):
     request = Request(
         url=url,
         callback=callback
     )
     request.meta['part'] = part
     return request
Author: DenisMalofeyev, Project: tecdoc-parser, Lines of code: 7, Source file: exist_spider.py

Example 2: parse

    def parse(self, response):
        # Throttle pagination: pause for 25 seconds every 2 pages
        self.pageNo += 1
        if self.pageNo % 2 == 0:
            time.sleep(25)
            
        select = Selector(response)
        if not "shopDetail" in response.meta:
            # 店铺列表页
            allNo = self.questionIdPatten.findall(response.url)
            cityId = allNo[0]  # cityid
            pageNumber = allNo[-1]
            
            # Log the page that was crawled
            self.fw.write("%s cityId:%s, pageNumber:%s\n" % (response.url, cityId, pageNumber))
            self.fw.flush()
            
            item = DianpingItem()
            item["city_id"] = cityId
            try:
                cityName = select.css(".city").xpath("./text()").extract()[0]
            except Exception as e:
                cityName = ""
                print(e)
                
#             self.fw.write("%s\t%s\n"%(cityId, cityName))
#             self.fw.flush()

            yieldPageFlag = False
            shop_list = select.xpath(".//div[@class='info']")
            for li in shop_list:
                yieldPageFlag = True
                
                item["shop_name"] = li.xpath(".//p[@class='title']/a/text()").extract()[0]
                item["shop_cityname"] = cityName  # 地区
                # domain,当做标签,非区域,抓取区域指地区
                item["shop_domain"] = ",".join(li.xpath(".//p[@class='area-key']/span[@class='area-list']/a/text()").extract())
                key_list = ",".join(li.xpath(".//p[@class='area-key']/span[@class='key-list']/a/text()").extract())
                
                item["shop_tag"] = ",".join([key_list, item["shop_domain"]])  # 标签包含区域
                
                # href = '/shop/123456'
                href = li.xpath(".//p[@class='title']/a[@class='shopname']/@href").extract()[0]
                item["shop_id"] = href.split("/")[-1]
                
                shopUrl = "http://www.dianping.com" + href
                request = Request(shopUrl, callback=self.parse, priority=1234567)  # shop detail request
                request.meta["shopDetail"] = copy.deepcopy(item)
                yield request
                
                pass
            
            if yieldPageFlag:
                # If the current page had results, request the next page
                nextPageNumber = int(pageNumber) + 1
                
                url = self.pageUrl % (cityId, nextPageNumber)
                request = Request(url, callback=self.parse, priority=1234)
                yield request
            pass
Author: junfeng-feng, Project: Spider, Lines of code: 60, Source file: dianpingSpider.py

Example 3: parse_hiker_info

 def parse_hiker_info(self, response):
     # TODO: Somehow obtain the Hiker's direction 'dir'.
     # TODO: Somehow obtain the Hiker's trail start date 'start_date'
     # TODO: Somehow obtain the Hiker's trail estimated end date 'end_date'
     print("Response received: %s" % response)
     print("Parsing Hiker Info from response: %s" % response)
     hiker = HikerItem()
     hiker['id'] = self.extract_hiker_id(response=response)
     hiker_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[2]"
     hiker_name = Selector(response=response).xpath(hiker_name_xpath).extract()[0]
     hiker_name_start = str.find(hiker_name, "-", 0, len(hiker_name))
     hiker_name_end = str.find(hiker_name, "<", hiker_name_start, len(hiker_name))
     hiker_name = hiker_name[hiker_name_start + 1:hiker_name_end]
     hiker_name = str.strip(hiker_name, " ")
     hiker['name'] = hiker_name
     hiker_trail_name_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[2]/td//font[1]/b"
     hiker_trail_name = Selector(response=response).xpath(hiker_trail_name_xpath).extract()[0]
     hiker_trail_name_start = str.find(hiker_trail_name, ">", 0, len(hiker_trail_name))
     hiker_trail_name_end = str.find(hiker_trail_name, "<", hiker_trail_name_start, len(hiker_trail_name))
     hiker_trail_name = hiker_trail_name[hiker_trail_name_start + 1:hiker_trail_name_end]
     hiker['trail_name'] = hiker_trail_name
     hiker['about_url'] = response.url
     # TODO: Verify that the 'journal_url' is the FIRST journal entry.
     hiker['journal_url'] = str.replace(response.url, "about", "entry")
     journal_parse_request = Request(hiker['journal_url'], callback=self.parse_hiker_journal)
     journal_parse_request.meta['hiker'] = hiker
     yield journal_parse_request
Author: ccampell, Project: ATS, Lines of code: 27, Source file: ScrapyWebScraper2.py

Example 4: test_hs_middlewares_retry

def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None

    hs_downloader_middleware.process_request(request_1, spider)

    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    hs_downloader_middleware.process_response(request_1, response_1, spider)

    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
Author: scrapinghub, Project: scrapinghub-entrypoint-scrapy, Lines of code: 33, Source file: test_middlewares.py

Example 5: start_requests

 def start_requests(self):
     kwargs = {
         'debug': self.settings.getbool('GIANT_DEBUG'),
         'limit': self.settings.getint('GIANT_LIMIT'),
         'opt': 'twse'
     }
     requests = []
     for stockid in TwseIdDBHandler().stock.get_ids(**kwargs):
         for mon in range(4, -1, -1):
             timestamp = datetime.utcnow() - relativedelta(months=mon)
             if mon == 0:
                 if timestamp.day == 1 and timestamp.hour <= 14:
                     continue
             URL = (
                 'http://www.twse.com.tw/ch/trading/exchange/' +
                 'STOCK_DAY/STOCK_DAY_print.php?genpage=genpage/' +
                 'Report%(year)d%(mon)02d/%(year)d%(mon)02d_F3_1_8_%(stock)s.php' +
                 '&type=csv') % {
                     'year': timestamp.year,
                     'mon': timestamp.month,
                     'stock': stockid
             }
             request = Request(
                 URL,
                 callback=self.parse,
                 dont_filter=True)
             item = TwseHisStockItem()
             item['stockid'] = stockid
             request.meta['item'] = item
             requests.append(request)
     return requests
Author: KKJgit, Project: scrapy_giant, Lines of code: 31, Source file: twsehisstock_spider.py

Example 6: parse_depute

    def parse_depute(self, response):
        depute = json.loads(response.body_as_unicode())
        if 'depute' in depute:
            depute = depute['depute']

        depute['photo_url'] = self.photo_url % depute['slug']

        req = None

        for ad in depute['adresses']:
            adresse = ad['adresse']

            pattern = r'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)'
            for telm in re.finditer(pattern, adresse):
                if telm.group(1) == 'phone':
                    ad['tel'] = telm.group(2)
                else:
                    ad['fax'] = telm.group(2)

            lad = adresse.lower()
            if not req and not lad.startswith(u'assemblée nationale'):
                # Geocode the address with the phone/fax numbers stripped out
                trimmed = re.sub(pattern, '', adresse)
                req = Request(url=self.get_geocode_url(trimmed),
                              callback=self.parse_geocode)

                req.meta['depute'] = depute
                req.meta['adresse'] = ad

        if req is not None:
            yield req
        else:
            yield depute
Author: briatte, Project: FranceData, Lines of code: 32, Source file: depute.py

Example 7: parse

    def parse(self, response):
        for href in response.xpath('//div[contains(@id, "dnn_ctr430_ExbList_pnlList")]//ul//li//a/@href'):
            url = response.urljoin(href.extract())

            request = Request(url, callback=self.parse_exhibition)
            request.meta['dont_redirect'] = True
            yield request
Author: coreymcdermott, Project: artbot, Lines of code: 7, Source file: m2_spider.py

Example 8: parse

    def parse(self, response):
        'Fetch the shop detail pages of a plaza'
        req = []
        plazaId = response.url.split('/')[-1]
        sel = Selector(response)
        gouwu = sel.xpath('//*[@class="hot-top fn-clear"]/div')
        i = 1
        for gouwushop in gouwu:
            shopsurl = 'http://www.dianping.com' + gouwushop.xpath('a[1]/@href').extract()[0].strip()
            shopImg = gouwushop.xpath('a[1]/img/@src').extract()
            item = PlazaShop()
            item['plazaId'] = plazaId
            if i <= 4:
                item['shopCatetory1'] = '购物'  # shopping
            else:
                item['shopCatetory1'] = '餐饮'  # dining
            item['shopUrl'] = shopsurl
            item['image_urls'] = shopImg
            r = Request(shopsurl, callback=self.shop_detail)
            r.meta['item'] = item
            i = i + 1
            req.append(r)

        return req
Author: zeliu, Project: scrapy_spider, Lines of code: 25, Source file: plaza_dp_shop_spider2.py

Example 9: parse_job_list_page

    def parse_job_list_page(self, response):
        self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)

        feed_parser = feedparser.parse(response.body)
        for job_entry in feed_parser.entries:
            job_url = job_entry.link
            job_publication_date = datetime.fromtimestamp(mktime(job_entry.published_parsed))

            job_publication_time = mktime(job_publication_date.timetuple())
            last_job_publication_time = mktime(self._last_job_date.timetuple())
            if job_publication_time <= last_job_publication_time:
                self.get_connector().log(self.name,
                                         self.ACTION_MARKER_FOUND,
                                         "%s <= %s" % (job_publication_time, last_job_publication_time))
                return

            prepared_job = JobItem()
            request = Request(job_url, self.parse_job_page)
            request.meta['item'] = prepared_job

            prepared_job['title'] = job_entry.title
            prepared_job['description'] = job_entry.description
            prepared_job['publication_datetime'] = job_publication_date

            yield request
Author: algoo, Project: crawlers, Lines of code: 25, Source file: remixjobs.py

Example 10: parse

    def parse(self, response):
        """Parse a APS record into a HEP record.

        Attempts to parse an XML JATS full text first, if available, and falls
        back to parsing JSON if such is not available.
        """
        aps_response = json.loads(response.body_as_unicode())

        for article in aps_response['data']:
            doi = get_value(article, 'identifiers.doi', default='')

            if doi:
                request = Request(url='{}/{}'.format(self.aps_base_url, doi),
                                  headers={'Accept': 'text/xml'},
                                  callback=self._parse_jats,
                                  errback=self._parse_json_on_failure)
                request.meta['json_article'] = article
                request.meta['original_response'] = response
                yield request

        # Pagination support. Will yield until no more "next" pages are found
        if 'Link' in response.headers:
            links = link_header.parse(response.headers['Link'])
            next = links.links_by_attr_pairs([('rel', 'next')])
            if next:
                next_url = next[0].href
                yield Request(next_url)
Author: drjova, Project: hepcrawl, Lines of code: 27, Source file: aps_spider.py

Example 11: parse_job_list_page

    def parse_job_list_page(self, response):
        """
        Parsing of the job list
        """
        self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)

        try:
            for jobs in self._get_from_list__jobs_lists(response):
                for job in self._get_from_list__jobs(jobs):
                    # first we check url. If the job exists, then skip crawling
                    # (it means that the page has already been crawled
                    try:
                        url = self._get_from_list__url(job)
                    except NotCrawlable:
                        break

                    if self.get_connector().job_exist(url):
                        self.get_connector().log(self.name, self.ACTION_MARKER_FOUND, url)
                        raise StopCrawlJobList()

                    request = Request(url, self.parse_job_page)
                    prefilled_job_item = self._get_prefilled_job_item(job, url)
                    request.meta['item'] = prefilled_job_item

                    if self.is_from_page_enabled():
                        yield request
                    else:
                        yield prefilled_job_item

            next_page_url = self._get_from_list__next_page(response)
            if next_page_url:
                yield Request(url=next_page_url)
        except NotFound as exc:
            self.get_connector().log(self.name, self.ACTION_CRAWL_ERROR, str(exc))
Author: algoo, Project: crawlers, Lines of code: 34, Source file: __init__.py

Example 12: parse_data

    def parse_data(self, response):

        rows = response.selector.xpath(
            '//*[@id="container-outer"]/div[1]/div[3]/div/div/div[2]/table/tbody/tr')
        for row in rows:

            if row.xpath('td/p'):
                paragraphs = row.xpath('td/p').extract()
                count = 0
                for _ in paragraphs:
                    item = SpiderItem()
                    url_tem = row.xpath('td/p/a/@href').extract()
                    item['url'] = urljoin(response.url, url_tem[count])
                    item['publishdate'] = row.xpath('td/div/a/@title').extract()
                    time_temp = row.xpath('td/p[' + str(count + 1) + ']/text()[2]').extract()
                    item['publishtime'] = process_string(time_temp[0].strip().split('[')[0])
                    item['Source'] = "[House Committee on Appropriations - Subcommittee on Interior and Environment]"
                    item['_type'] = "[Hearings and Markups]"
                    item['ekwhere'] = "[Fed]"
                    link = 'http://docs.house.gov/Committee/Calendar/' + url_tem[count]
                    request = Request(link, callback=self.grab_title)
                    request.meta['item'] = item
                    yield request
                    count = count + 1
                    yield item
Author: shaharyarrrr, Project: scrapy, Lines of code: 25, Source file: MainSpider.py

Example 13: parse

    def parse(self, response):
        """Crawl article index pages.

        From the index page, for each article extract its topic first
        because in this old version, there is no text information about the
        topic in the article page.  On index pages, it's contained in the alt
        attribute of article/topic image, but alt is empty on the article page.

        After that, follow the "Read more" link and get the other article
        fields.
        
        """
        for i, a in enumerate(response.xpath(
                "//div[@class='articletrailer']/descendant::a[@class='trailer'][1]/@href")):
            article = Article()

            # If the image is not the default topic image, it will not have
            # an appropriate selector, so we use its div.
            article["category"] = response.xpath(
                    "//div[@class='articleheading']/descendant::img/@alt").extract()[i]

            article_url = response.urljoin(a.extract())
            request = Request(article_url, callback=self.parse_article)
            request.meta["article"] = article

            yield request
Author: nzp, Project: tikiart-scraper, Lines of code: 26, Source file: article_spider.py

Example 14: parse_booklink

    def parse_booklink(self, response):
        sel = Selector(response)

        # XPath selects the content of the first <div></div> with class="p-name"
        sites = sel.xpath('(//div[@class="p-name"])[1]')
        req = []

        for site in sites:

            # This is the hyperlink to the book's detail page.
            # XPath selects the @href attribute (hyperlink) inside <a></a>
            books = site.xpath('a/@href').extract()

            for b in books:
                # Request the detail page of the book, which includes its category info.
                # encode() keeps Chinese characters from being lost.
                url = "http:" + b.encode('utf-8')

                # Build the Request for the URL; the callback is parse_category()
                r = Request(url, callback=self.parse_category, dont_filter=True)

                # The book id is passed along as extra data on the request
                r.meta['bkid'] = response.meta['id']
                req.append(r)
        return req
Author: wf94, Project: BookSpider, Lines of code: 26, Source file: search.py

Example 15: parse

    def parse(self, response):
        """
        """
        sel = Selector(response)
        sites = sel.xpath("//div[@class='tabs-container']//*//article//div[@class='description']")
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(response.url))
        rub = u'\u0440\u0443\u0431.'
        items = []
        for site in sites:
            item = RealtyItem()
            price = site.xpath(".//section[@class='d-1']//p[@class='price']//span/text()").extract()[0]
            price = price.replace(rub, '').replace(u' ', '')
            item['price'] = price
            item['floor'] = site.xpath(".//section[@class='d-2 params']//p[@class='row floor']//span[@class='value corporate_red']/text()").extract()[0]
            item['space'] = site.xpath(".//section[@class='d-2 params']//p[@class='row space']//span[@class='value corporate_red']/text()").extract()[0]
            item['url'] = urljoin(domain, site.xpath(".//p[@class='title-obj']/a/@href").extract()[0])
            kitchen = site.xpath(".//section[@class='d-2 params']//p[@class='row kitchen']//span[@class='value corporate_red']/text()").extract()
            if kitchen:
                item['kitchen'] = kitchen[0]
                # item['district'] = request.meta['item']
            request = Request(item['url'], callback=self.parse_page)
            request.meta['item'] = item
            yield request

            items.append(item)
Author: gnoul, Project: scrapy_realty, Lines of code: 25, Source file: sprealty.py


Note: The scrapy.Request class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For distribution and reuse, please refer to the license of the corresponding project. Do not reproduce without permission.