

Python selector.HtmlXPathSelector Method Code Examples

This article collects typical usage examples of Python's scrapy.selector.HtmlXPathSelector, drawn from open-source projects. If you are unsure how HtmlXPathSelector is used in practice, the curated examples below should help; you can also explore other members of the scrapy.selector module from here.


The following presents 12 code examples of selector.HtmlXPathSelector, ordered by popularity.
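Before the project examples, here is a minimal, self-contained sketch of the API. It assumes an old Scrapy release (around 0.24) in which the deprecated HtmlXPathSelector class and the modern response.xpath() shortcut coexist; the class was later removed entirely, and the HTML body here is made up for illustration.

from scrapy.http import HtmlResponse
from scrapy.selector import HtmlXPathSelector

# A made-up response, for illustration only
response = HtmlResponse(
    url='http://example.com',
    body='<html><body><ul><li><a href="/a">First</a></li></ul></body></html>',
)

# Legacy API used throughout the examples below
hxs = HtmlXPathSelector(response)
print(hxs.select('//li/a/text()').extract())      # [u'First']

# Modern equivalent: selectors hang off the response itself
print(response.xpath('//li/a/text()').extract())  # [u'First']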

Example 1: parse

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
            items.append(item)

        return items 
Developer: magic890, Project: tripadvisor-scraper, Lines: 22, Source: dmoz.py
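For comparison, here is a sketch of the same callback written against the modern Selector API (response.xpath), which replaced HtmlXPathSelector; the Website item class is assumed from the original project.

def parse(self, response):
    # Same extraction as above, via the modern response.xpath shortcut
    for site in response.xpath('//ul[@class="directory-url"]/li'):
        item = Website()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re(r'-\s([^\n]*?)\n')
        yield item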

Example 2: parse

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
        self.log("OK,%s" % response.url)
        hxs = HtmlXPathSelector(response)
        # Follow each article link
        divs = hxs.x('//div[@class="publicLeftCon mt10"]')
        for div in divs:
            url = div.x('h5/a/@href').extract()[0]
            yield self.make_requests_from_url(url).replace(callback=self.parse_content)
        # Follow the next-page link (the anchor text 下一頁 means "next page")
        try:
            next_url = \
            hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一頁"]/@href').extract()[0]
        except Exception:
            return
        next_url = 'http://article.yeeyan.org' + next_url
        #  if self.count==10:
        #      return
        #  self.count+=1
        yield self.make_requests_from_url(next_url).replace(callback=self.parse)

    # Parse the article content
Developer: openslack, Project: openslack-crawler, Lines: 23, Source: yeeyan.py
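A note on two legacy idioms above: hxs.x() is an old alias of select() from early Scrapy releases, and make_requests_from_url(...).replace(callback=...) builds a Request and then swaps its callback. In current Scrapy the same loop would be written directly with Request, roughly as sketched here (URLs and XPaths kept from the example):

from scrapy import Request

def parse(self, response):
    # Follow each article link
    for href in response.xpath('//div[@class="publicLeftCon mt10"]/h5/a/@href').extract():
        yield Request(response.urljoin(href), callback=self.parse_content)
    # Follow the next-page link, if any
    next_url = response.xpath(
        u'//div[@id="project_left"]/div[@class="publicMiddleLine"]'
        u'/span/a[b="下一頁"]/@href').extract_first()
    if next_url:
        yield Request('http://article.yeeyan.org' + next_url, callback=self.parse)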

Example 3: parse_content

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_content(self, response):
        hxs = HtmlXPathSelector(response)
        item = YeeyanItem()
        if hxs.x('//a[@class="jx_logo"]/text()'):
            item = self.parse_jx(item, response)
        else:
            item['url'] = response.url
            item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
            div = hxs.x('//div[@class="user_info"]')
            item['author'] = div.x('.//h2/a/text()').extract()[0]
            item['excerpt'] = hxs.x('//p[@class="excerpt"]/text()').extract()
            if item['excerpt']:
                item['excerpt'] = item['excerpt'][0]
            else:
                item['excerpt'] = ''
            item['content_html'] = hxs.x('//div[@id="conBox"]').extract()[0]
            item['release_time'] = div.x('.//p/text()').extract()[0].strip()[1:-7]
            item['category'] = hxs.x('//div[@class="crumb"]/a/text()').extract()[1]
        return item

    # Parse featured ('jx') articles
Developer: openslack, Project: openslack-crawler, Lines: 23, Source: yeeyan.py
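A side note on the excerpt fallback above: modern selectors (Scrapy >= 1.0) offer extract_first(), which returns a default instead of raising IndexError on an empty match, collapsing the if/else into one line. A sketch:

# Modern one-liner for the excerpt fallback above (Scrapy >= 1.0)
item['excerpt'] = response.xpath('//p[@class="excerpt"]/text()').extract_first(default='')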

Example 4: parse

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
        """
        Default parse method; the crawl Rule is not used for now.
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
Developer: openslack, Project: openslack-crawler, Lines: 26, Source: LinkedinSpider.py
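The log.msg calls above use the old scrapy.log module, which was removed in Scrapy 1.0; current spiders log through the built-in self.logger property (or the stdlib logging module). A sketch of the equivalent call:

# Modern Scrapy (>= 1.0): spiders log via the built-in self.logger property
def parse(self, response):
    index_level = self.determine_level(response)
    self.logger.info('Parse: index level: %s', index_level)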

Example 5: parse_page

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_page(response):
        # (presumably decorated as a @staticmethod in the original project,
        # since there is no `self` parameter)
        hxs = HtmlXPathSelector(response)
        item = LotteryticketItem()
        # Draw number
        title = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[1]/text()').extract()[0]
        item['title'] = filter(str.isdigit, "".join(title.split()).encode("utf-8"))
        # Red balls: columns 1-6 of the results table
        for i in range(1, 7):
            red = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[%d]/font/text()' % i).extract()[0]
            item['red%d' % i] = filter(str.isdigit, "".join(red.split()).encode("utf-8"))
        # Blue ball: column 7
        blue = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[7]/font/text()').extract()[0]
        item['blue'] = filter(str.isdigit, "".join(blue.split()).encode("utf-8"))
        # Draw date (first 10 characters)
        created_at = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[2]/text()').extract()[0]
        item['created_at'] = "".join(created_at.split()).encode("utf-8")[0:10]

        return item
Developer: lamjack, Project: LotteryTicket, Lines: 29, Source: LCW_spider.py
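Note that this snippet is Python 2: there, filter(str.isdigit, s) on a byte string returns a string of the digit characters, while on Python 3 filter() returns a lazy iterator. A Python 3 port of the digit-filtering idiom would join explicitly, for example:

# Python 3 equivalent of the digit-filtering idiom used above
title = ' 2015  no. 001 '
digits = ''.join(ch for ch in title if ch.isdigit())
print(digits)  # '2015001'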

Example 6: parse_torrent

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_torrent(self, response):
        x = HtmlXPathSelector(response)

        # NOTE: the original snippet never instantiates `torrent`; a plain
        # dict stands in here for the project's Item class.
        torrent = {}
        torrent['url'] = response.url
        torrent['description'] = x.select("//span[@id='lblDescription']/text()").extract()
        torrent['jurisdictiontype'] = x.select("//span[@id='lblJurisdictionType']").extract()
        torrent['agency'] = x.select("//span[@id='lblUmbrellaAgency']/text()").extract()
        torrent['contactinfo'] = x.select("//span[@id='lblContact']/p/text()").extract()
        torrent['links'] = x.select("//span[@id='lblContacts']/p/a/@href").extract()
        return torrent
Developer: dcondrey, Project: scrapy-spiders, Lines: 12, Source: __init__ copy.py

Example 7: parse

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
        x = HtmlXPathSelector(response)
        links = []
        url = response.url
        music_links = x.select('//ul/li/a/@href').extract()
        music_links = [m for m in music_links if m.endswith(".mid")]
        for l in music_links:
            link = MIDIFile()
            link['url'] = url
            link['ltype'] = self.ltype
            link['link'] = l
            link["file_urls"] = [l]
            links.append(link)
        return links 
Developer: VikParuchuri, Project: evolve-music2, Lines: 16, Source: scrape.py
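The file_urls field above is the convention consumed by Scrapy's FilesPipeline, which downloads every listed URL once the pipeline is enabled. A sketch of the settings, with a made-up storage path (in pre-1.0 Scrapy the class lived at scrapy.contrib.pipeline.files.FilesPipeline):

# settings.py -- FILES_STORE is a made-up example path
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = '/tmp/midi-files'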

Example 8: process_response

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def process_response(self, request, response, spider):
        url = response.url

        if response.status in [301, 307]:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'redirect %d' % response.status

            return self._retry(request, reason, spider) or response
        # handle meta-refresh redirects
        interval, redirect_url = get_meta_refresh(response)

        if redirect_url:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'meta'

            return self._retry(request, reason, spider) or response

        hxs = HtmlXPathSelector(response)
        # test for captcha page
        captcha = hxs.select(
            ".//input[contains(@id, 'captchacharacters')]").extract()

        if captcha:
            log.msg("captcha page %s" % url, level=log.INFO)
            reason = 'captcha'

            return self._retry(request, reason, spider) or response

        return response 
Developer: Karmenzind, Project: fp-server, Lines: 31, Source: middlewares.py
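This retry-on-redirect/captcha logic is a downloader middleware, so it only runs once registered in the project settings; the module path and priority below are assumptions to adjust for the actual project:

# settings.py -- module path and priority are assumptions
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaptchaRetryMiddleware': 550,
}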

Example 9: parse_item

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_item(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        i = DmozItem()
        i['id'] = hxs.select('//input[@id="sid"]/@value').extract()
        i['title'] = hxs.select('//div[@id="name"]').extract()
        i['desc'] = hxs.select('//div[@id="description"]').extract()
        return i 
Developer: openslack, Project: openslack-crawler, Lines: 10, Source: weibo_spider.py

Example 10: parse_jx

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_jx(self, item, response):
        hxs = HtmlXPathSelector(response)
        item['url'] = response.url
        item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
        div = hxs.x('//div[@class="jxar_author"]')
        item['author'] = div.x('.//a/text()').extract()[0]
        item['release_time'] = hxs.x('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
        try:
            item['excerpt'] = hxs.x('//p[@class="jxa_intro"]/text()').extract()[0]
        except Exception:
            item['excerpt'] = None
        item['category'] = hxs.x('//p[@class="jxa_map"]/text()').extract()[1].split()[1]
        item['content_html'] = hxs.x('//div[@class="jxa_content"]').extract()[0]
        return item 
Developer: openslack, Project: openslack-crawler, Lines: 16, Source: yeeyan.py

Example 11: parse_detail

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(
            response_selector.select('//div[@class="zizida"][1]/text()').extract())
        woaidu_item['author'] = [
            list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(), ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select('//div[@class="hong"][1]/img/@src').extract())

        download = []
        for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ])
            )

            download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
            download_item['source_site'] = [
                list_first_item(i.select('./div')[4].select('./a/text()').extract()),
                list_first_item(i.select('./div')[4].select('./a/@href').extract()),
            ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url

        yield woaidu_item 
Developer: openslack, Project: openslack-crawler, Lines: 41, Source: woaidu_detail.py
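The helpers list_first_item, strip_null, and deduplication come from the project's own utilities and are not shown here; plausible stand-in definitions, offered purely as assumptions, would be:

# Assumed shapes of the project helpers (not the originals)
def list_first_item(lst):
    return lst[0] if lst else None

def strip_null(lst):
    return [x for x in lst if x]

def deduplication(lst):
    seen = []
    for x in lst:
        if x not in seen:
            seen.append(x)
    return seen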

Example 12: parse_item

# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_item(self, response):
        self._log_page(response, 'after_login.html')
        hxs = HtmlXPathSelector(response)
        report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
        for report_url in report_urls:
            # print "list:"+report_url
            yield Request(self._ab_path(response, report_url),
                          headers=self.headers,
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=self.parse_report)
Developer: openslack, Project: openslack-crawler, Lines: 13, Source: AmazonSpider.py
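The meta={'cookiejar': ...} key is how Scrapy's CookiesMiddleware keeps multiple independent cookie sessions: requests carrying the same cookiejar value share cookies, and the key must be propagated on every follow-up request, as the example does. A sketch of starting such a session (self.login_url is a placeholder):

from scrapy import Request

def start_requests(self):
    # Each distinct 'cookiejar' value keeps an independent cookie session
    yield Request(self.login_url, meta={'cookiejar': 1}, callback=self.parse_item)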


Note: the scrapy.selector.HtmlXPathSelector examples in this article were compiled from open-source code and documentation hosted on platforms such as GitHub and MSDocs. Copyright in each snippet remains with its original author; consult the corresponding project's license before reusing or distributing the code, and do not republish without permission.