

Python scrapy.Selector Method Code Examples

This article collects typical usage examples of the scrapy.Selector method in Python. If you are wondering what scrapy.Selector does, how to call it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the scrapy module it belongs to.


The following presents 15 code examples of the scrapy.Selector method, sorted by popularity by default.
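Before diving in, here is a minimal, self-contained sketch of the two usual ways to construct a Selector (the HTML string and variable names are illustrative, not taken from any example below):

from scrapy import Selector

html = '<html><body><p class="msg">hello</p></body></html>'

# Build a Selector from a raw HTML string...
sel = Selector(text=html)
print(sel.css('p.msg::text').extract_first())                 # hello
print(sel.xpath('//p[@class="msg"]/text()').extract_first())  # hello

# ...or, inside a spider callback, from the response object,
# which is what most of the examples below do:
# sel = Selector(response)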

Example 1: parse_page

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tr[@class="cells"]').extract()
        for i, info in enumerate(infos):
            self.log(info)
            # Re-parse the extracted row HTML so the XPaths below are
            # evaluated against this row only
            val = Selector(text=info)

            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[5]/text()').extract_first()
            anonymity = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip=ip,
                    port=port,
                    country=country,
                    anonymity=anonymity,
                    source=self.name,
            )

            self.add_proxy(proxy=proxy) 
Developer: awolfly9, Project: IPProxyTool, Lines: 26, Source: proxylistplus.py
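Examples 1-3 and 15 all follow the same pattern: extract() each table row back to an HTML string, then rebuild a Selector(text=info) so the row can be queried in isolation. Scrapy also lets you iterate the selector objects directly and use relative XPaths, which skips the re-parsing step; a minimal sketch of the equivalent loop, assuming the same table layout as Example 1:

for row in sel.xpath('//tr[@class="cells"]'):
    # XPaths without a leading // are relative to this row
    ip = row.xpath('td[2]/text()').extract_first()
    port = row.xpath('td[3]/text()').extract_first()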

Example 2: parse_page

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text=info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[3]/div/text()').extract_first()
            anonymity = val.xpath('//td[6]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip=ip,
                    port=port,
                    country=country,
                    anonymity=anonymity,
                    source=self.name,
            )

            self.add_proxy(proxy=proxy) 
Developer: awolfly9, Project: IPProxyTool, Lines: 27, Source: hidemy.py

Example 3: parse_page

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//ul[@class="l2"]').extract()
        for i, info in enumerate(infos):
            val = Selector(text=info)
            ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
            port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
            anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
            https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
            country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip=ip,
                    port=port,
                    country=country,
                    anonymity=anonymity,
                    source=self.name,
            )

            self.add_proxy(proxy=proxy) 
Developer: awolfly9, Project: IPProxyTool, Lines: 25, Source: data5u.py

Example 4: _parse_general_post

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_general_post(self, post, response):
        """TODO: Docstring for _parse_general_post.

        :post: TODO
        :response: TODO
        :returns: TODO

        """
        item = Reply()
        # Join the post body's text fragments into a single string
        item['body'] = ''.join(post.css('cc div::text').extract()).strip()
        item['title'] = Selector(response).css('.core_title_txt::text').extract_first()
        item['post_time'] = json.loads(
            post
            .css('::attr(data-field)')
            .extract_first()
        )['content']['date']

        return item 
Developer: jingzhou123, Project: tieba-crawler, Lines: 21, Source: reply.py
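The json.loads(post.css('::attr(data-field)').extract_first()) step in Example 4 reads structured JSON straight out of an HTML attribute, a common pattern on pages that embed data for their own JavaScript. A standalone sketch with made-up markup:

import json
from scrapy import Selector

html = '<div class="l_post" data-field=\'{"content": {"date": "2015-01-01"}}\'></div>'
# ::attr(data-field) returns the raw attribute string, which json.loads parses
field = json.loads(Selector(text=html).css('div::attr(data-field)').extract_first())
print(field['content']['date'])  # 2015-01-01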

Example 5: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
        """TODO: Docstring for parse.
        :returns: TODO

        """
        posts = Selector(response).css('.p_postlist .l_post')

        for i, post in enumerate(posts):
            if i == 0:
                yield self._parse_main_post(post, response)
            else:
                item = self._parse_reply(post, response)
                yield item

                if item['reply_num'] != 0:  # comment count
                    self._parse_comments(post) 
Developer: jingzhou123, Project: tieba-crawler, Lines: 18, Source: reply.py

Example 6: _parse_user_id

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_user_id(self, response):
        """TODO: Docstring for _parse_user_id.

        :response: TODO
        :returns: 32-digit hex user id

        """

        uri = Selector(response).css('.concern_num a::attr(href)').extract_first()
        logging.debug('user id href: %s' % (uri))
        if uri:
            query_dict = parse_qs(urlparse(uri).query)
            # the uri may look like this: /home/concern?id=a3e3474fbda1bfb5bfecc0d6d121?t=1423636759&fr=home
            return query_dict['id'][0]
        else:
            return '' 
Developer: jingzhou123, Project: tieba-crawler, Lines: 18, Source: user.py
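For reference, parse_qs in Example 6 splits a query string into a dict of lists, which is why the code indexes ['id'][0]. A quick illustration on a URI of the same shape (the id value is copied from the comment above; the stray second '?' is replaced by a well-formed '&' for illustration):

from urllib.parse import urlparse, parse_qs  # Python 3; Python 2 imports these from urlparse

uri = '/home/concern?id=a3e3474fbda1bfb5bfecc0d6d121&t=1423636759&fr=home'
query_dict = parse_qs(urlparse(uri).query)
print(query_dict['id'][0])  # a3e3474fbda1bfb5bfecc0d6d121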

Example 7: _parse_following_and_followed

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_following_and_followed(self, response, item):
        """TODO: Docstring for _parse_following_and_followed.

        :response: TODO
        :item: item.following_num item.followed_num
        :returns: TODO

        """
        sels = Selector(response).css('.ihome_aside_title')
        # Some users follow no one or have no followers; default both counts to 0
        item['following_num'] = 0
        item['followed_num'] = 0
        for sel in sels:
            # The first text node is a section label such as '他关注的人'
            # ("people he follows") or other irrelevant text
            title = sel.css('::text').extract_first().strip()
            #logging.debug('title: %s' % (title))
            if title == '他关注的人' or title == '她关注的人':
                item['following_num'] = sel.css('a::text').extract_first()
            elif title == '关注他的人' or title == '关注她的人':
                item['followed_num'] = sel.css('a::text').extract_first()

        return item 
Developer: jingzhou123, Project: tieba-crawler, Lines: 25, Source: user.py

Example 8: _get_next_page

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _get_next_page(self, response):
        """TODO: Docstring for _parse_next_page.

        :response: TODO
        :returns: TODO

        """
        #logging.debug('beginning parsing next page if existed..')
        meta = response.meta
        anchor_sels = Selector(response).css('.j_pager a')
        next_page = 1
        #logging.debug('anchor selectors: %r' % (anchor_sels))
        for sel in anchor_sels:
            #logging.debug('pager anchor text: ' % (sel.css('::text').extract_first()))
            if sel.css('::text').extract_first() == '下一页':  # '下一页' means "next page"
                next_page = sel.css('::attr(href)').extract_first()[1:]
                logging.debug('next page num: %s' % (next_page))

        return int(next_page) 
Developer: jingzhou123, Project: tieba-crawler, Lines: 21, Source: comment.py

Example 9: get_help

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def get_help(self):
        b = []
        b.append("Available Scrapy objects:")
        b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append("  %-10s %s" % (k, v))
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append("  fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects "
                     "(by default, redirects are followed)")
            b.append("  fetch(req)                  "
                     "Fetch a scrapy.Request and update local objects ")
        b.append("  shelp()           Shell help (print this help)")
        b.append("  view(response)    View response in a browser")

        return "\n".join("[s] %s" % l for l in b) 
Developer: wistbean, Project: learn_python3_spider, Lines: 20, Source: shell.py

Example 10: download_sp500_price

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def download_sp500_price(self, response):
        trs = response.xpath('//*[@id="datatable"]/tr').extract()

        price_jsons = []

        try:
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                price_jsons.append({"timestamp": to_time_str(tds[0]),
                                    "close": to_float(tds[1])})

            if price_jsons:
                self.df_close = self.df_close.append(price_jsons, ignore_index=True)
                self.df_close = index_df_with_time(self.df_close)
        except Exception as e:
            self.logger.exception('error when getting sp500 price url={} error={}'.format(response.url, e)) 
Developer: foolcage, Project: fooltrader, Lines: 20, Source: sp500_spider.py

Example 11: parse_user_0

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_user_0(self, response):
        """ 抓取个人信息-第一部分:微博数、关注数、粉丝数 """
        user_item = UserItem()
        selector = Selector(response)
        text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
        if text0:
            num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)  # weibo (post) count
            num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # following count
            num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)  # follower count
            if num_tweets:
                user_item["ctweets"] = int(num_tweets[0])
            if num_follows:
                user_item["cfollows"] = int(num_follows[0])
            if num_fans:
                user_item["cfans"] = int(num_fans[0])
            user_item["_id"] = response.meta["user_id"]
            url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
            yield Request(url=url_information1, meta={"item": user_item}, callback=self.parse_user_1) 
Developer: wen-fei, Project: SinaWeiboSpider, Lines: 20, Source: spider.py

Example 12: parse_user_1

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_user_1(self, response):
        """ 抓取个人信息2 """
        user_item = response.meta["item"]
        selector = Selector(response)
        text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())  # 获取标签里的所有text()

        nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)  # 昵称
        intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)  # 简介
        auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)  # 认证信息

        gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)  # 性别
        place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)  # 地区(包括省份和城市)
        birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)  # 生日
        sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # 性取向
        marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # 婚姻状况
        url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)  # 首页链接

        if nickname:
            user_item["nickname"] = nickname[0]
        if auth:
            user_item["auth"] = auth[0]
        if intro:
            user_item["intro"] = intro[0]
        user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        yield user_item 
Developer: wen-fei, Project: SinaWeiboSpider, Lines: 27, Source: spider.py
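The \uXXXX escapes in Examples 11 and 12 are simply the Chinese field labels written as unicode escapes; u'\u6635\u79f0' is '昵称' ("nickname"), and the class [:|\uff1a] matches an ASCII colon, a full-width colon '：', or, incidentally, a literal '|' (inside a character class, '|' is not alternation). A standalone sketch of the same extraction on a made-up profile string:

import re

text1 = u'\u6635\u79f0:tester;\u6027\u522b:\u7537;'  # i.e. '昵称:tester;性别:男;' (made-up values)
nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)
print(nickname[0])  # tester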

Example 13: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
        print('url:', response.url)

        body = response.body.replace(b'<<+', b'&lt;&lt;+').replace(b'<+', b'&lt;+')

        selector = scrapy.Selector(text=body.decode('utf-8'))

        i = 1
        for x in selector.css('.elem::text').extract():
            if 'Elements' in x:
                print('---', i, '---')
                i += 1
            else:
                print(x)

# --- it runs without a project and saves the output in `output.csv` --- 
Developer: furas, Project: python-examples, Lines: 18, Source: main.py
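A note on the two replace() calls in Example 13: sequences like '<+' and '<<+' look like the start of a malformed tag, presumably causing a recovering HTML parser to mangle or drop the text that follows them, so the spider escapes the '<' to '&lt;' entities before handing the body to scrapy.Selector; the CSS query then sees them as ordinary text.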

Example 14: parse

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
        item = DmozItem()
        sel = scrapy.Selector(response)
        conn = pymssql.connect(host="121.42.136.4", user="sa", password="koala19920716!@#", database="test")
        cursor = conn.cursor()
        sites = sel.xpath("//dl[@id='clist']/dd/a/text()").extract()
        item['title'] = [n.encode('utf-8') for n in sites]
        yield item
        # sql = "select ID,CityName from Cities"
        # cursor.execute(sql)
        # for (ID,CityName) in cursor.fetchall():
        #     print ID
        for name in item['title']:
            # print name
            # Parameterized query: the driver handles quoting, so names
            # containing quotes cannot break (or inject into) the statement
            cursor.execute("INSERT INTO Cities (CityName) VALUES (%s)", (name,))
            conn.commit() 
Developer: openslack, Project: openslack-crawler, Lines: 19, Source: city.py
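Design note: Example 14 is the only one here that writes scraped data to a database directly inside the parse callback. In a typical Scrapy project that persistence step lives in an item pipeline, so the spider stays limited to extraction; the yield item line already hands the data to any configured pipelines.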

Example 15: parse_page

# Required import: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text=info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[6]/text()').extract_first()
            anonymity = val.xpath('//td[3]/text()').extract_first()
            https = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip=ip,
                    port=port,
                    country=country,
                    anonymity=anonymity,
                    source=self.name,
            )

            self.add_proxy(proxy=proxy) 
Developer: awolfly9, Project: IPProxyTool, Lines: 28, Source: ip181.py


Note: The scrapy.Selector method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Refer to the corresponding project's license before distributing or using the code; do not reproduce this article without permission.