当前位置: 首页>>代码示例>>Python>>正文


Python selector.Selector方法代码示例

本文整理汇总了Python中scrapy.selector.Selector方法的典型用法代码示例。如果您正苦于以下问题:Python selector.Selector方法的具体用法?Python selector.Selector怎么用?Python selector.Selector使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scrapy.selector的用法示例。


在下文中一共展示了selector.Selector方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse_page

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_page(self, response):
        """Collect proxy entries from the ``odd``-class rows of a listing page.

        Each matching ``<tr>`` is serialised and re-parsed on its own, its
        cells are read by position, and the result is handed to
        ``self.add_proxy`` as a ``Proxy`` whose source is ``self.name``.
        """
        # Field name -> XPath of the cell holding it (positions are 1-based).
        cell_paths = (
            ('ip', '//td[2]/text()'),
            ('port', '//td[3]/text()'),
            ('country', '//td[4]/a/text()'),
            ('anonymity', '//td[5]/text()'),
        )

        page = Selector(text=response.body)
        for row_html in page.xpath('//tr[@class="odd"]').extract():
            row = Selector(text=row_html)
            # extract_first() leaves a field as None when its cell is absent.
            fields = dict(
                (name, row.xpath(path).extract_first())
                for name, path in cell_paths
            )

            proxy = Proxy()
            proxy.set_value(source=self.name, **fields)
            self.add_proxy(proxy=proxy)
开发者ID:awolfly9,项目名称:IPProxyTool,代码行数:22,代码来源:xicidaili.py

示例2: parse_1

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_1(self, response):
        """Yield a download request for every PDF linked from a result list.

        The page is parsed with ``self.parse_with_rules`` using
        ``self.list_css_rules``. Entries of the first parsed block's
        ``'.gs_r'`` list are kept when their ``'related-type'`` is
        ``'[PDF]'`` and their ``'related-url'`` is non-empty; each kept
        entry becomes a ``Request`` handled by ``self.save_pdf``.
        """
        info('Parse '+response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        items = []
        if len(x) > 0:
            items = x[0]['.gs_r']
            pp.pprint(items)

        for item in items:
            # Only entries that advertise a direct PDF link are downloadable.
            if item['related-url'] == '' or item['related-type'] != '[PDF]':
                continue
            url = item['related-url']
            info('pdf-url: ' + url)
            yield Request(url, callback=self.save_pdf)
开发者ID:geekan,项目名称:google-scholar-crawler,代码行数:21,代码来源:spider.py

示例3: parse_item

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_item(self, response):
        """Build a ``DoubanmovieItem`` from a movie detail page.

        Every field is the first text node matched by its XPath. A field
        whose XPath matches nothing raises ``IndexError``.
        """
        # Item key -> XPath of the text node that fills it.
        field_paths = (
            ('title', '//*[@id="content"]/h1/span[1]/text()'),
            ('date', '//*[@id="content"]/h1/span[2]/text()'),
            ('star', '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'),
            ('commit_num',
             '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()'),
            ('director', '//*[@id="info"]/span[1]/span[2]/a/text()'),
            ('screenwriter', '//*[@id="info"]/span[2]/span[2]/a/text()'),
        )

        page = Selector(response)
        movie = DoubanmovieItem()
        for key, path in field_paths:
            movie[key] = page.xpath(path).extract()[0]
        return movie
开发者ID:WiseDoge,项目名称:crawler_examples,代码行数:25,代码来源:movie_spider.py

示例4: google_parse

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def google_parse(html):
    """Extract search results from the HTML of a Google results page.

    :param html: page markup as text
    :returns: list of dicts with the keys ``'url'``, ``'content'`` and
        ``'title'``, one per ``div.g`` block that carries a usable link

    Blocks without a link, or whose link contains no ``http`` substring,
    are skipped. Any other error propagates to the caller instead of being
    silently discarded.
    """
    page = Selector(text=html)
    rs = []
    for ans in page.css('div.g'):
        title = ''.join(ans.css('h3').css('*::text').extract())
        content = ''.join(ans.css('span.st').css('*::text').extract())
        hrefs = ans.css('*.r a::attr(href)').extract()
        if not hrefs:
            continue
        # The href may wrap the target URL; keep everything from "http" on.
        target = re.search('http.*', hrefs[0])
        if target is None:
            continue
        rs.append({
            # Cut the tracking parameters that follow the first "&".
            'url': re.sub('&.*', '', target.group()),
            'content': content,
            'title': title,
        })
    return rs


# url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
# html = getHTMLText(url)
# print(baidu_parse(html)) 
开发者ID:AMinerOpen,项目名称:prediction_api,代码行数:25,代码来源:crawler.py

示例5: parse_ph_key

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_ph_key(self, response):
        """Yield embed-page requests for a listing page, then its next page.

        For every ``div.phimage`` the ``viewkey`` value is read from the
        serialised markup and requested as an embed URL handled by
        ``self.parse_ph_info``. If a "Next" button is present, the listing
        it points to (relative to ``self.host``) is requested with this
        method as callback.
        """
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        for div in selector.xpath('//div[@class="phimage"]'):
            viewkey = re.search('viewkey=(.*?)"', div.extract())
            if viewkey is None:
                # Without a view key no embed URL can be built. Skipping the
                # thumbnail keeps the rest of the page and the pagination
                # request alive instead of dying on an IndexError.
                continue
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey.group(1),
                          callback=self.parse_ph_info)

        url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
        if url_next:
            logging.debug(' next page:---------->' + self.host+url_next[0])
            yield Request(url=self.host+url_next[0], callback=self.parse_ph_key)
开发者ID:ceres993434,项目名称:PornHubBot,代码行数:18,代码来源:pornHubSpider.py

示例6: parse_ph_info

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_ph_info(self, response):
        """Yield a ``PornVideoItem`` filled from the page's ``flashvars_*`` JSON.

        The first ``flashvars_... = {...};`` assignment found in the page is
        decoded as JSON, and selected keys are copied onto the item. Keys
        missing from the JSON are stored as ``None``. Nothing is yielded when
        the page contains no such assignment.
        """
        phItem = PornVideoItem()
        selector = Selector(response)
        _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
        logging.debug('PH信息的JSON:')
        logging.debug(_ph_info)
        if not _ph_info:
            logging.warning('no flashvars found in %s', response.url)
            return

        _ph_info_json = json.loads(_ph_info[0])
        for key in ('video_duration', 'video_title', 'image_url',
                    'link_url', 'quality_480p'):
            phItem[key] = _ph_info_json.get(key)

        # %-style arguments tolerate None and non-string JSON values, which
        # string concatenation would reject with a TypeError.
        logging.info('duration:%s title:%s image_url:%s link_url:%s',
                     phItem['video_duration'], phItem['video_title'],
                     phItem['image_url'], phItem['link_url'])
        yield phItem
开发者ID:ceres993434,项目名称:PornHubBot,代码行数:21,代码来源:pornHubSpider.py

示例7: parse_follow

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_follow(self, response):
        """
        Scrape the list of accounts followed by a user.

        Yields one ``Request`` per remaining page when called on page 1,
        then one ``RelationshipsItem`` per ``uid`` found on this page, with
        the user from the URL as ``fan_id`` and the listed account as
        ``followed_id``.
        """
        # On page 1, schedule all following pages in one go.
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_follow, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
        # Raw strings: "\d" is not a valid escape in a plain string literal.
        uids = re.findall(r'uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall(r'(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item
开发者ID:Superbsco,项目名称:weibo-analysis-system,代码行数:26,代码来源:weibo_spider.py

示例8: parse_fans

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_fans(self, response):
        """
        Scrape the list of a user's fans.

        Yields one ``Request`` per remaining page when called on page 1,
        then one ``RelationshipsItem`` per ``uid`` found on this page, with
        the listed account as ``fan_id`` and the user from the URL as
        ``followed_id``.
        """
        # On page 1, schedule all following pages in one go.
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
        # Raw strings: "\d" is not a valid escape in a plain string literal.
        uids = re.findall(r'uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall(r'(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item
开发者ID:Superbsco,项目名称:weibo-analysis-system,代码行数:26,代码来源:weibo_spider.py

示例9: parse_comment

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_comment(self, response):
        """Scrape the comments of one weibo page.

        Yields one ``Request`` per remaining page when called on page 1,
        then one ``CommentItem`` per comment node that links to a ``/u/``
        user page. The weibo URL is taken from ``response.meta``.
        """
        current_url = response.url
        # On page 1, schedule all following pages in one go.
        if current_url.endswith('page=1'):
            pager = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if pager:
                last_page = int(pager.group(1))
                for number in range(2, last_page + 1):
                    yield Request(
                        current_url.replace('page=1', 'page={}'.format(number)),
                        self.parse_comment,
                        dont_filter=True,
                        meta=response.meta,
                    )

        page = Selector(response)
        for node in page.xpath('//div[@class="c" and contains(@id,"C_")]'):
            author_href = node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
            if author_href:
                comment = CommentItem()
                comment['crawl_time'] = datetime.now()
                comment['weibo_url'] = response.meta['weibo_url']
                comment['comment_user_id'] = re.search(r'/u/(\d+)', author_href).group(1)
                body = node.xpath('.//span[@class="ctt"]')
                comment['content'] = body.xpath('string(.)').extract_first()
                comment['_id'] = node.xpath('./@id').extract_first()
                stamp = node.xpath('.//span[@class="ct"]/text()').extract_first()
                # Only the part before the first non-breaking space is
                # passed to time_fix.
                comment['created_at'] = time_fix(stamp.split('\xa0')[0])
                yield comment
开发者ID:Superbsco,项目名称:weibo-analysis-system,代码行数:27,代码来源:weibo_spider.py

示例10: parse

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
        """
        Return one DmozItem (name, url, description) per listed site.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html
        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        items = []

        for site in sel.xpath('//ul[@class="directory-url"]/li'):
            item = DmozItem()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            # Raw string: "\s" is not a valid escape in a plain literal.
            # The pattern matches "- <text up to the carriage return>".
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)

        return items
开发者ID:Andrew-liu,项目名称:scrapy_example,代码行数:21,代码来源:dmoz_spider.py

示例11: parse

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
        """Yield the posts of a listing page, then follow its "next" link.

        Items produced by ``self._parse_posts`` are passed through until
        ``self.should_stop`` accepts one; that item and everything after it
        are dropped and no further page is requested.

        :response: listing page response
        :returns: generator of items, optionally followed by one ``Request``
            for the next listing page

        """
        for item in self._parse_posts(response):
            if self.should_stop(item):
                return
            yield item

        next_page_url = Selector(response).css('#frs_list_pager .next::attr(href)').extract_first()
        if not next_page_url:
            # No pager, or a "next" element without a usable href.
            return
        logging.debug('next_page_url %s', next_page_url)
        # Pagination links are not always complete URLs; urljoin resolves
        # relative and protocol-relative links against the current page and
        # leaves absolute ones untouched.
        yield Request(response.urljoin(next_page_url), callback=self.parse)
开发者ID:jingzhou123,项目名称:tieba-crawler,代码行数:23,代码来源:post.py

示例12: handle_page

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def handle_page(self, response):
        """Return ``[TextItem]`` for a page, or ``[]`` when nothing usable is found.

        The title and text selectors come from ``self.get_css``. An empty
        list is returned when either selector is missing, when the title
        cannot be extracted, or when no text nodes match.
        """
        page = Selector(response)
        text_css = self.get_css("text_css")
        title_css = self.get_css("title_css")
        if not (text_css and title_css):
            return []

        entry = TextItem()
        try:
            # Any failure while reading the title discards the page.
            entry["title"] = page.css(title_css).xpath('text()').extract()[0]
        except Exception:
            return []

        entry["texts"] = page.css(text_css).xpath('text()').extract()
        return [entry] if entry["texts"] else []
开发者ID:xgfone,项目名称:snippet,代码行数:22,代码来源:textspider.py

示例13: extract_links

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def extract_links(self, response):
        """Return ``Link`` objects for the list entries and next-page links.

        The ``href`` values matched by the ``list_css`` selector are
        combined with the result of ``self.extract_next_links``. Errors
        while collecting are logged and whatever was gathered so far is
        kept. Each URL is resolved against ``response.url`` and dropped when
        the resolution yields a falsy value.
        """
        page = Selector(response)
        list_css = self.get_css("list_css")
        if not list_css:
            return []

        collected = []
        try:
            collected.extend(page.css(list_css).xpath('@href').extract())
            collected.extend(self.extract_next_links(response))
        except Exception as err:
            self.logger.error("%s" % err)

        resolved = (
            URL.s_get_full_url(URL(candidate), URL(response.url))
            for candidate in collected
        )
        return [Link(url=full_url) for full_url in resolved if full_url]
开发者ID:xgfone,项目名称:snippet,代码行数:25,代码来源:linkextractors.py

示例14: parse

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
        """Select the feed's nodes with the configured iterator and parse them.

        Raises ``NotConfigured`` when the spider has no ``parse_node``
        method and ``NotSupported`` when ``self.iterator`` is not one of
        ``'iternodes'``, ``'xml'`` or ``'html'``. Otherwise returns the
        result of ``self.parse_nodes`` for the adapted response.
        """
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        mode = self.iterator
        if mode == 'iternodes':
            nodes = self._iternodes(response)
        elif mode in ('xml', 'html'):
            # Both selector-based modes differ only in the parser type.
            selector = Selector(response, type=mode)
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
开发者ID:wistbean,项目名称:learn_python3_spider,代码行数:21,代码来源:feed.py

示例15: xmliter

# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def xmliter(obj, nodename):
    """Return an iterator of Selectors over all nodes of an XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    escaped = re.escape(nodename)
    opening_re = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % escaped, re.S)
    closing_re = re.compile(r'<\s*/%s\s*>' % escaped, re.S)
    text = _body_or_str(obj)

    # Document text before the first opening tag of the node. It is
    # re-attached to every node so each fragment parses on its own.
    opening = opening_re.search(text)
    prefix = opening.group(1).strip() if opening else ''

    # Document text after the closing tag located by re_rsearch
    # (presumably the last one -- the helper is defined elsewhere).
    closing = re_rsearch(closing_re, text)
    suffix = text[closing[1]:].strip() if closing else ''

    node_re = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': escaped}, re.DOTALL)
    node_path = '//' + nodename
    for found in node_re.finditer(text):
        fragment = prefix + found.group() + suffix
        yield Selector(text=fragment, type='xml').xpath(node_path)[0]
开发者ID:wistbean,项目名称:learn_python3_spider,代码行数:26,代码来源:iterators.py


注:本文中的scrapy.selector.Selector方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。