本文整理汇总了Python中scrapy.selector.Selector方法的典型用法代码示例。如果您正苦于以下问题:Python selector.Selector方法的具体用法?Python selector.Selector怎么用?Python selector.Selector使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类scrapy.selector
的用法示例。
在下文中一共展示了selector.Selector方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_page
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_page(self, response):
    """Extract one proxy per table row from a listing page and register it."""
    rows = Selector(text=response.body).xpath('//tr[@class="odd"]').extract()
    for row in rows:
        # Re-wrap each row fragment so td indexes are relative to the row.
        row_sel = Selector(text=row)
        proxy = Proxy()
        proxy.set_value(
            ip=row_sel.xpath('//td[2]/text()').extract_first(),
            port=row_sel.xpath('//td[3]/text()').extract_first(),
            country=row_sel.xpath('//td[4]/a/text()').extract_first(),
            anonymity=row_sel.xpath('//td[5]/text()').extract_first(),
            source=self.name,
        )
        self.add_proxy(proxy=proxy)
示例2: parse_1
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_1(self, response):
    """Parse a scholar result listing and request every entry that links a [PDF].

    Yields a Request per PDF URL, handled by ``self.save_pdf``.
    """
    info('Parse ' + response.url)
    # parse_with_rules returns a list of dicts keyed by the CSS rule that
    # produced them; the '.gs_r' rule holds the per-result entries.
    rows = self.parse_with_rules(response, self.list_css_rules, dict)
    items = rows[0]['.gs_r'] if rows else []
    pp.pprint(items)
    # NOTE(review): removed leftover `import pdb; pdb.set_trace()` debug
    # breakpoints — they block the spider when run unattended.
    for item in items:
        # Only follow entries that advertise a direct PDF link.
        if item['related-url'] == '' or item['related-type'] != '[PDF]':
            continue
        url = item['related-url']
        info('pdf-url: ' + url)
        yield Request(url, callback=self.save_pdf)
示例3: parse_item
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_item(self, response):
    """Scrape one Douban movie detail page into a DoubanmovieItem."""
    sel = Selector(response)

    def first(xpath):
        # Indexing [0] keeps the original fail-fast behaviour when a
        # required node is missing from the page.
        return sel.xpath(xpath).extract()[0]

    item = DoubanmovieItem()
    item['title'] = first('//*[@id="content"]/h1/span[1]/text()')
    item['date'] = first('//*[@id="content"]/h1/span[2]/text()')
    item['commit_num'] = first(
        '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
    item['star'] = first(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
    item['director'] = first(
        '//*[@id="info"]/span[1]/span[2]/a/text()')
    item['screenwriter'] = first(
        '//*[@id="info"]/span[2]/span[2]/a/text()')
    return item
示例4: google_parse
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def google_parse(html):
    """Parse a Google results page into [{'url', 'content', 'title'}, ...]."""
    results = []
    for hit in Selector(text=html).css('div.g'):
        title = ''.join(hit.css('h3').css('*::text').extract())
        content = ''.join(hit.css('span.st').css('*::text').extract())
        hrefs = hit.css('*.r a::attr(href)').extract()
        try:
            target = re.findall('(http.*)', hrefs[0])
            # Strip tracking query parameters after the first '&'.
            target = re.sub('&.*', '', target[0])
        except Exception:
            # Rows without a usable link are skipped silently.
            continue
        results.append({
            'url': target,
            'content': content,
            'title': title,
        })
    return results
# url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
# html = getHTMLText(url)
# print(baidu_parse(html))
示例5: parse_ph_key
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_ph_key(self, response):
    """Collect video viewkeys from a listing page, then follow pagination."""
    sel = Selector(response)
    logging.debug('request url:------>' + response.url)
    for div in sel.xpath('//div[@class="phimage"]'):
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                      callback=self.parse_ph_info)
    next_href = sel.xpath(
        '//a[@class="orangeButton" and text()="Next"]/@href').extract()
    if next_href:
        logging.debug(' next page:---------->' + self.host + next_href[0])
        yield Request(url=self.host + next_href[0], callback=self.parse_ph_key)
# self.test = False
示例6: parse_ph_info
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_ph_info(self, response):
    """Extract video metadata from the flashvars JSON embedded in the page.

    Yields one PornVideoItem populated from the player configuration.
    """
    selector = Selector(response)
    # The player config lives in a `flashvars_<id> = {...};` assignment.
    _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
    logging.debug('PH信息的JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])

    phItem = PornVideoItem()
    phItem['video_duration'] = _ph_info_json.get('video_duration')
    phItem['video_title'] = _ph_info_json.get('video_title')
    phItem['image_url'] = _ph_info_json.get('image_url')
    phItem['link_url'] = _ph_info_json.get('link_url')
    phItem['quality_480p'] = _ph_info_json.get('quality_480p')
    # Lazy %-style args instead of '+' concatenation: the original raised
    # TypeError whenever any field was absent (None) or non-string.
    logging.info('duration:%s title:%s image_url:%s link_url:%s',
                 phItem['video_duration'], phItem['video_title'],
                 phItem['image_url'], phItem['link_url'])
    yield phItem
示例7: parse_follow
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_follow(self, response):
    """Crawl one page of a user's follow list.

    On page 1, schedules every remaining page in a single pass, then emits
    one RelationshipsItem per followed uid found on the current page.
    """
    if response.url.endswith('page=1'):
        total = re.search(r'/> 1/(\d+)页</div>', response.text)
        if total:
            for page_num in range(2, int(total.group(1)) + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_follow, dont_filter=True,
                              meta=response.meta)
    urls = Selector(response).xpath(
        '//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
    follower_id = re.findall('(\d+)/follow', response.url)[0]
    for uid in re.findall('uid=(\d+)', ";".join(urls), re.S):
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = datetime.now()
        relationships_item["fan_id"] = follower_id
        relationships_item["followed_id"] = uid
        relationships_item["_id"] = follower_id + '-' + uid
        yield relationships_item
示例8: parse_fans
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_fans(self, response):
    """Crawl one page of a user's fan list.

    On page 1, schedules every remaining page in a single pass, then emits
    one RelationshipsItem per fan uid found on the current page.
    """
    if response.url.endswith('page=1'):
        total = re.search(r'/> 1/(\d+)页</div>', response.text)
        if total:
            for page_num in range(2, int(total.group(1)) + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_fans, dont_filter=True,
                              meta=response.meta)
    urls = Selector(response).xpath(
        '//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
    followed_id = re.findall('(\d+)/fans', response.url)[0]
    for uid in re.findall('uid=(\d+)', ";".join(urls), re.S):
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = datetime.now()
        relationships_item["fan_id"] = uid
        relationships_item["followed_id"] = followed_id
        relationships_item["_id"] = uid + '-' + followed_id
        yield relationships_item
示例9: parse_comment
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse_comment(self, response):
    """Parse weibo comments on the current page.

    On page 1, schedules every remaining page first, then yields one
    CommentItem per comment node that links a commenting user.
    """
    if response.url.endswith('page=1'):
        total = re.search(r'/> 1/(\d+)页</div>', response.text)
        if total:
            for page_num in range(2, int(total.group(1)) + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True,
                              meta=response.meta)
    nodes = Selector(response).xpath('//div[@class="c" and contains(@id,"C_")]')
    for node in nodes:
        user_url = node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
        if not user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = datetime.now()
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(r'/u/(\d+)', user_url).group(1)
        comment_item['content'] = node.xpath(
            './/span[@class="ctt"]').xpath('string(.)').extract_first()
        comment_item['_id'] = node.xpath('./@id').extract_first()
        # The timestamp precedes a non-breaking space in the "ct" span.
        created_at = node.xpath('.//span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        yield comment_item
示例10: parse
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
    """
    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html
    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    selector = Selector(response)
    items = []
    for site in selector.xpath('//ul[@class="directory-url"]/li'):
        item = DmozItem()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        items.append(item)
    return items
示例11: parse
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
    """Yield new posts from a tieba listing page, then follow pagination.

    Stops as soon as ``should_stop`` flags an already-seen item.
    """
    for item in self._parse_posts(response):
        if self.should_stop(item):
            return
        yield item
    # Build the selector once (the original constructed it twice) and go
    # straight to the href: this also avoids an AttributeError when the
    # `.next` element exists but carries no href (extract_first -> None).
    next_page_url = Selector(response).css(
        '#frs_list_pager .next::attr(href)').extract_first()
    if next_page_url:
        logging.debug('next_page_url %s', next_page_url)
        # Some tieba pager links are not full URLs and need the host prefixed.
        if 'http://tieba.baidu.com' in next_page_url:
            yield Request(next_page_url, callback=self.parse)
        else:
            yield Request('http://tieba.baidu.com' + next_page_url,
                          callback=self.parse)
示例12: handle_page
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def handle_page(self, response):
    """Build a TextItem from the configured title/text CSS selectors.

    Returns a single-item list, or [] when a selector is unconfigured,
    the title is missing, or no text nodes match.
    """
    selector = Selector(response)
    text_css = self.get_css("text_css")
    title_css = self.get_css("title_css")
    if not (text_css and title_css):
        return []
    item = TextItem()
    try:
        item["title"] = selector.css(title_css).xpath('text()').extract()[0]
    except Exception:
        # A page without a title is discarded rather than half-filled.
        return []
    item["texts"] = selector.css(text_css).xpath('text()').extract()
    return [item] if item["texts"] else []
示例13: extract_links
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def extract_links(self, response):
    """Collect list-page and next-page links, normalized to Link objects."""
    list_css = self.get_css("list_css")
    if not list_css:
        return []
    candidates = []
    try:
        candidates = list(Selector(response).css(list_css).xpath('@href').extract())
        candidates.extend(self.extract_next_links(response))
    except Exception as err:
        # Extraction failures are logged; whatever was gathered is kept.
        self.logger.error("%s" % err)
    links = []
    for href in candidates:
        full_url = URL.s_get_full_url(URL(href), URL(response.url))
        if full_url:
            links.append(Link(url=full_url))
    return links
示例14: parse
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def parse(self, response):
    """Select the configured node iterator and delegate to parse_nodes."""
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator in ('xml', 'html'):
        # Both branches are identical apart from the selector type.
        selector = Selector(response, type=self.iterator)
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
示例15: xmliter
# 需要导入模块: from scrapy import selector [as 别名]
# 或者: from scrapy.selector import Selector [as 别名]
def xmliter(obj, nodename):
    """Return an iterator of Selectors over all nodes of an XML document,
    given the name of the node to iterate. Useful for parsing XML feeds.
    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    escaped = re.escape(nodename)
    header_start_re = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % escaped, re.S)
    header_end_re = re.compile(r'<\s*/%s\s*>' % escaped, re.S)
    text = _body_or_str(obj)

    # Everything before the first node / after the last closing tag is
    # preserved so each extracted node parses with the document's prolog.
    start_match = re.search(header_start_re, text)
    prefix = start_match.group(1).strip() if start_match else ''
    end_match = re_rsearch(header_end_re, text)
    suffix = text[end_match[1]:].strip() if end_match else ''

    node_re = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': escaped}, re.DOTALL)
    for match in node_re.finditer(text):
        nodetext = prefix + match.group() + suffix
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]