This article collects typical code examples of the scrapy.Selector method in Python. If you are wondering what scrapy.Selector does, how to use it, or what it looks like in practice, the curated examples below may help. You can also explore other usage examples from the scrapy module.
The following shows 15 code examples of scrapy.Selector, sorted by popularity by default.
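Before the collected examples, here is a minimal, self-contained sketch of the two ways scrapy.Selector is constructed in the snippets below: from a downloaded Response object and from a raw text fragment. The HTML string and variable names are illustrative only, not taken from any of the examples.

import scrapy

# Build a Selector from raw HTML text (the same Selector(text=...) pattern used below).
html = '<table><tr class="cells"><td>example</td><td>1.2.3.4</td><td>8080</td></tr></table>'
sel = scrapy.Selector(text=html)

# XPath/CSS queries return a SelectorList; extract() returns all matched strings,
# extract_first() returns the first match or None (get()/getall() are the newer aliases).
rows = sel.xpath('//tr[@class="cells"]').extract()      # list of HTML fragments
ip = sel.xpath('//tr/td[2]/text()').extract_first()     # '1.2.3.4'
cells = sel.css('td::text').getall()                    # ['example', '1.2.3.4', '8080']
print(rows, ip, cells)

# Inside a spider callback, Selector(response) (or simply response.selector /
# response.xpath(...)) gives the same query interface over the downloaded page.

Note that a Selector built from text always queries only that fragment, even with absolute '//...' paths; this is why several examples below re-wrap each extracted table row in Selector(text=info) before picking out individual cells.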
Example 1: parse_page
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tr[@class="cells"]').extract()
    for i, info in enumerate(infos):
        self.log(info)
        val = Selector(text=info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[5]/text()').extract_first()
        anonymity = val.xpath('//td[4]/text()').extract_first()
        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)
Example 2: parse_page
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue
        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[3]/div/text()').extract_first()
        anonymity = val.xpath('//td[6]/text()').extract_first()
        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)
Example 3: parse_page
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//ul[@class="l2"]').extract()
    for i, info in enumerate(infos):
        val = Selector(text=info)
        ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
        port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
        anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
        https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
        country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()
        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)
Example 4: _parse_general_post
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_general_post(self, post, response):
    """TODO: Docstring for _parse_general_post.
    :post: TODO
    :response: TODO
    :returns: TODO
    """
    item = Reply()
    # join the post body's text fragments into a single string
    item['body'] = ''.join(post.css('cc div::text').extract()).strip()
    item['title'] = Selector(response).css('.core_title_txt::text').extract_first()
    item['post_time'] = json.loads(
        post
        .css('::attr(data-field)')
        .extract_first()
    )['content']['date']
    return item
Example 5: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
    """TODO: Docstring for parse.
    :returns: TODO
    """
    posts = Selector(response).css('.p_postlist .l_post')
    for i, post in enumerate(posts):
        if i == 0:
            yield self._parse_main_post(post, response)
        else:
            item = self._parse_reply(post, response)
            yield item
            if item['reply_num'] != 0:  # number of comments on this reply
                self._parse_comments(post)
Example 6: _parse_user_id
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_user_id(self, response):
    """TODO: Docstring for _parse_user_id.
    :response: TODO
    :returns: 32-digit hex user id
    """
    uri = Selector(response).css('.concern_num a::attr(href)').extract_first()
    logging.debug('user id href: %s' % (uri))
    if uri:
        query_dict = parse_qs(urlparse(uri).query)
        # uri may look like: /home/concern?id=a3e3474fbda1bfb5bfecc0d6d121?t=1423636759&fr=home
        return query_dict['id'][0]
    else:
        return ''
Example 7: _parse_following_and_followed
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _parse_following_and_followed(self, response, item):
    """TODO: Docstring for _parse_following_and_followed.
    :response: TODO
    :item: item.following_num item.followed_num
    :returns: TODO
    """
    sels = Selector(response).css('.ihome_aside_title')
    for sel in sels:
        # the first text node is '他关注的人' ("people he follows") or other irrelevant text
        title = sel.css('::text').extract_first().strip()
        # logging.debug('title: %s' % (title))
        # some users follow no one or have no followers
        if title == '他关注的人' or title == '她关注的人':
            item['following_num'] = sel.css('a::text').extract_first()
        else:
            item['following_num'] = 0
        if title == '关注他的人' or title == '关注她的人':
            item['followed_num'] = sel.css('a::text').extract_first()
        else:
            item['followed_num'] = 0
    return item
Example 8: _get_next_page
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def _get_next_page(self, response):
    """TODO: Docstring for _parse_next_page.
    :response: TODO
    :returns: TODO
    """
    # logging.debug('beginning parsing next page if existed..')
    meta = response.meta
    anchor_sels = Selector(response).css('.j_pager a')
    next_page = 1
    # logging.debug('anchor selectors: %r' % (anchor_sels))
    for sel in anchor_sels:
        # logging.debug('pager anchor text: %s' % (sel.css('::text').extract_first()))
        if sel.css('::text').extract_first() == '下一页':  # '下一页' means "next page"
            next_page = sel.css('::attr(href)').extract_first()[1:]
            logging.debug('next page num: %s' % (next_page))
    return int(next_page)
Example 9: get_help
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def get_help(self):
    b = []
    b.append("Available Scrapy objects:")
    b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
    for k, v in sorted(self.vars.items()):
        if self._is_relevant(v):
            b.append("  %-10s %s" % (k, v))
    b.append("Useful shortcuts:")
    if self.inthread:
        b.append("  fetch(url[, redirect=True]) "
                 "Fetch URL and update local objects "
                 "(by default, redirects are followed)")
    b.append("  fetch(req)                  "
             "Fetch a scrapy.Request and update local objects ")
    b.append("  shelp()           Shell help (print this help)")
    b.append("  view(response)    View response in a browser")
    return "\n".join("[s] %s" % l for l in b)
Example 10: download_sp500_price
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def download_sp500_price(self, response):
    trs = response.xpath('//*[@id="datatable"]/tr').extract()
    price_jsons = []
    try:
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]
            price_jsons.append({"timestamp": to_time_str(tds[0]),
                                "close": to_float(tds[1])})
        if price_jsons:
            self.df_close = self.df_close.append(price_jsons, ignore_index=True)
            self.df_close = index_df_with_time(self.df_close)
    except Exception as e:
        self.logger.exception('error when getting sp500 price url={} error={}'.format(response.url, e))
Example 11: parse_user_0
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_user_0(self, response):
    """ Crawl profile info, part 1: number of posts, followings, and fans """
    user_item = UserItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)   # number of posts
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of followings
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)     # number of fans
        if num_tweets:
            user_item["ctweets"] = int(num_tweets[0])
        if num_follows:
            user_item["cfollows"] = int(num_follows[0])
        if num_fans:
            user_item["cfans"] = int(num_fans[0])
        user_item["_id"] = response.meta["user_id"]
        url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
        yield Request(url=url_information1, meta={"item": user_item}, callback=self.parse_user_1)
Example 12: parse_user_1
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_user_1(self, response):
    """ Crawl profile info, part 2 """
    user_item = response.meta["item"]
    selector = Selector(response)
    # join all text() nodes of the matched tags into one string
    text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())
    nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)              # nickname
    intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)                 # introduction
    auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)                  # verification info
    gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)                # gender
    place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)                 # region (province and city)
    birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)              # birthday
    sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
    marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # relationship status
    url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)             # homepage link
    if nickname:
        user_item["nickname"] = nickname[0]
    if auth:
        user_item["auth"] = auth[0]
    if intro:
        user_item["intro"] = intro[0]
    user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    yield user_item
Example 13: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
    print('url:', response.url)
    # Escape stray '<<+' / '<+' sequences so the markup can be parsed.
    # Note: in the copied snippet the replacement targets were identical to the
    # originals (a no-op), most likely because the '&lt;' entities were unescaped
    # when the code was published; '&lt;' escaping is assumed to be the intent.
    body = response.body.replace(b'<<+', b'&lt;&lt;+').replace(b'<+', b'&lt;+')
    selector = scrapy.Selector(text=body.decode('utf-8'))
    i = 1
    for x in selector.css('.elem::text').extract():
        if 'Elements' in x:
            print('---', i, '---')
            i += 1
        else:
            print(x)

# --- it runs without a project and saves results in `output.csv` ---
Example 14: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse(self, response):
    item = DmozItem()
    sel = scrapy.Selector(response)
    conn = pymssql.connect(host="121.42.136.4", user="sa", password="koala19920716!@#", database="test")
    cursor = conn.cursor()
    sites = sel.xpath("//dl[@id='clist']/dd/a/text()").extract()
    item['title'] = [n.encode('utf-8') for n in sites]
    yield item
    # sql = "select ID,CityName from Cities"
    # cursor.execute(sql)
    # for (ID, CityName) in cursor.fetchall():
    #     print ID
    for name in item['title']:
        # print name
        # parameterized query instead of string concatenation, to avoid SQL injection
        sql = "insert into Cities(CityName) values (%s)"
        cursor.execute(sql, (name,))
    conn.commit()
Example 15: parse_page
# Required module: import scrapy [as alias]
# Or: from scrapy import Selector [as alias]
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue
        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[6]/text()').extract_first()
        anonymity = val.xpath('//td[3]/text()').extract_first()
        https = val.xpath('//td[4]/text()').extract_first()
        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )
        self.add_proxy(proxy=proxy)