This article collects typical usage examples of scrapy.selector.HtmlXPathSelector in Python. If you are wondering what HtmlXPathSelector does or how to use it, the curated examples below may help; you can also explore the scrapy.selector module it belongs to. Note that HtmlXPathSelector is a class rather than a method, and later Scrapy releases deprecate it in favor of scrapy.selector.Selector (with .xpath() replacing .select()).
Below are 12 code examples of selector.HtmlXPathSelector, sorted by popularity by default.
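As a quick orientation before the examples: an HtmlXPathSelector wraps a Response, select() evaluates an XPath expression against it, and extract() returns the matches as a list of strings. A minimal sketch, assuming a Scrapy 0.x release where HtmlXPathSelector and BaseSpider still exist (the URL and XPath are illustrative only):

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MinimalSpider(BaseSpider):
    name = "minimal"
    start_urls = ["http://example.com/"]  # illustrative URL

    def parse(self, response):
        hxs = HtmlXPathSelector(response)              # wrap the response
        titles = hxs.select('//h1/text()').extract()   # -> list of strings
        for title in titles:
            self.log("found title: %s" % title)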
Example 1: parse
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
    """
    The lines below form a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html
    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        item['description'] = site.select('text()').re(r'-\s([^\n]*?)\n')
        items.append(item)
    return items
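A side note on the docstring above: the @url and @scrapes lines are Scrapy spider contracts, and they can be exercised from the command line with `scrapy check <spidername>`.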
Example 2: parse
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
    self.log("OK,%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # Follow the link of each article
    divs = hxs.select('//div[@class="publicLeftCon mt10"]')
    for div in divs:
        url = div.select('h5/a/@href').extract()[0]
        yield self.make_requests_from_url(url).replace(callback=self.parse_content)
    # Follow the next-page link
    try:
        next_url = \
            hxs.select('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0]
    except Exception:
        return
    next_url = 'http://article.yeeyan.org' + next_url
    # if self.count == 10:
    #     return
    # self.count += 1
    yield self.make_requests_from_url(next_url).replace(callback=self.parse)
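One remark on the request-building pattern here: make_requests_from_url is a BaseSpider helper that wraps a URL in a Request using the spider's default callback, and Request.replace() then swaps the callback out. Assuming scrapy.http.Request is imported, a more direct equivalent would be:

yield Request(url, callback=self.parse_content)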
# Extract the article content
Example 3: parse_content
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_content(self, response):
    hxs = HtmlXPathSelector(response)
    item = YeeyanItem()
    if hxs.select('//a[@class="jx_logo"]/text()'):
        item = self.parse_jx(item, response)
    else:
        item['url'] = response.url
        item['title'] = hxs.select('//title/text()').extract()[0].split('|')[1].strip()
        div = hxs.select('//div[@class="user_info"]')
        item['author'] = div.select('.//h2/a/text()').extract()[0]
        item['excerpt'] = hxs.select('//p[@class="excerpt"]/text()').extract()
        if item['excerpt']:
            item['excerpt'] = item['excerpt'][0]
        else:
            item['excerpt'] = ''
        item['content_html'] = hxs.select('//div[@id="conBox"]').extract()[0]
        item['release_time'] = div.select('.//p/text()').extract()[0].strip()[1:-7]
        item['category'] = hxs.select('//div[@class="crumb"]/a/text()').extract()[1]
    return item
# Handle featured (jingxuan) articles
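The parse_jx helper invoked above for featured articles is shown separately as Example 10 below.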
Example 4: parse
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
    """
    Default parse method; the crawl rules are not used here.
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
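This snippet depends on several names defined elsewhere in the project (HtmlParser, determine_level, save_to_file_system, get_follow_links). The remaining imports would plausibly look like the following, guessing from the Python 2 era urllib.unquote_plus call; treat these paths as assumptions:

# assumed imports for Example 4 (Python 2 / old Scrapy)
import urllib
from scrapy import log
from scrapy.http import Request
from BeautifulSoup import UnicodeDammit  # bs4 equivalent: from bs4 import UnicodeDammit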
Example 5: parse_page
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_page(response):
    hxs = HtmlXPathSelector(response)
    item = LotteryticketItem()
    # Issue number
    title = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[1]/text()').extract()[0]
    item['title'] = filter(str.isdigit, ("".join(title.split()).encode("utf-8")))
    # Red ball section
    red1 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[1]/font/text()').extract()[0]
    item['red1'] = filter(str.isdigit, ("".join(red1.split()).encode("utf-8")))
    red2 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[2]/font/text()').extract()[0]
    item['red2'] = filter(str.isdigit, ("".join(red2.split()).encode("utf-8")))
    red3 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[3]/font/text()').extract()[0]
    item['red3'] = filter(str.isdigit, ("".join(red3.split()).encode("utf-8")))
    red4 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[4]/font/text()').extract()[0]
    item['red4'] = filter(str.isdigit, ("".join(red4.split()).encode("utf-8")))
    red5 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[5]/font/text()').extract()[0]
    item['red5'] = filter(str.isdigit, ("".join(red5.split()).encode("utf-8")))
    red6 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[6]/font/text()').extract()[0]
    item['red6'] = filter(str.isdigit, ("".join(red6.split()).encode("utf-8")))
    # Blue ball section
    blue = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[7]/font/text()').extract()[0]
    item['blue'] = filter(str.isdigit, ("".join(blue.split()).encode("utf-8")))
    # Draw date
    created_at = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[2]/text()').extract()[0]
    item['created_at'] = ("".join(created_at.split()).encode("utf-8"))[0:10]
    return item
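The filter(str.isdigit, ...) calls are a Python 2 idiom: applied to a byte string, filter returns a new string containing only the digit characters. Under Python 3, where filter returns an iterator, an equivalent would be:

# Python 3 equivalent of the digit-extraction idiom above
digits = "".join(ch for ch in title if ch.isdigit())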
Example 6: parse_torrent
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_torrent(self, response):
    x = HtmlXPathSelector(response)
    # As published, the snippet never instantiates `torrent`; an item class
    # with these fields must be created first (the name below is assumed).
    torrent = TorrentItem()
    torrent['url'] = response.url
    torrent['description'] = x.select("//span[@id='lblDescription']/text()").extract()
    torrent['jurisdictiontype'] = x.select("//span[@id='lblJurisdictionType']").extract()
    torrent['agency'] = x.select("//span[@id='lblUmbrellaAgency']/text()").extract()
    torrent['contactinfo'] = x.select("//span[@id='lblContact']/p/text()").extract()
    torrent['links'] = x.select("//span[@id='lblContacts']/p/a/@href").extract()
    return torrent
Example 7: parse
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse(self, response):
    x = HtmlXPathSelector(response)
    links = []
    url = response.url
    music_links = x.select('//ul/li/a/@href').extract()
    music_links = [m for m in music_links if m.endswith(".mid")]
    for l in music_links:
        link = MIDIFile()
        link['url'] = url
        link['ltype'] = self.ltype
        link['link'] = l
        link["file_urls"] = [l]
        links.append(link)
    return links
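Setting file_urls on an item is the hook for Scrapy's files pipeline, which downloads each listed URL and records the results on the item. The wiring, sketched for a modern project layout (in old releases the pipeline lived at scrapy.contrib.pipeline.files.FilesPipeline):

# settings.py sketch
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = '/path/to/downloads'  # illustrative path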
Example 8: process_response
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def process_response(self, request, response, spider):
    url = response.url
    if response.status in [301, 307]:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response
    interval, redirect_url = get_meta_refresh(response)
    # handle meta redirect
    if redirect_url:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'meta'
        return self._retry(request, reason, spider) or response
    hxs = HtmlXPathSelector(response)
    # test for captcha page
    captcha = hxs.select(".//input[contains(@id, 'captchacharacters')]").extract()
    if captcha:
        log.msg("captcha page %s" % url, level=log.INFO)
        reason = 'captcha'
        return self._retry(request, reason, spider) or response
    return response
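The self._retry call implies this middleware subclasses Scrapy's RetryMiddleware, which supplies the retry bookkeeping. A plausible skeleton around the method above, using the old-style import paths contemporary with HtmlXPathSelector (the class name is assumed):

from scrapy import log
from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
from scrapy.utils.response import get_meta_refresh
from scrapy.selector import HtmlXPathSelector

class CaptchaDetectionMiddleware(RetryMiddleware):  # name assumed
    def process_response(self, request, response, spider):
        ...  # body as in Example 8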
Example 9: parse_item
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_item(self, response):
    print response.body
    hxs = HtmlXPathSelector(response)
    i = DmozItem()
    i['id'] = hxs.select('//input[@id="sid"]/@value').extract()
    i['title'] = hxs.select('//div[@id="name"]').extract()
    i['desc'] = hxs.select('//div[@id="description"]').extract()
    return i
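Note that select(...).extract() always returns a list, so the id, title and desc fields above hold lists of matched strings rather than scalars; downstream code has to index into or join them.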
Example 10: parse_jx
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_jx(self, item, response):
    hxs = HtmlXPathSelector(response)
    item['url'] = response.url
    item['title'] = hxs.select('//title/text()').extract()[0].split('|')[1].strip()
    div = hxs.select('//div[@class="jxar_author"]')
    item['author'] = div.select('.//a/text()').extract()[0]
    item['release_time'] = hxs.select('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
    try:
        item['excerpt'] = hxs.select('//p[@class="jxa_intro"]/text()').extract()[0]
    except Exception:
        item['excerpt'] = None
    item['category'] = hxs.select('//p[@class="jxa_map"]/text()').extract()[1].split()[1]
    item['content_html'] = hxs.select('//div[@class="jxa_content"]').extract()[0]
    return item
Example 11: parse_detail
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_detail(self, response):
    woaidu_item = WoaiduCrawlerItem()
    response_selector = HtmlXPathSelector(response)
    woaidu_item['book_name'] = list_first_item(
        response_selector.select('//div[@class="zizida"][1]/text()').extract())
    woaidu_item['author'] = [
        list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(),
    ]
    woaidu_item['book_description'] = list_first_item(
        response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
    woaidu_item['book_covor_image_url'] = list_first_item(
        response_selector.select('//div[@class="hong"][1]/img/@src').extract())
    download = []
    for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
        download_item = {}
        download_item['url'] = strip_null(
            deduplication([
                list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                list_first_item(i.select('./div')[1].select('./a/@href').extract()),
            ])
        )
        download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
        download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
        download_item['source_site'] = [
            list_first_item(i.select('./div')[4].select('./a/text()').extract()),
            list_first_item(i.select('./div')[4].select('./a/@href').extract()),
        ]
        download.append(download_item)
    woaidu_item['book_download'] = download
    woaidu_item['original_url'] = response.url
    yield woaidu_item
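This spider leans on three small utilities (list_first_item, strip_null, deduplication) defined elsewhere in the project. Judging purely from how they are called, plausible implementations would be:

# assumed helper implementations, inferred from usage in Example 11
def list_first_item(lst):
    return lst[0] if lst else None

def strip_null(lst):
    return [x for x in lst if x]

def deduplication(lst):
    seen, result = set(), []
    for x in lst:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return result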
Example 12: parse_item
# Required import: from scrapy import selector [as alias]
# Or: from scrapy.selector import HtmlXPathSelector [as alias]
def parse_item(self, response):
    self._log_page(response, 'after_login.html')
    hxs = HtmlXPathSelector(response)
    report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
    for report_url in report_urls:
        # print "list:" + report_url
        yield Request(self._ab_path(response, report_url),
                      headers=self.headers,
                      meta={'cookiejar': response.meta['cookiejar']},
                      callback=self.parse_report)
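The cookiejar meta key here is Scrapy's built-in way to keep multiple cookie sessions separate: requests carrying the same value share one cookie jar. It has to be seeded on an earlier request, for example (login URL illustrative):

# earlier in the same spider: open a dedicated cookie session
yield Request('http://example.com/accounts/login',
              meta={'cookiejar': 1}, callback=self.parse_item)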