This article collects typical usage examples of the Python method scrapy.loader.ItemLoader.add_value: what the method does, how to call it, and what it looks like in real projects. You may also want to explore the containing class, scrapy.loader.ItemLoader, for more context.
Below are 15 code examples of ItemLoader.add_value, sorted by popularity by default.
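As a quick orientation before the examples, here is a minimal, self-contained sketch of the pattern they all share. The item class and field names (PageItem, title, url, crawled_at) are invented for illustration; add_xpath extracts data from the response, while add_value stores an already-computed Python value in the item.

import datetime

import scrapy
from scrapy.loader import ItemLoader

class PageItem(scrapy.Item):
    # hypothetical item definition, for illustration only
    title = scrapy.Field()
    url = scrapy.Field()
    crawled_at = scrapy.Field()

class PageSpider(scrapy.Spider):
    name = "page_example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        l = ItemLoader(item=PageItem(), response=response)
        l.add_xpath("title", "//title/text()")   # extracted from the page
        l.add_value("url", response.url)         # literal value, no extraction
        l.add_value("crawled_at", datetime.datetime.now())
        return l.load_item()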
Example 1: parse_item
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_item(self, response):
    """ This function parses a property page.
    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)
    # Load fields using XPath expressions
    l.add_xpath("title", '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
    l.add_xpath(
        "price", './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(",", ""), float), re="[,.0-9]+"
    )
    l.add_xpath("description", '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
    l.add_xpath("address", '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
    l.add_xpath(
        "image_urls", '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i))
    )
    # Housekeeping fields
    l.add_value("url", response.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("date", datetime.datetime.now())
    return l.load_item()
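The @url, @returns and @scrapes lines in the docstring above are Scrapy contracts: running scrapy check <spider> fetches the sample URL and verifies that the callback returns exactly one item populating the listed fields.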
Example 2: parse_link_page
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_link_page(response):
    for post in response.xpath('//div[@data-type="link"]'):
        l = ItemLoader(RedditPostItem(), selector=post)
        post_root_xpath = './div[contains(@class, "entry")]'
        title = post_root_xpath + '/p[@class="title"]'
        tagline = post_root_xpath + '/p[@class="tagline"]'
        buttons = post_root_xpath + '/ul'
        l.add_xpath('title', title + '/a/text()')
        l.add_xpath('link', title + '/a/@href')
        l.add_xpath('poster', tagline + '/a[contains(@class, "author")]/text()')
        l.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        l.add_xpath('number_of_comments', buttons + '//a[contains(@class, "comments")]/text()')
        l.add_xpath('comments_link', buttons + '//a[contains(@class, "comments")]/@href')
        l.add_xpath('subreddit', './@data-subreddit')
        l.add_xpath('post_timestamp', tagline + '/time/@datetime')
        l.add_value('scrape_timestamp', datetime.datetime.now())
        item = l.load_item()
        # if there are any comments for the post, go scrape them;
        # otherwise the item is complete and can be yielded as-is
        item["comments"] = []
        if item["number_of_comments"] > 0:
            yield scrapy.Request(item["comments_link"] + "?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        else:
            yield item
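The parse_comments callback referenced above is not included in this example. Under the assumption that it recovers the partially filled item from response.meta, a minimal sketch might look like this (the CSS selector for comment bodies is a guess, not Reddit's actual markup):

def parse_comments(response):
    # recover the partially filled item passed along in Request.meta
    item = response.meta['item']
    # hypothetical selector for comment text
    item['comments'] = response.css('div.commentarea .md p::text').extract()
    yield item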
Example 3: parse
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse(self, response):
    l = ItemLoader(item=RentalItem(), response=response)
    l.add_xpath('price', '//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_xpath('adress', '//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()')
    l.add_value('url', response.url)
    return l.load_item()
Example 4: parse_item
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
Example 5: parse_image
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_image(self, response):
    logger.info("Collecting page data: %s ..." % response.url)
    loader = ItemLoader(item=MeiTuItem(), response=response)
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
    loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
    loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
    loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
    loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")
    try:
        loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")
    except ValueError:
        pass
    loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
    loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
    loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
    loader.add_value("page_url", response.url)
    yield loader.load_item()
Example 6: parse_item
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_item(self, response):
    sel = response.css("div.path")
    loader = ItemLoader(item=SeriesItem(), selector=sel)
    loader.add_css("series_id", "a:last-child::attr(href)")
    loader.add_css("series_name", "a:last-child::text")
    series = loader.load_item()
    print(series)
    # Upcoming & currently on sale
    for sel in response.css("div.interval01-list-cars-infor"):
        loader = ItemLoader(item=ModelItem(), selector=sel)
        loader.add_css("model_id", "a::attr(href)")
        loader.add_css("model_name", "a::text")
        loader.add_value("series_id", series['series_id'])
        loader.add_value("series_name", series['series_name'])
        yield loader.load_item()
    # No longer on sale
    url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"
    years = response.css(".dropdown-content a::attr(data)")
    for year in years.extract():
        qs = {
            "y": year,
            "s": series["series_id"]
        }
        yield Request(url + "?" + urlencode(qs), self.stop_sale)
Example 7: parse_content_page
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_content_page(self, response):
    # Detect if this is a redirection page
    m = redirect_re.search(response.body)
    if m:
        import requests
        new_url = m.group(1)
        new_content = requests.get(new_url).content
        response = scrapy.http.HtmlResponse(new_url, body=new_content)
    # Start scraping
    il = ItemLoader(item=LuliItem(), response=response)
    il.add_css('content', 'div#articleNew > p::text')
    il.add_css('content', 'div[itemprop="articleBody"] > p')
    il.add_css('date', 'div#articleDate::text')
    il.add_css('date', 'header > time[datetime]::attr(datetime)')
    il.add_css('title', 'div#articleNew > h1::text')
    il.add_css('title', 'h1[itemprop="headline"]::text')
    il.add_value('url', response.url)
    item = il.load_item()
    yield item
Example 8: parse_colleagues
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_colleagues(self, response, author_id):
    self.logger.info('Parsing colleagues for author %s.' % author_id)
    # get all authors listed
    num_authors = 0
    for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'):
        num_authors += 1
        name_xp = './*[@class="gsc_1usr_name"]/text()'
        id_val = urlparse.parse_qs(urlparse.urlparse(div.xpath('//*[@id="gsc_ccl"]/div[1]/div[2]/h3/a/@href').extract_first()).query)['user']
        cited_by_xp = './*[@class="gsc_1_usr_cby"]/text()'
        fos_xp = './/a[@class="gsc_co_int"]/@href'  # --> ["foo", "bar", ...]
        # load general author item for colleague
        co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div)
        co_auth.add_value('id', id_val)
        co_auth.add_xpath('name', name_xp)
        co_auth.add_xpath('cited', cited_by_xp)
        co_auth.add_xpath('fos', fos_xp)
        yield co_auth.load_item()
        # load co-authorship
        relation = [author_id, id_val]
        relation.sort()
        co_rel = ItemLoader(item=CoAuthorItem(), response=response)
        co_rel.add_value('author1', relation[0])
        co_rel.add_value('author2', relation[1])
        yield co_rel.load_item()
    self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id))
    next_url = self.choose_next()
    if next_url:
        yield Request(url=next_url)
Example 9: _parse
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def _parse(self, response):
    l = ItemLoader(item=BookmarksItem(), response=response)
    l.add_xpath(u"name", u"/html/head/title")
    l.add_xpath(u"anchors", u"//a/@href")
    l.add_xpath(u"description", u"/html/body/text()")
    l.add_value(u"last_updated", datetime.datetime.now())  # you can also use literal values
    return l.load_item()
Example 10: parse
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse(self, response):
    match = re.search('/displaySeminarList/', response.url)
    if match:
        urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse)
    else:
        table = response.xpath(self.seminar_list_xpath)
        corpId = parse_qs(urlparse(response.url).query)['corpId']
        for index, semi in enumerate(table):
            loader = ItemLoader(SeminarItem(), semi)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.add_value('companyid', corpId)
            loader.add_xpath('name', '//div[@id="headerWrap"]//h3/text()')
            loader.add_xpath('date', './/td[@class="date"]/text()', re=r'\d+/\d+/\d+')
            loader.add_xpath('time', './/td[@class="time"]/text()')
            loader.add_xpath('area', './/td[@class="area"]/text()')
            loader.add_xpath('place', './/td[@class="place"]/text()')
            loader.add_xpath('loc_n', './/td[@class="place"]//a', re=r'mycom_loc\|(\d+/\d+/\d+\.\d+),\d+/\d+/\d+\.\d+')
            loader.add_xpath('loc_e', './/td[@class="place"]//a', re=r'mycom_loc\|\d+/\d+/\d+\.\d+,(\d+/\d+/\d+\.\d+)')
            loader.add_xpath('target', './/td[@class="target"]/text()')
            yield loader.load_item()
Example 11: parse_info
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_info(self, response):
    loaderJob = ItemLoader(item=JobInfoItem(), response=response)
    loaderCom = ItemLoader(item=ComInfoItem(), response=response)
    loaderJob.add_value('url', value=response.url)
    loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL())
    loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL())
    loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|'))
    divs = '//ul[@class="terminal-ul clearfix"]/li'
    loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)')
    loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|'))
    loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)')
    loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)')
    loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL())
    loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL())
    loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL())
    divs = '//div[@class="company-box"]/ul/li'
    loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司规模[:,:]).*')
    loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司性质[:,:]).*')
    loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司行业[:,:]).*')
    loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|'))
    loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司主页[:,:]).*')
    loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)')
    return loaderJob.load_item(), loaderCom.load_item()
Example 12: get_player_info
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def get_player_info(self, response):
    loader = ItemLoader(item=NFL_Player_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()
    number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()
    if number_and_position:
        number_and_position = number_and_position[0]
        number = number_and_position.split()[0]
        position = number_and_position.split()[1]
    else:
        number = ''
        position = ''
    loader.add_value('number', number)
    loader.add_value('position', position)
    loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
    loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')
    yield loader.load_item()
Example 13: parse_titles
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_titles(self, response):
    loader = ItemLoader(item=BlogCategory(), response=response)
    loader.add_value('hub', response.meta['hname'])
    loader.add_css('title', 'div.company_post h1 span::text')
    loader.add_css('date', 'div.published::text')
    loader.add_css('article', 'div.content::text')
    yield loader.load_item()
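This callback reads response.meta['hname'], so the request that led here must have set that key. A sketch of the presumed originating request, where hub_url and hub_name are assumed names:

yield scrapy.Request(hub_url,
                     callback=self.parse_titles,
                     meta={'hname': hub_name})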
Example 14: parse
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse(self, response):
    """ This function parses the categories and their subcategories on a Google Scholar web page.
    @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus
    @returns items 1 1
    @returns requests 0 0
    @scrapes name subs
    """
    # We need the div that is 'selected', i.e. contains gs_sel as a css class
    title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()'
    item = ItemLoader(item=CategoryItem(), response=response)
    title = response.xpath(title_xp).extract_first()
    item.add_value('name', title)
    subs = []
    for sub in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'):
        s = {'name': sub.xpath('text()').extract_first()}
        rel_url = sub.xpath('@href').extract_first()
        s['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0]
        subs.append(s)
        req = Request(urljoin(response.url, rel_url), callback=self.parse_item)
        req.meta['parent'] = title
        yield req
    item.add_value('subs', subs)
    yield item.load_item()
Example 15: parse_item
# Required import: from scrapy.loader import ItemLoader
# Method used: ItemLoader.add_value
def parse_item(self, response):
    """
    This function parses a property page.
    @url http://localhost:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    l = ItemLoader(item=PropertiesItem(), response=response)
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title))
    l.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join())
    l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip))
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))
    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()