当前位置: 首页>>代码示例>>Python>>正文


Python ItemLoader.add_value方法代码示例

本文整理汇总了Python中scrapy.loader.ItemLoader.add_value方法的典型用法代码示例。如果您正苦于以下问题:Python ItemLoader.add_value方法的具体用法?Python ItemLoader.add_value怎么用?Python ItemLoader.add_value使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scrapy.loader.ItemLoader的用法示例。


在下文中一共展示了ItemLoader.add_value方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse_item

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_item(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Page-scraped fields, each cleaned by its own processor chain.
        loader.add_xpath(
            "title",
            '//*[@itemprop="name"][1]/text()',
            MapCompose(unicode.strip, unicode.title),
        )
        loader.add_xpath(
            "price",
            './/*[@itemprop="price"][1]/text()',
            MapCompose(lambda i: i.replace(",", ""), float),
            re="[,.0-9]+",
        )
        loader.add_xpath(
            "description",
            '//*[@itemprop="description"][1]/text()',
            MapCompose(unicode.strip),
            Join(),
        )
        loader.add_xpath(
            "address",
            '//*[@itemtype="http://schema.org/Place"][1]/text()',
            MapCompose(unicode.strip),
        )
        loader.add_xpath(
            "image_urls",
            '//*[@itemprop="image"][1]/@src',
            MapCompose(lambda i: urlparse.urljoin(response.url, i)),
        )

        # Housekeeping fields describing the crawl itself.
        loader.add_value("url", response.url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", datetime.datetime.now())

        return loader.load_item()
开发者ID:semurat,项目名称:scrapybook,代码行数:33,代码来源:manual.py

示例2: parse_link_page

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
def parse_link_page(response):
    """Yield a RedditPostItem for every link post on a listing page.

    Posts with at least one comment also trigger a follow-up request
    for the comment thread, carrying the item along in request meta.
    """
    for post in response.xpath('//div[@data-type="link"]'):
        loader = ItemLoader(RedditPostItem(), selector=post)
        entry_xp = './div[contains(@class, "entry")]'
        title_xp = entry_xp + '/p[@class="title"]'
        tagline_xp = entry_xp + '/p[@class="tagline"]'
        buttons_xp = entry_xp + '/ul'

        loader.add_xpath('title', title_xp + '/a/text()')
        loader.add_xpath('link', title_xp + '/a/@href')
        loader.add_xpath('poster', tagline_xp + '/a[contains(@class, "author")]/text()')
        loader.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        loader.add_xpath('number_of_comments', buttons_xp + '//a[contains(@class, "comments")]/text()')
        loader.add_xpath('comments_link', buttons_xp + '//a[contains(@class, "comments")]/@href')
        loader.add_xpath('subreddit', './@data-subreddit')
        loader.add_xpath('post_timestamp', tagline_xp + '/time/@datetime')
        loader.add_value('scrape_timestamp', datetime.datetime.now())

        item = loader.load_item()
        item["comments"] = []
        # Follow the comment thread only when the post has comments.
        if item["number_of_comments"] > 0:
            yield scrapy.Request(item["comments_link"]+"?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        yield loader.load_item()

示例3: parse

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse(self, response):
        """Populate a RentalItem with price, address and source URL."""
        loader = ItemLoader(item=RentalItem(), response=response)
        loader.add_xpath(
            'price',
            '//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()',
        )
        # NOTE: 'adress' is the field name declared on RentalItem (sic).
        loader.add_xpath(
            'adress',
            '//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()',
        )
        loader.add_value('url', response.url)
        return loader.load_item()
开发者ID:vtisza,项目名称:MilanRentalSpyder,代码行数:9,代码来源:idealista.py

示例4: parse_item

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
		def parse_item(self,response):
			"""Load one gallery page into a MeizituItem."""
			loader = ItemLoader(item=MeizituItem(), response=response)
			loader.add_xpath('name', '//h2/a/text()')
			loader.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
			# Identity() keeps the full list of image URLs rather than one value.
			loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
			loader.add_value('url', response.url)
			return loader.load_item()
开发者ID:xtstc131,项目名称:MeiziSpider_test,代码行数:9,代码来源:meizituSpider.py

示例5: parse_image

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_image(self, response):
        """Collect gallery metadata and the image URL from one page.

        Several fields have two candidate XPaths because the value may
        appear either as plain text or inside a tag link.
        """
        logger.info("正在收集页面数据: %s ..." % response.url)
        loader = ItemLoader(item=MeiTuItem(), response=response)

        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
        loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
        loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
        loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
        loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
        loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
        loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")

        # BUG FIX: the original XPath "//p[@class='buchongshuoming'/text()]" was
        # syntactically invalid (the /text() step sat inside the predicate), so
        # add_xpath always raised ValueError — silently swallowed by the old
        # try/except — and 'desc' was never populated.
        loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")

        loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
        loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
        loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
        loader.add_value("page_url", response.url)

        yield loader.load_item()
开发者ID:ZhenbPeng,项目名称:fun_scrapy,代码行数:27,代码来源:meitu.py

示例6: parse_item

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_item(self, response):
        """Scrape a car series page: the series itself, its models on
        sale, and requests for discontinued models per model-year."""
        breadcrumb = response.css("div.path")

        series_loader = ItemLoader(item=SeriesItem(), selector=breadcrumb)
        series_loader.add_css("series_id", "a:last-child::attr(href)")
        series_loader.add_css("series_name", "a:last-child::text")

        series = series_loader.load_item()
        print(series)

        # Models that are upcoming or currently on sale.
        for car_info in response.css("div.interval01-list-cars-infor"):
            model_loader = ItemLoader(item=ModelItem(), selector=car_info)
            model_loader.add_css("model_id", "a::attr(href)")
            model_loader.add_css("model_name", "a::text")
            model_loader.add_value("series_id", series['series_id'])
            model_loader.add_value("series_name", series['series_name'])
            yield model_loader.load_item()

        # Discontinued models live behind an AJAX endpoint, one request
        # per available model-year.
        url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"
        for year in response.css(".dropdown-content a::attr(data)").extract():
            qs = {"y": year, "s": series["series_id"]}
            yield Request(url + "?" + urlencode(qs), self.stop_sale)
开发者ID:zhanqh,项目名称:baidu_ife,代码行数:34,代码来源:automobile.py

示例7: parse_content_page

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_content_page(self, response):
        """Scrape one article page into a LuliItem.

        Handles client-side redirect pages by fetching the target URL
        synchronously and re-wrapping it as an HtmlResponse so the same
        selectors apply.
        """
        redirect = redirect_re.search(response.body)
        if redirect:
            import requests
            target_url = redirect.group(1)
            target_body = requests.get(target_url).content
            response = scrapy.http.HtmlResponse(target_url, body=target_body)

        loader = ItemLoader(item=LuliItem(), response=response)

        # Two CSS alternatives per field cover both page layouts.
        loader.add_css('content', 'div#articleNew > p::text')
        loader.add_css('content', 'div[itemprop="articleBody"] > p')

        loader.add_css('date', 'div#articleDate::text')
        loader.add_css('date', 'header > time[datetime]::attr(datetime)')

        loader.add_css('title', 'div#articleNew > h1::text')
        loader.add_css('title', 'h1[itemprop="headline"]::text')

        loader.add_value('url', response.url)

        yield loader.load_item()

示例8: parse_colleagues

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_colleagues(self, response, author_id):
        """Scrape the colleague list of *author_id* from a profile page.

        Yields one AuthorItem per listed colleague, one CoAuthorItem per
        (author_id, colleague) relation, and finally a Request for the
        next scheduled URL, if any.
        """
        self.logger.info('Parsing colleagues for author %s.' % author_id)

        # get all authors listed
        num_authors = 0
        for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'):
            num_authors += 1
            name_xp = './*[@class="gsc_1usr_name"]/text()'
            # BUG FIX: the original used an absolute XPath
            # ('//*[@id="gsc_ccl"]/div[1]/div[2]/h3/a/@href'), which matches the
            # FIRST author on the whole page on every iteration, so all
            # colleagues received the same id. The lookup must be relative to
            # the current div.
            id_val = urlparse.parse_qs(urlparse.urlparse(div.xpath('.//h3/a/@href').extract_first()).query)['user']
            cited_by_xp = './*[@class="gsc_1_usr_cby"]/text()'
            fos_xp = './/a[@class="gsc_co_int"]/@href' # --> ["foo", "bar",...]

            # load general author item for colleague
            co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div)
            co_auth.add_value('id', id_val)
            co_auth.add_xpath('name', name_xp)
            co_auth.add_xpath('cited', cited_by_xp)
            co_auth.add_xpath('fos', fos_xp)
            yield co_auth.load_item()

            # load co-authorship; sorted so (a, b) and (b, a) collapse to one
            relation = [author_id, id_val]
            relation.sort()
            co_rel = ItemLoader(item=CoAuthorItem(), response=response)
            co_rel.add_value('author1', relation[0])
            co_rel.add_value('author2', relation[1])
            yield co_rel.load_item()

        self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id))

        next_url = self.choose_next()

        if next_url:
            yield Request(url=next_url)

示例9: _parse

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
 def _parse(self, response):
     """Scrape the page title, all link hrefs and body text into a
     BookmarksItem, stamped with the scrape time."""
     l = ItemLoader(item=BookmarksItem(), response=response)
     l.add_xpath(u"name", u"/html/head/title")
     # BUG FIX: the original XPath had a stray trailing apostrophe
     # (u"//a/@href'"), which is invalid XPath.
     l.add_xpath(u"anchors", u"//a/@href")
     l.add_xpath(u"description", u"/html/body/text()")
     # BUG FIX: the original stored the datetime *class* object
     # (datetime.datetime); record the current timestamp instead.
     l.add_value(u"last_updated", datetime.datetime.now())
     return l.load_item()
开发者ID:Darth-Neo,项目名称:ScrapyBookmarks,代码行数:9,代码来源:links.py

示例10: parse

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
	def parse(self, response):
		"""Dispatch seminar pages.

		Listing-index pages (URL contains /displaySeminarList/) are
		crawled recursively; detail pages yield one SeminarItem per
		table row, tagged with the corpId from the query string.
		"""
		match = re.search('/displaySeminarList/',response.url)

		if match:
			# Index page: follow every seminar link it contains.
			urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
			for url in urls:
				url = response.urljoin(url)
				yield scrapy.Request(url, self.parse)
		else:
			table = response.xpath(self.seminar_list_xpath)
			corpId = parse_qs(urlparse(response.url).query)['corpId']
			# FIX: the original used enumerate() but never used the index.
			for semi in table:
				loader = ItemLoader(SeminarItem(),semi)
				loader.default_input_processor = MapCompose(unicode.strip)
				loader.default_output_processor = Join()
				loader.add_value('companyid',corpId)
				loader.add_xpath('name','//div[@id="headerWrap"]//h3/text()')
				loader.add_xpath('date','.//td[@class="date"]/text()',re='\d+\/\d+\/\d+')
				loader.add_xpath('time','.//td[@class="time"]/text()')
				loader.add_xpath('area','.//td[@class="area"]/text()')
				loader.add_xpath('place','.//td[@class="place"]/text()')
				# Latitude/longitude are embedded in a "mycom_loc|N,E" link attribute.
				loader.add_xpath('loc_n','.//td[@class="place"]//a', re='mycom_loc\|(\d+\/\d+\/\d+\.\d+)\,\d+\/\d+\/\d+\.\d+')
				loader.add_xpath('loc_e','.//td[@class="place"]//a', re='mycom_loc\|\d+\/\d+\/\d+\.\d+\,(\d+\/\d+\/\d+\.\d+)')
				loader.add_xpath('target','.//td[@class="target"]/text()')
				yield loader.load_item()
开发者ID:whirlp00l,项目名称:Seminavi,代码行数:27,代码来源:seminar.py

示例11: parse_info

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse_info(self, response):
        """Parse one job-posting page into a (JobInfoItem, ComInfoItem) pair.

        Most fields are cut out of shared <li> blocks using
        lookbehind/lookahead regexes keyed on the Chinese field labels,
        then cleaned by project processors (TakeFirstL, RemoveTagsL,
        ExtractTextL, StripBlankL, JoinL — presumably loader-processor
        wrappers; defined elsewhere in the project).
        """

        loaderJob = ItemLoader(item=JobInfoItem(), response=response)
        loaderCom = ItemLoader(item=ComInfoItem(), response=response)
        # --- job fields -------------------------------------------------
        loaderJob.add_value('url', value=response.url)
        loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL())
        loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL())
        loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|'))
        # All of the following facts live in the same <li> list; each regex
        # carves out one labeled value (e.g. 职位月薪 = monthly salary).
        divs = '//ul[@class="terminal-ul clearfix"]/li'
        loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)')
        loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)')
        # Free-text description plus two sub-slices: responsibilities
        # (岗位职责/工作职责 … up to 任职资格/岗位要求) and requirements.
        loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)')
        loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)')
        loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL())

        # --- company fields --------------------------------------------
        loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL())
        loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL())
        divs = '//div[@class="company-box"]/ul/li'
        loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司规模[:,:]).*')
        loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司性质[:,:]).*')
        loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司行业[:,:]).*')
        loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|'))
        loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(),  re=u'(?<=公司主页[:,:]).*')
        loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(),  re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)')

        # Returns a 2-tuple of populated items (job, company).
        return loaderJob.load_item(), loaderCom.load_item()
开发者ID:zuiaimenger,项目名称:chinahr-web,代码行数:35,代码来源:zhaopin_crawlSpider.py

示例12: get_player_info

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def get_player_info(self, response):
        """Scrape one player's number, position, name and team from his
        roster page into an NFL_Player_2015 item."""
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # BUG FIX: the original first called .extract()[0] (IndexError when the
        # element is missing) only to overwrite it with a second identical
        # extraction; the dead line and the commented-out duplicate add_xpath
        # calls are removed. Extract once and guard for the empty case.
        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        if number_and_position:
            parts = number_and_position[0].split()
            # Guard against a string with fewer than two tokens.
            number = parts[0] if len(parts) > 0 else ''
            position = parts[1] if len(parts) > 1 else ''
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()
开发者ID:AncillaryStats,项目名称:AS-Scrapers,代码行数:27,代码来源:nfl_team_rosters.py

示例13: parse_titles

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
 def parse_titles(self, response):
     """Collect one blog post (hub, title, date, body) into a BlogCategory."""
     item_loader = ItemLoader(item=BlogCategory(), response=response)
     # Hub name travels in the request meta from the listing page.
     item_loader.add_value('hub', response.meta['hname'])
     css_fields = (
         ('title', 'div.company_post h1 span::text'),
         ('date', 'div.published::text'),
         ('article', 'div.content::text'),
     )
     for field, selector in css_fields:
         item_loader.add_css(field, selector)
     yield item_loader.load_item()
开发者ID:kirillsavelyev,项目名称:python-tceh,代码行数:9,代码来源:habraspider.py

示例14: parse

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
    def parse(self, response):
        """ This function parses the categories and its subcategories on a gscholar web page.

        @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus
        @returns items 1 1
        @returns requests 0 0
        @scrapes name subs
        """
        # The active category is the div carrying the gs_sel css class.
        selected_title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()'
        category_name = response.xpath(selected_title_xp).extract_first()

        loader = ItemLoader(item=CategoryItem(), response=response)
        loader.add_value('name', category_name)

        # Collect every subcategory link, recording its name and 'vq'
        # query parameter, and follow each link with parse_item.
        subcategories = []
        for link in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'):
            entry = {'name' : link.xpath('text()').extract_first()}
            rel_url = link.xpath('@href').extract_first()
            entry['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0]
            subcategories.append(entry)
            request = Request(urljoin(response.url,rel_url), callback=self.parse_item)
            request.meta['parent'] = category_name
            yield request
        loader.add_value('subs', subcategories)
        yield loader.load_item()
开发者ID:enplotz,项目名称:webir2015,代码行数:29,代码来源:categories.py

示例15: parse_item

# 需要导入模块: from scrapy.loader import ItemLoader [as 别名]
# 或者: from scrapy.loader.ItemLoader import add_value [as 别名]
	def parse_item(self, response):
		"""
		This function parses a property page.

		@url http://localhost:9312/properties/property_000000.html
		@returns items 1
		@scrapes title price description address image_urls
		@scrapes url project spider server date
		"""
		loader = ItemLoader(item=PropertiesItem(), response=response)

		# Fields scraped from the page markup.
		loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
				MapCompose(unicode.strip, unicode.title))
		loader.add_xpath('price', '//*[@itemprop="price"][1]/text()',
				MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+')
		loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
				MapCompose(unicode.strip), Join())
		loader.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()',
				MapCompose(unicode.strip))
		loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
				MapCompose(lambda i: urlparse.urljoin(response.url, i)))

		# Housekeeping fields describing the crawl itself.
		loader.add_value('url', response.url)
		loader.add_value('project', self.settings.get('BOT_NAME'))
		loader.add_value('spider', self.name)
		loader.add_value('server', socket.gethostname())
		loader.add_value('date', datetime.datetime.now())
		return loader.load_item()
开发者ID:leozhao0709,项目名称:scraping,代码行数:27,代码来源:noncelogin.py


注:本文中的scrapy.loader.ItemLoader.add_value方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。