

Python ItemLoader.default_input_processor Method Code Examples

This article collects typical usage examples of the Python method scrapy.loader.ItemLoader.default_input_processor. If you have been wondering what exactly default_input_processor does, or how to use it in your own spiders, the curated examples below should help. You can also browse further usage examples of scrapy.loader.ItemLoader itself.


Seven code examples of the ItemLoader.default_input_processor method are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python samples.
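
Before diving into the examples, here is a minimal, self-contained sketch of what these two loader attributes do (Python 3; the DemoItem class is made up for illustration, and in older Scrapy the processors are imported from scrapy.loader.processors rather than itemloaders.processors):

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, Join

class DemoItem(scrapy.Item):
    # hypothetical single-field item, just for this demo
    title = scrapy.Field()

loader = ItemLoader(item=DemoItem())
# input processor: applied to every value as it enters any field
loader.default_input_processor = MapCompose(str.strip)
# output processor: collapses each field's collected values on load_item()
loader.default_output_processor = Join()

loader.add_value('title', ['  Hello ', ' World  '])
print(loader.load_item())  # {'title': 'Hello World'}

These defaults apply to every field; example 6 below shows how a single field can override them.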

Example 1: parse

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse(self, response):
        match = re.search('/displaySeminarList/', response.url)

        if match:
            urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url, self.parse)
        else:
            table = response.xpath(self.seminar_list_xpath)
            corpId = parse_qs(urlparse(response.url).query)['corpId']
            for semi in table:
                loader = ItemLoader(SeminarItem(), semi)
                loader.default_input_processor = MapCompose(unicode.strip)
                loader.default_output_processor = Join()
                loader.add_value('companyid', corpId)
                loader.add_xpath('name', '//div[@id="headerWrap"]//h3/text()')
                loader.add_xpath('date', './/td[@class="date"]/text()', re=r'\d+/\d+/\d+')
                loader.add_xpath('time', './/td[@class="time"]/text()')
                loader.add_xpath('area', './/td[@class="area"]/text()')
                loader.add_xpath('place', './/td[@class="place"]/text()')
                loader.add_xpath('loc_n', './/td[@class="place"]//a', re=r'mycom_loc\|(\d+/\d+/\d+\.\d+),\d+/\d+/\d+\.\d+')
                loader.add_xpath('loc_e', './/td[@class="place"]//a', re=r'mycom_loc\|\d+/\d+/\d+\.\d+,(\d+/\d+/\d+\.\d+)')
                loader.add_xpath('target', './/td[@class="target"]/text()')
                yield loader.load_item()
Author: whirlp00l | Project: Seminavi | Lines: 27 | Source: seminar.py
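
Example 1 leans on the re= keyword of add_xpath: the regex runs on each string the XPath extracts, and when the pattern contains one capture group only the group's text is kept, all before the input processor sees the value. A minimal sketch of the same mechanism through Selector.re (the HTML snippet is made up):

from scrapy.selector import Selector

sel = Selector(text='<td class="date">Date: 2015/10/21 (Wed)</td>')
# with no capture group, .re() returns the full matches as a list
print(sel.xpath('//td[@class="date"]/text()').re(r'\d+/\d+/\d+'))  # ['2015/10/21']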

Example 2: get_player_info

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def get_player_info(self, response):
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # extract() always returns a list, so guard against an empty
        # result instead of indexing into it blindly
        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        if number_and_position:
            number, position = number_and_position[0].split()[:2]
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()
Author: AncillaryStats | Project: AS-Scrapers | Lines: 27 | Source: nfl_team_rosters.py
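
The list-guard dance in example 2 is exactly what SelectorList.extract_first() (or .get() in newer Scrapy) is for: it returns the first match, or a default, instead of a list. A one-line sketch against made-up HTML:

from scrapy.selector import Selector

sel = Selector(text='<li>12 QB</li>')
print(sel.xpath('//li/text()').extract_first(default=''))  # '12 QB'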

Example 3: parse_depth_chart

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse_depth_chart(self, response):
        loader = ItemLoader(item=NFL_Team_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
        loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')

        yield loader.load_item()
Author: AncillaryStats | Project: AS-Scrapers | Lines: 11 | Source: nfl_team_info_spider.py

Example 4: parse_auction_item

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse_auction_item(self, response):
        loader = ItemLoader(AuctionItems(), response=response)

        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()

        for field, xpath in auction_item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
Author: fredriksoderberg | Project: auction-analysis | Lines: 14 | Source: auction_item_spider.py
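
Examples 4 and 5 chain two functions inside MapCompose, and the chaining rule is worth spelling out: each function is applied to every value, and when a function returns a list, the next function is applied to each element of that list. A sketch of this processor in isolation (replace_escape_chars comes from w3lib, which Scrapy depends on):

from itemloaders.processors import MapCompose  # scrapy.loader.processors in older Scrapy
from w3lib.html import replace_escape_chars

proc = MapCompose(lambda v: v.split(), replace_escape_chars)
# split() fans one string out into tokens; each token then passes through
# replace_escape_chars, which strips stray \n, \t and \r characters
print(proc('  high\tbid:\n $42  '))  # ['high', 'bid:', '$42']

Combined with the Join() output processor used in both examples, those tokens are reassembled into whitespace-normalized text such as 'high bid: $42'.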

Example 5: parse_bids

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse_bids(self, response):
        selector = Selector(response)

        # Selector.select() is a deprecated alias of .xpath()
        for bid in selector.xpath(self.bid_list_xpath):
            loader = ItemLoader(BidItems(), selector=bid)

            loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
            loader.default_output_processor = Join()

            for field, xpath in auction_bid_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
Author: fredriksoderberg | Project: auction-analysis | Lines: 17 | Source: auction_bid_spider.py
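
The structural difference between examples 4 and 5 is how the loader is constructed: response= evaluates XPaths against the whole page, while selector=bid scopes each loader's relative XPaths to a single row, which is what yields one item per bid. A runnable sketch of the row-scoped pattern (the HTML and field names are made up):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.selector import Selector

class BidItem(scrapy.Item):
    bidder = scrapy.Field()  # hypothetical stand-in for the real BidItems fields

html = ('<table><tr class="bid"><td class="bidder">alice</td></tr>'
        '<tr class="bid"><td class="bidder">bob</td></tr></table>')
for row in Selector(text=html).xpath('//tr[@class="bid"]'):
    loader = ItemLoader(item=BidItem(), selector=row)
    # the leading './/' makes the query relative to this one row
    loader.add_xpath('bidder', './/td[@class="bidder"]/text()')
    print(loader.load_item())  # {'bidder': ['alice']}, then {'bidder': ['bob']}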

Example 6: parse

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse(self, response):
        def strip_dollar(x):
            return x.strip('$')

        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                        '//*[@id="depart-container"]/div[2]/div[1]/div[@style="width: 0%;"]')))
        except TimeoutException:
            print 'Page load timed out'

        # keep clicking the "load more" button until it stops appearing
        while True:
            try:
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                                '//*[@id="depart-container"]/div/div/div/button')))
                except TimeoutException:
                    break

                next_button = self.driver.find_element_by_xpath(
                    '//*[@id="depart-container"]/div/div/div/button')
                next_button.click()

            except ElementNotVisibleException:
                break

        for trips in Selector(
                text=self.driver.page_source).xpath(self.trips_list_xpath):
            loader = ItemLoader(BusTrip(), selector=trips)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # field-specific input processor: only 'price' gets strip_dollar
            loader.price_in = MapCompose(strip_dollar)

            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            dateoftrip = str(response.url).split("/")[-1]
            loader.add_value('dateoftrip', dateoftrip.decode('unicode-escape'))
            yield loader.load_item()
Author: krishnbx | Project: wanderu-scraper | Lines: 49 | Source: wanderu_spider.py
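
Example 6 adds one twist to the shared pattern: loader.price_in = MapCompose(strip_dollar) installs a field-specific input processor, and an attribute named <field>_in takes precedence over default_input_processor for that field (likewise <field>_out for output). The same override can be declared once on a loader subclass; a hedged sketch with made-up names:

from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, Join

class TripLoader(ItemLoader):
    # defaults for every field...
    default_input_processor = MapCompose(str.strip)
    default_output_processor = Join()
    # ...overridden for 'price' only, equivalent to the instance-level
    # loader.price_in assignment in the example above
    price_in = MapCompose(str.strip, lambda v: v.strip('$'))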

Example 7: parse_answers

# Required import: from scrapy.loader import ItemLoader [as alias]
# Or equivalently: from scrapy.loader.ItemLoader import default_input_processor [as alias]
    def parse_answers(self, response):
        # use selector to extract answers
        selector = Selector(response)

        # iterate over answers
        for answer in selector.xpath(self.answers_list_xpath):
            loader = ItemLoader(item=ZhihuAnswer(), selector=answer)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            item = loader.load_item()

            # convert the full text of answer into html
            item["answer"] = item["answer"].encode('ascii', 'xmlcharrefreplace')

            # if summary has image, convert it to html
            if "summary_img" in item:
                item["summary_img"] = item["summary_img"].encode('ascii', 'xmlcharrefreplace')
            else:
                item['summary_img'] = ""

            # change vote to integer
            item["vote"] = int(item["vote"])

            # in case of anonymous authors
            if "author" not in item:
                item["author"] = u'匿名用户'  # "anonymous user"

            # complete links
            item["question_link"] = u"http://www.zhihu.com" + item["question_link"]

            if "author_link" in item:
                item["author_link"] = u"http://www.zhihu.com" + item["author_link"]
            else:
                item["author_link"] = ""

            # add the date when scraped
            item["date"] = date.today()

            yield item
Author: naity | Project: zhihu_scraper | Lines: 48 | Source: zhihu_spider.py
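
A closing note on the encode('ascii', 'xmlcharrefreplace') calls in example 7: that error handler rewrites every non-ASCII character as an XML numeric entity, which keeps the answer text HTML-safe. In isolation (Python 2 shown to match the example; in Python 3, encode() returns bytes that you would decode back to str):

print u'知乎'.encode('ascii', 'xmlcharrefreplace')  # &#30693;&#20046;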


Note: the scrapy.loader.ItemLoader.default_input_processor examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any use or redistribution must follow each project's license. Please do not reproduce without permission.