本文整理汇总了Python中scrapy.loader.ItemLoader方法的典型用法代码示例。如果您正苦于以下问题:Python loader.ItemLoader方法的具体用法?Python loader.ItemLoader怎么用?Python loader.ItemLoader使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类scrapy.loader
的用法示例。
在下文中一共展示了loader.ItemLoader方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Collect every quote on the page, follow each author link, then paginate."""
    self.logger.info('Parse function called on {}'.format(response.url))
    for quote in response.css('div.quote'):
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css('quote_content', '.text::text')
        loader.add_css('tags', '.tag::text')
        quote_item = loader.load_item()
        author_url = quote.css('.author + a::attr(href)').get()
        # Visit the author page, carrying the partially-filled item along.
        yield response.follow(author_url, self.parse_author,
                              meta={'quote_item': quote_item})
    # Continue crawling through the "Next" pagination link, if present.
    for next_link in response.css('li.next a'):
        yield response.follow(next_link, self.parse)
示例2: parse_post
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse_post(self, response):
    """Extract one Facebook post; chase the reactions page when reactions exist."""
    post_loader = ItemLoader(item=FbcrawlItem(), response=response,
                             parent=response.meta['item'])
    post_loader.context['lang'] = self.lang
    post_loader.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()")
    post_loader.add_xpath('shared_from', '//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()')
    post_loader.add_xpath('text', '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()')
    # Older posts may carry no reaction counts at all.
    check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get()
    if check_reactions:
        post_loader.add_xpath('reactions', "//a[contains(@href,'reaction/profile')]/div/div/text()")
        reaction_links = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href")
        reactions_url = response.urljoin(reaction_links[0].extract())
        # Hand the still-open loader to the reactions callback via meta.
        yield scrapy.Request(reactions_url, callback=self.parse_reactions,
                             meta={'item': post_loader})
    else:
        yield post_loader.load_item()
示例3: scrape_product
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def scrape_product(self, response):
    """Populate a MyItem from the product page and return it."""
    loader = ItemLoader(item=MyItem(), response=response)
    # Strip markup from every field and keep only the first extracted value.
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()
    loader.add_css("my_field", "selector")
    return loader.load_item()
示例4: populate_item
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def populate_item(self, response):
    """Build a MySpiderItem from the current response and emit it."""
    loader = ItemLoader(item=MySpiderItem(), response=response)
    # Strip markup from every extracted field by default.
    loader.default_input_processor = MapCompose(remove_tags)
    yield loader.load_item()
# 3. PAGINATION LEVEL 1
示例5: populate_item
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def populate_item(self, response):
    """Emit the scraped item, then hand off to the paginator for the next page."""
    loader = ItemLoader(item=MySpiderItem(), response=response)
    # Strip markup from every extracted field by default.
    loader.default_input_processor = MapCompose(remove_tags)
    # Emit the populated item before requesting the next data page.
    yield loader.load_item()
    yield self.paginate(response)
# 3. PAGINATION LEVEL 2
示例6: populate_item
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def populate_item(self, response):
    """Build a MySpiderItem from the current response and emit it."""
    loader = ItemLoader(item=MySpiderItem(), response=response)
    # Strip markup from every extracted field by default.
    loader.default_input_processor = MapCompose(remove_tags)
    yield loader.load_item()
示例7: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Load a MyItem from the response using tag-stripping, first-value defaults."""
    loader = ItemLoader(item=MyItem(), response=response)
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()
    return loader.load_item()
示例8: populate_item
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def populate_item(self, response):
    """Build a MySpiderItem from the current response and emit it."""
    loader = ItemLoader(item=MySpiderItem(), response=response)
    # Strip markup from every extracted field by default.
    loader.default_input_processor = MapCompose(remove_tags)
    yield loader.load_item()
# 3. PAGINATION LEVEL 2
示例9: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Assemble a PriceItem with price/stock values plus crawl-tracking metadata."""
    loader = ItemLoader(item=PriceItem(), response=response)
    loader.default_output_processor = TakeFirst()
    loader.add_css("price", self.price_css)
    loader.add_css("stock", self.stock_css)
    loader.add_value("product_id", response.meta.get("product_id"))
    loader.add_value("cron_id", self.cron_id)
    loader.add_value("shop_id", self.shop_id)
    # Tag each record with a unique id and a capture timestamp.
    loader.add_value("item_id", str(uuid.uuid1()))
    loader.add_value("updated", str(datetime.datetime.now()))
    loader.add_value("url", response.url)
    return loader.load_item()
# 2. Updating database by calling the backend API
示例10: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """POST a form and carry the partially-loaded item to the next callback."""
    loader = ItemLoader(item=MyItem(), response=response)
    # Strip markup from every extracted field by default.
    loader.default_input_processor = MapCompose(remove_tags)
    yield FormRequest(
        "POST_URL",
        formdata={'parameter': 'p'},
        meta={'item': loader.load_item()},
        callback=self.populate_field,
    )
示例11: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Extract article metadata from each Medium post card on the page."""
    print('url:', response.url)
    articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
    for article in articles:
        # Skip cards that have no read-link (non-article blocks).
        if not article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
            continue
        loader = ItemLoader(item=MediumItem(), selector=article)
        loader.default_output_processor = scrapy.loader.processors.TakeFirst()
        loader.add_css('Title', 'div > h3::text')
        loader.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
        loader.add_css('Read', 'span::attr(title)')
        loader.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darkenlink--accent u-accentColor--textNormal"]/text()')
        loader.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
        loader.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
        loader.add_value('Page', response.url)
        yield loader.load_item()
示例12: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Scrape rank, city, country and congestion level columns and yield one
    TrafficIndexItem per table row."""
    # One XPath template, instantiated per column index.
    cell_xpath = "//div[@id='RankingPage-table']//td[{}]"
    ranks = response.xpath(cell_xpath.format(1)).getall()
    cities = response.xpath(cell_xpath.format(3)).getall()
    countries = response.xpath(cell_xpath.format(4)).getall()
    levels = response.xpath(cell_xpath.format(5)).getall()
    for rank, city, country, level in zip(ranks, cities, countries, levels):
        loader = ItemLoader(item=TrafficIndexItem())
        loader.add_value("world_rank", rank)
        loader.add_value("city", city)
        loader.add_value("country", country)
        loader.add_value("congestion_level", level)
        yield loader.load_item()
示例13: parse
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse(self, response):
    """Scrape country, date and inflation figures from the third <tbody> of the
    page and yield one InflationItem per aligned (inflation, country, date) row.

    Bug fixes vs. the original:
    - ``re.findall`` and ``str.replace`` were called directly on bs4 ``Tag``
      objects (TypeError / AttributeError); text is now taken via ``get_text()``.
    - ``isinstance(td, float)`` could never be true for a ``Tag``, so
      ``inflations`` was always empty; values are now parsed with ``float()``.
    """
    table = bs(response.body, "html.parser").findAll("tbody")[2]
    countries = [
        c["title"] for c in table.findAll("a", href=True, title=True)
    ]
    # Date cells are the ones carrying a data-sort-value attribute; keep the
    # first run of digits (the year) from each cell's text.
    dates = [
        re.findall(r'\d+', td.get_text())[0]
        for td in table.findAll("td", {"data-sort-value": True})
    ]
    inflations = []
    for td in table.findAll("td"):
        # Normalise the Unicode minus sign before attempting float conversion;
        # non-numeric cells are simply skipped.
        text = td.get_text().replace("−", "-").strip()
        try:
            inflations.append(float(text))
        except ValueError:
            continue
    for inflation, country, date in zip(
        inflations, countries, dates,
    ):
        i = ItemLoader(item=InflationItem())
        i.add_value("inflation", inflation)
        i.add_value("date", date)
        i.add_value("country", country)
        yield i.load_item()
示例14: parse_item
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def parse_item(self, response):
    """Collect every image source URL on the page into an ImageItem."""
    loader = ItemLoader(item=ImageItem(), response=response)
    loader.add_css('image_urls', 'img::attr(src)')
    return loader.load_item()
示例15: __init__
# 需要导入模块: from scrapy import loader [as 别名]
# 或者: from scrapy.loader import ItemLoader [as 别名]
def __init__(self, *args, **kwargs):
    """Populate an Event item from keyword arguments, rejecting unknown fields."""
    loader = ItemLoader(item=Event())
    for field_name, field_value in kwargs.items():
        try:
            loader.add_value(field_name, field_value)
        except KeyError:
            # ItemLoader raises KeyError for fields not declared on Event.
            raise KeyError(f'{field_name} is not a valid event field')
    self.item = loader.load_item()