This article collects typical usage examples of scrapy.loader in Python: what the module is for, how it is used, and what real code that uses it looks like. The curated snippets below should help; you can also explore the wider scrapy package that the module belongs to.
The listing shows 14 code examples of scrapy.loader, sorted by popularity by default.
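Before the examples, here is a minimal, self-contained sketch of the scrapy.loader.ItemLoader workflow the snippets below rely on. The ProductItem class, the URL, and the HTML are invented purely for illustration; newer Scrapy versions expose the processors through the itemloaders package instead of scrapy.loader.processors.

import scrapy
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst  # itemloaders.processors in newer Scrapy

class ProductItem(scrapy.Item):  # hypothetical item, for illustration only
    name = scrapy.Field()
    price = scrapy.Field()

html = b"<html><body><h1> Widget </h1><p class='price'>9.99</p></body></html>"
response = HtmlResponse(url="http://example.com", body=html, encoding="utf-8")

loader = ItemLoader(item=ProductItem(), response=response)
loader.default_output_processor = TakeFirst()          # keep only the first value per field
loader.add_xpath("name", "//h1/text()", MapCompose(str.strip))
loader.add_css("price", ".price::text")
item = loader.load_item()                              # -> {'name': 'Widget', 'price': '9.99'}
print(item)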
Example 1: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
    self.logger.info('Parse function called on {}'.format(response.url))
    # quotes = response.xpath("//div[@class='quote']")
    quotes = response.css('div.quote')
    for quote in quotes:
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        # pay attention to the dot .// to use a relative xpath
        # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
        loader.add_css('quote_content', '.text::text')
        # loader.add_xpath('author', './/small//text()')
        loader.add_css('tags', '.tag::text')
        quote_item = loader.load_item()
        author_url = quote.css('.author + a::attr(href)').get()
        # go to the author page and pass the currently collected quote info
        yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})
    # go to the next page
    for a in response.css('li.next a'):
        yield response.follow(a, self.parse)
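The loader above expects a QuoteItem class that is not part of this listing. A plausible minimal definition, inferred from the fields used in examples 1 and 4 (the processors are an assumption, not taken from the original project), could look like this:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst

class QuoteItem(scrapy.Item):
    quote_content = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    tags = scrapy.Field(input_processor=MapCompose(str.strip))
    author_name = scrapy.Field(output_processor=TakeFirst())
    author_birthday = scrapy.Field(output_processor=TakeFirst())
    author_bornlocation = scrapy.Field(output_processor=TakeFirst())
    author_bio = scrapy.Field(output_processor=TakeFirst())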
Example 2: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
    print('url:', response.url)
    articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
    for article in articles:
        if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
            l = ItemLoader(item=MediumItem(), selector=article)
            l.default_output_processor = scrapy.loader.processors.TakeFirst()
            l.add_css('Title', 'div > h3::text')
            l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
            l.add_css('Read', 'span::attr(title)')
            l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darkenlink--accent u-accentColor--textNormal"]/text()')
            l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
            l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
            l.add_value('Page', response.url)
            yield l.load_item()
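Instead of assigning l.default_output_processor on every instance, the same TakeFirst behaviour can be declared once on an ItemLoader subclass; a short sketch (the MediumLoader name is made up):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class MediumLoader(ItemLoader):
    default_output_processor = TakeFirst()

# then, inside the loop above:
# l = MediumLoader(item=MediumItem(), selector=article)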
Example 3: phone_parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def phone_parse(self, response):
    print("[phone_parse] response:", response)
    # get data from `parse_car_page`
    loader = response.meta['loader']
    item = response.xpath('//p/text()').extract()
    print('[phone_parse] item:', type(item), item)
    json_data = json.loads(item[0])
    print('[phone_parse] json:', json_data)
    number = json_data["value"].replace(" ", "")
    print("[phone_parse] number:", number)
    # add the new data to the loader
    loader.add_value('number', number)
    # send all data
    yield loader.load_item()
# --- run without project and save in `output.csv` ---
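The trailing comment points at running the spider standalone and exporting to output.csv. One common way to do that, sketched here under the assumption of Scrapy >= 2.1 and a spider class named OtomotoSpider (both assumptions, not taken from the original project), is CrawlerProcess with the FEEDS setting:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {'output.csv': {'format': 'csv'}},  # older Scrapy versions use FEED_URI / FEED_FORMAT instead
})
process.crawl(OtomotoSpider)  # hypothetical spider class containing parse_car_page / phone_parse
process.start()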
Example 4: parse_author
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse_author(self, response):
    quote_item = response.meta['quote_item']
    loader = ItemLoader(item=quote_item, response=response)
    loader.add_css('author_name', '.author-title::text')
    loader.add_css('author_birthday', '.author-born-date::text')
    loader.add_css('author_bornlocation', '.author-born-location::text')
    loader.add_css('author_bio', '.author-description::text')
    yield loader.load_item()
Example 5: main
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
# imports needed to run this as a standalone script (the ItemloaderItem import
# path depends on the project and is only a guess)
import tarfile
import click
from timeit import default_timer as timer
from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader
# from <project>.items import ItemloaderItem

def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')
        for i in range(0, 10):
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath(
                'rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath(
                'title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath(
                'price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()
            total += 1
            time = time + end - start
    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)
    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format(float(total / time)))
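The benchmark above loads fields into an ItemloaderItem defined elsewhere in the project; judging only from the add_xpath/add_css/add_value calls, a minimal compatible definition might be:

import scrapy

class ItemloaderItem(scrapy.Item):
    rating = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    category = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()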
Example 6: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
    # initialize the collector item which stores the website's content and meta data
    loader = ItemLoader(item=Collector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("scraped_text", [self.extractText(response)])
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])
    # initialize the fingerprints set which stores the fingerprints of all visited pages
    fingerprints = set()
    # add the fingerprint of the start_page
    fingerprints.add(request_fingerprint(response.request))
    # if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()
    # extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    # ...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]
    # attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints
    # ...and send it over to the processURLstack function
    return self.processURLstack(response)
##################################################################
# PROCESS URL STACK
##################################################################
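The Collector item used in this and several of the following examples is not shown in the listing. A minimal definition consistent with the add_value calls (field names inferred from the code, no processors) would be:

import scrapy

class Collector(scrapy.Item):
    dl_slot = scrapy.Field()
    redirect = scrapy.Field()
    start_page = scrapy.Field()
    start_domain = scrapy.Field()
    scraped_urls = scrapy.Field()
    scrape_counter = scrapy.Field()
    scraped_text = scrapy.Field()
    error = scrapy.Field()
    ID = scrapy.Field()
    title = scrapy.Field()        # used in examples 12 and 13
    description = scrapy.Field()  # used in examples 12 and 13
    keywords = scrapy.Field()     # used in examples 12 and 13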
Example 7: parse_car_page
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse_car_page(self, response):
    loader = OtomotoCarLoader(OtomotoItem(), response=response)
    property_list_map = {
        'Marka pojazdu': 'brand',
        'Model pojazdu': 'model',
        'Rok produkcji': 'year',
    }
    for params in response.css('.offer-params__item'):
        property_name = params.css('.offer-params__label::text').extract_first().strip()
        if property_name in property_list_map:
            css = params.css('div::text').extract_first().strip()
            if css == '':
                css = params.css('a::text').extract_first().strip()
            loader.add_value(property_list_map[property_name], css)
    loader.add_css('features', '.offer-features__item::text')
    loader.add_value('url', response.url)
    number_id = self.parse_number(response)
    print('number_id:', len(number_id), '|', number_id)
    for id in number_id:
        phone_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + id + '/0/'
        # use `meta=` to send data to `phone_parse`
        yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader})
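OtomotoCarLoader and OtomotoItem come from the scraper's own modules and are not reproduced here; a plausible sketch based on how they are used above (the processor choices are an assumption) could be:

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity, MapCompose, TakeFirst

class OtomotoItem(scrapy.Item):
    brand = scrapy.Field()
    model = scrapy.Field()
    year = scrapy.Field()
    features = scrapy.Field()
    url = scrapy.Field()
    number = scrapy.Field()

class OtomotoCarLoader(ItemLoader):
    default_item_class = OtomotoItem
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    features_out = Identity()  # keep the full list of features instead of only the first one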
Example 8: errorback
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
    loader = ItemLoader(item=Collector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
##################################################################
# MAIN PARSE
##################################################################
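An errorback like the one above only runs if it is registered on the outgoing requests. For context, here is a sketch of how that wiring usually looks (the spider skeleton, the ID scheme, and the start URL are assumptions, not part of the original project), together with the imports the failure checks require:

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

class CollectorSpider(scrapy.Spider):  # hypothetical spider, for illustration only
    name = "collector"
    start_urls = ["https://example.com/"]

    def start_requests(self):
        for ID, url in enumerate(self.start_urls):
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.errorback,  # HTTP / DNS / timeout failures end up in errorback()
                meta={"ID": ID},
            )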
Example 9: processURLstack
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def processURLstack(self, response):
    # get meta data from the response object to revive the dragged-along state
    meta = response.request.meta
    loader = meta["loader"]
    urlstack = meta["urlstack"]
    fingerprints = meta["fingerprints"]
    # check whether the max number of webpages has been scraped for this website
    if self.site_limit != 0:
        if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
            del urlstack[:]
    # reorder the urlstack to scrape the most relevant urls first
    urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
    # check if the next url in the urlstack is valid
    while len(urlstack) > 0:
        # pop non-valid domains
        domain = self.subdomainGetter(urlstack[0])
        if domain not in self.allowed_domains:
            urlstack.pop(0)
        # pop "mailto" urls
        elif re.match(r"mailto", urlstack[0]):
            urlstack.pop(0)
        # pop unwanted filetypes
        elif urlstack[0].split(".")[-1].lower() in self.filetypes:
            urlstack.pop(0)
        # pop visited urls
        # (potential bottleneck: a Request has to be built just to generate a fingerprint from it)
        elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
            urlstack.pop(0)
        else:
            break
    # if the url was assessed to be valid, send out a request with parse_subpage as callback
    # errbacks return to processURLstack
    # ALLOW ALL HTTP STATUS:
    # errors must be caught in the callback function, because requests caught by middleware break the sequence and collector items get lost
    if len(urlstack) > 0:
        yield scrapy.Request(
            urlstack.pop(0),
            meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True},
            dont_filter=True,
            callback=self.parse_subpage,
            errback=self.processURLstack,
        )
    # if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
    else:
        yield loader.load_item()
##################################################################
# PARSE SUB PAGE
##################################################################
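request_fingerprint, used above to detect already-visited urls, lives in scrapy.utils.request (newer Scrapy versions replace it with a RequestFingerprinter component). A tiny standalone sketch of the de-duplication idea:

from scrapy import Request
from scrapy.utils.request import request_fingerprint

seen = set()
for url in ["http://example.com/a", "http://example.com/a", "http://example.com/b"]:
    fp = request_fingerprint(Request(url))
    if fp in seen:
        continue  # duplicate of an already scheduled request
    seen.add(fp)
    print("would schedule:", url)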
Example 10: errorback
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
    loader = ItemLoader(item=LinkCollector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        loader.add_value("links", "")
        loader.add_value("alias", "")
        yield loader.load_item()
##################################################################
# MAIN PARSE
##################################################################
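LinkCollector, used in this and the next example, extends the Collector idea with link information; a minimal definition inferred from the add_value calls would be:

import scrapy

class LinkCollector(scrapy.Item):
    dl_slot = scrapy.Field()
    redirect = scrapy.Field()
    alias = scrapy.Field()
    start_page = scrapy.Field()
    start_domain = scrapy.Field()
    scraped_urls = scrapy.Field()
    scrape_counter = scrapy.Field()
    scraped_text = scrapy.Field()
    error = scrapy.Field()
    ID = scrapy.Field()
    links = scrapy.Field()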
Example 11: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
    # initialize the collector item which stores the website's content and meta data
    loader = ItemLoader(item=LinkCollector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    # add an alias if there was an initial redirect
    if self.checkRedirectDomain(response):
        loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
    else:
        loader.add_value("alias", "")
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])
    loader.add_value("links", "")
    # initialize the fingerprints set which stores the fingerprints of all visited pages
    fingerprints = set()
    # add the fingerprint of the start_page
    fingerprints.add(request_fingerprint(response.request))
    # if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()
    # extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    # ...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]
    # attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints
    # ...and send it over to the processURLstack function
    return self.processURLstack(response)
##################################################################
# PROCESS URL STACK
##################################################################
Example 12: errorback
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
    loader = ItemLoader(item=Collector())
    if failure.check(HttpError):
        response = failure.value.response
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", response.status)
        loader.add_value("ID", response.request.meta["ID"])
        yield loader.load_item()
    elif failure.check(DNSLookupError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "DNS")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "Timeout")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
    else:
        request = failure.request
        loader.add_value("dl_slot", request.meta.get('download_slot'))
        loader.add_value("start_page", "")
        loader.add_value("scraped_urls", "")
        loader.add_value("redirect", [None])
        loader.add_value("scraped_text", "")
        loader.add_value("title", "")
        loader.add_value("description", "")
        loader.add_value("keywords", "")
        loader.add_value("error", "other")
        loader.add_value("ID", request.meta["ID"])
        yield loader.load_item()
##################################################################
# MAIN PARSE
##################################################################
Example 13: parse
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
    # initialize the collector item which stores the website's content and meta data
    loader = ItemLoader(item=Collector())
    loader.add_value("dl_slot", response.request.meta.get('download_slot'))
    loader.add_value("redirect", self.checkRedirectDomain(response))
    loader.add_value("start_page", response.url)
    loader.add_value("start_domain", self.subdomainGetter(response))
    loader.add_value("scraped_urls", [response.urljoin(response.url)])
    loader.add_value("scrape_counter", 1)
    loader.add_value("scraped_text", [self.extractText(response)])
    title, description, keywords = self.extractHeader(response)
    loader.add_value("title", [title])
    loader.add_value("description", [description])
    loader.add_value("keywords", [keywords])
    loader.add_value("error", "None")
    loader.add_value("ID", response.request.meta["ID"])
    # initialize the fingerprints set which stores the fingerprints of all visited pages
    fingerprints = set()
    # add the fingerprint of the start_page
    fingerprints.add(request_fingerprint(response.request))
    # if there was an initial redirect, the new domain is added to the allowed domains
    domain = self.subdomainGetter(response)
    if domain not in self.allowed_domains:
        self.allowed_domains.append(domain)
        self.refreshAllowedDomains()
    # extract all urls from the page...
    urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
    # ...and save them to a urlstack
    urlstack = [response.urljoin(url) for url in urls]
    # attach the urlstack, the loader, and the fingerprints to the response...
    response.meta["urlstack"] = urlstack
    response.meta["loader"] = loader
    response.meta["fingerprints"] = fingerprints
    # ...and send it over to the processURLstack function
    return self.processURLstack(response)
##################################################################
# PROCESS URL STACK
##################################################################
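extractHeader() is a helper of the spider that is not included in this listing; judging from how its return value is unpacked above, it likely returns the page title plus the description and keywords meta tags, roughly like this (a guess for illustration, not the original implementation):

def extractHeader(self, response):
    title = " ".join(response.xpath("//title/text()").extract())
    description = " ".join(response.xpath("//meta[@name='description']/@content").extract())
    keywords = " ".join(response.xpath("//meta[@name='keywords']/@content").extract())
    return title, description, keywords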
Example 14: processURLstack
# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def processURLstack(self, response):
    # get meta data from the response object to revive the dragged-along state
    meta = response.request.meta
    loader = meta["loader"]
    urlstack = meta["urlstack"]
    fingerprints = meta["fingerprints"]
    # check whether the max number of webpages has been scraped for this website
    if self.site_limit != 0:
        if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
            del urlstack[:]
    # reorder the urlstack to scrape the most relevant urls first
    urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
    # check if the next url in the urlstack is valid
    while len(urlstack) > 0:
        # pop non-valid domains
        domain = self.subdomainGetter(urlstack[0])
        if domain not in self.allowed_domains:
            urlstack.pop(0)
        # pop "mailto" urls
        elif re.match(r"mailto", urlstack[0]):
            urlstack.pop(0)
        # pop unwanted filetypes
        elif urlstack[0].split(".")[-1].lower() in self.filetypes:
            urlstack.pop(0)
        # pop visited urls
        # (potential bottleneck: a Request has to be built just to generate a fingerprint from it)
        elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
            urlstack.pop(0)
        else:
            break
    # if the url was assessed to be valid, send out a request with parse_subpage as callback
    # errbacks return to processURLstack
    # ALLOW ALL HTTP STATUS:
    # errors must be caught in the callback function, because requests caught by middleware break the sequence and collector items get lost
    if len(urlstack) > 0:
        yield scrapy.Request(
            urlstack.pop(0),
            meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True},
            dont_filter=True,
            callback=self.parse_subpage,
            errback=self.processURLstack,
        )
    # if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
    else:
        yield loader.load_item()
##################################################################
# PARSE SUB PAGE
##################################################################