

Python scrapy.loader code examples

This article collects typical usage examples of scrapy.loader in Python (a module, used in these examples mainly through its ItemLoader class). If you are wondering what scrapy.loader is for, how to use it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the scrapy package that the module belongs to.


The following presents 14 code examples of scrapy.loader, sorted by popularity by default.
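
All of the examples follow the same basic pattern: create an ItemLoader (from scrapy.loader), populate it with add_css/add_xpath/add_value, and emit the collected data with load_item(). The minimal sketch below shows that pattern in isolation; the MinimalItem class, the spider name, and the selectors are placeholders for illustration and do not come from any of the projects cited below.

import scrapy
from scrapy.loader import ItemLoader

class MinimalItem(scrapy.Item):
    # placeholder fields, for illustration only
    text = scrapy.Field()
    tags = scrapy.Field()

class MinimalSpider(scrapy.Spider):
    name = "minimal"
    start_urls = ["http://quotes.toscrape.com"]  # example site used in several Scrapy tutorials

    def parse(self, response):
        for quote in response.css("div.quote"):
            # bind the loader to a sub-selector so relative CSS/XPath expressions work
            loader = ItemLoader(item=MinimalItem(), selector=quote)
            loader.add_css("text", ".text::text")
            loader.add_css("tags", ".tag::text")
            yield loader.load_item()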

Example 1: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):
        self.logger.info('Parse function called on {}'.format(response.url))
        # quotes = response.xpath("//div[@class='quote']")
        quotes = response.css('div.quote')

        for quote in quotes:
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            # pay attention to the dot .// to use relative xpath
            # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
            loader.add_css('quote_content', '.text::text')
            # loader.add_xpath('author', './/small//text()')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()
            author_url = quote.css('.author + a::attr(href)').get()
            # go to the author page and pass the current collected quote info
            yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})

        # go to Next page
        for a in response.css('li.next a'):
            yield response.follow(a, self.parse) 
Author: harrywang | Project: scrapy-tutorial | Lines of code: 22 | Source file: quotes_spider.py
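
Example 1 assumes a QuoteItem defined in the project's items.py (Example 4 below continues filling the same item on the author page). A plausible sketch of such an item follows; the field names are taken from the calls in the example code, but the processors are illustrative assumptions, not the cited project's actual definitions.

from scrapy import Item, Field
from scrapy.loader.processors import MapCompose, TakeFirst  # moved to itemloaders.processors in newer Scrapy releases

class QuoteItem(Item):
    # fields referenced by parse() above and parse_author() in Example 4;
    # the processors here are hypothetical
    quote_content = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    tags = Field()  # left as a list of tag strings
    author_name = Field(output_processor=TakeFirst())
    author_birthday = Field(output_processor=TakeFirst())
    author_bornlocation = Field(output_processor=TakeFirst())
    author_bio = Field(output_processor=TakeFirst())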

Example 2: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self,response):
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:

            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item = MediumItem(), selector = article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title','div > h3::text')
                l.add_xpath('Name','.//a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read','span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darkenlink--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps','.//button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses','.//a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item() 
Author: furas | Project: python-examples | Lines of code: 18 | Source file: main.py
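
The default_output_processor = TakeFirst() line in Example 2 makes every loaded field come out as a single value instead of a list. A small, self-contained illustration of that effect follows; the MediumItem stand-in here is hypothetical and not the cited project's definition.

from scrapy import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors in newer Scrapy

class MediumItem(Item):
    Title = Field()  # minimal stand-in for the real item

loader = ItemLoader(item=MediumItem())
loader.default_output_processor = TakeFirst()
loader.add_value('Title', ['First title', 'Second title'])
print(loader.load_item())  # {'Title': 'First title'} rather than the whole list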

Example 3: phone_parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def phone_parse(self, response):
        print("[phone_parse] response:", response)

        # get data from `parse_car_page`
        loader = response.meta['loader']

        item = response.xpath('//p/text()').extract()
        print('[phone_parse] item:', type(item), item)
        
        json_data = json.loads(item[0])
        print('[phone_parse] json:', json_data)

        number = json_data["value"].replace(" ","")
        print("'[phone_parse] number:", number)
        
        # add new data to loader
        loader.add_value('number', number)
        
        # send all data 
        yield loader.load_item()

# --- run without project and save in `output.csv` --- 
Author: furas | Project: python-examples | Lines of code: 24 | Source file: main.py

Example 4: parse_author

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse_author(self, response):
        quote_item = response.meta['quote_item']
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')
        loader.add_css('author_birthday', '.author-born-date::text')
        loader.add_css('author_bornlocation', '.author-born-location::text')
        loader.add_css('author_bio', '.author-description::text')
        yield loader.load_item() 
Author: harrywang | Project: scrapy-tutorial | Lines of code: 10 | Source file: quotes_spider.py

Example 5: main

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")

    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')

        for i in xrange(0, 10):
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath(
                'rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath(
                'title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath(
                'price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()

            total += 1
            time = time + end - start


    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time)))) 
Author: scrapy | Project: scrapy-bench | Lines of code: 39 | Source file: itemloader.py

Example 6: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]   
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 43 | Source file: textspider.py

Example 7: parse_car_page

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse_car_page(self, response):

        loader = OtomotoCarLoader(OtomotoItem(), response=response)

        property_list_map = {
            'Marka pojazdu': 'brand',
            'Model pojazdu': 'model',
            'Rok produkcji': 'year',
        }

        for params in response.css('.offer-params__item'):

            property_name = params.css('.offer-params__label::text').extract_first().strip()
            
            if property_name in property_list_map:
                css = params.css('div::text').extract_first().strip()
                
                if css == '':
                    css = params.css('a::text').extract_first().strip()

                loader.add_value(property_list_map[property_name], css)

        loader.add_css('features', '.offer-features__item::text')
        loader.add_value('url', response.url)
        
        number_id = self.parse_number(response)
        print('number_id:', len(number_id), '|', number_id)
        
        for id in number_id:
            phone_url = "https://www.otomoto.pl/ajax/misc/contact/multi_phone/" + id + '/0/'
            # use `meta=` to send data to `phone_parse`
            yield scrapy.Request(phone_url, callback=self.phone_parse, meta={'loader': loader}) 
Author: furas | Project: python-examples | Lines of code: 34 | Source file: main.py

Example 8: errorback

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 49 | Source file: textspider.py

Example 9: processURLstack

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def processURLstack(self, response):

        #get meta data from response object to revive dragged stuff
        meta = response.request.meta
        loader = meta["loader"]
        urlstack = meta["urlstack"]
        fingerprints = meta["fingerprints"]
        
        #check whether max number of webpages has been scraped for this website
        if self.site_limit != 0:
            if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
                del urlstack[:]
        
        #reorder the urlstack to scrape the most relevant urls first
        urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
            
        #check if the next url in the urlstack is valid
        while len(urlstack) > 0:
            #pop non-valid domains
            domain = self.subdomainGetter(urlstack[0])
            if domain not in self.allowed_domains:
                urlstack.pop(0)
            #pop "mailto" urls
            elif re.match(r"mailto", urlstack[0]):
                urlstack.pop(0)
            #pop unwanted filetypes
            elif urlstack[0].split(".")[-1].lower() in self.filetypes:
                urlstack.pop(0)
            #pop visited urls 
            #(potential bottleneck: Request has to be sent to generate fingerprint from)
            elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
                urlstack.pop(0)
            else:
                break

        #if the url was assessed to be valid, send out a request and callback the parse_subpage function
        #errbacks return to processURLstack
        #ALLOW ALL HTTP STATUS:
        #errors must be caught in the callback function, because requests caught by middleware break the sequence and collector items get lost
        if len(urlstack) > 0:
            yield scrapy.Request(urlstack.pop(0), meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True}, dont_filter=True, callback=self.parse_subpage, errback=self.processURLstack)
        #if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
        else:
            yield loader.load_item()
    
    
##################################################################
# PARSE SUB PAGE
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 51 | Source file: textspider.py

Example 10: errorback

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
        loader = ItemLoader(item=LinkCollector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            loader.add_value("links", "")
            loader.add_value("alias", "")
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 57 | Source file: linkspider.py

Example 11: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=LinkCollector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        #add alias if there was an initial redirect
        if self.checkRedirectDomain(response):
            loader.add_value("alias", self.subdomainGetter(response).split("www.")[-1])
        else:
            loader.add_value("alias", "") 
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])
        loader.add_value("links", "")

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 48 | Source file: linkspider.py

Example 12: errorback

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def errorback(self, failure):
        loader = ItemLoader(item=Collector())
        if failure.check(HttpError):
            response = failure.value.response
            loader.add_value("dl_slot", response.request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", response.status)
            loader.add_value("ID", response.request.meta["ID"])
            yield loader.load_item()
        elif failure.check(DNSLookupError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "DNS")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item() 
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "Timeout")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()
        else:
            request = failure.request
            loader.add_value("dl_slot", request.meta.get('download_slot'))
            loader.add_value("start_page", "")
            loader.add_value("scraped_urls", "")
            loader.add_value("redirect", [None])
            loader.add_value("scraped_text", "")
            loader.add_value("title", "")
            loader.add_value("description", "")
            loader.add_value("keywords", "")
            loader.add_value("error", "other")
            loader.add_value("ID", request.meta["ID"])
            yield loader.load_item()


##################################################################
# MAIN PARSE
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 61 | Source file: textspider.py

Example 13: parse

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def parse(self, response):

        #initialize collector item which stores the website's content and meta data
        loader = ItemLoader(item=Collector())
        loader.add_value("dl_slot", response.request.meta.get('download_slot'))
        loader.add_value("redirect", self.checkRedirectDomain(response))
        loader.add_value("start_page", response.url)
        loader.add_value("start_domain", self.subdomainGetter(response))  
        loader.add_value("scraped_urls", [response.urljoin(response.url)])
        loader.add_value("scrape_counter", 1)
        loader.add_value("scraped_text", [self.extractText(response)])
        title, description, keywords = self.extractHeader(response)
        loader.add_value("title", [title])
        loader.add_value("description", [description])
        loader.add_value("keywords", [keywords])
        loader.add_value("error", "None")
        loader.add_value("ID", response.request.meta["ID"])

        #initialize the fingerprints set which stores all fingerprints of visited websites
        fingerprints = set()
        #add the fingerprints of the start_page
        fingerprints.add(request_fingerprint(response.request))
        
        #if there was an initial redirect, the new domain is added to the allowed domains
        domain = self.subdomainGetter(response)
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
            self.refreshAllowedDomains()

        #extract all urls from the page...
        urls = response.xpath("//a/@href").extract() + response.xpath("//frame/@src").extract() + response.xpath("//frameset/@src").extract()
        #...and save them to a urlstack
        urlstack = [response.urljoin(url) for url in urls]
            
        #attach the urlstack, the loader, and the fingerprints to the response...        
        response.meta["urlstack"] = urlstack
        response.meta["loader"] = loader
        response.meta["fingerprints"] = fingerprints
        #...and send it over to the processURLstack function
        return self.processURLstack(response)
    
    
##################################################################
# PROCESS URL STACK
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 47 | Source file: textspider.py

Example 14: processURLstack

# Required module: import scrapy [as alias]
# Or: from scrapy import loader [as alias]
def processURLstack(self, response):

        #get meta data from response object to revive dragged stuff
        meta = response.request.meta
        loader = meta["loader"]
        urlstack = meta["urlstack"]
        fingerprints = meta["fingerprints"]
        
        #check whether max number of webpages has been scraped for this website
        if self.site_limit != 0:
            if loader.get_collected_values("scrape_counter")[0] >= self.site_limit:
                del urlstack[:]
        
        #reorder the urlstack to scrape the most relevant urls first
        urlstack = self.reorderUrlstack(urlstack, self.language, self.prefer_short_urls)
            
        #check if the next url in the urlstack is valid
        while len(urlstack) > 0:
            #pop non-valid domains
            domain = self.subdomainGetter(urlstack[0])
            if domain not in self.allowed_domains:
                urlstack.pop(0)
            #pop "mailto" urls
            elif re.match(r"mailto", urlstack[0]):
                urlstack.pop(0)
            #pop unwanted filetypes
            elif urlstack[0].split(".")[-1].lower() in self.filetypes:
                urlstack.pop(0)
            #pop visited urls 
            #(potential bottleneck: Request has to be sent to generate fingerprint from)
            elif request_fingerprint(scrapy.Request(urlstack[0], callback=None)) in fingerprints:
                urlstack.pop(0)
            else:
                break

        #if the url was assessed to be valid, send out a request and callback the parse_subpage function
        #errbacks return to processURLstack
        #ALLOW ALL HTTP STATUS: 
        #errors must be caught in the callback function, because requests caught by middleware break the sequence and collector items get lost
        if len(urlstack) > 0:
            yield scrapy.Request(urlstack.pop(0), meta={"loader": loader, "urlstack": urlstack, "fingerprints": fingerprints, 'handle_httpstatus_all': True}, dont_filter=True, callback=self.parse_subpage, errback=self.processURLstack)
        #if there are no urls left in the urlstack, the website was scraped completely and the item can be sent to the pipeline
        else:
            yield loader.load_item()
    
    
##################################################################
# PARSE SUB PAGE
################################################################## 
Author: datawizard1337 | Project: ARGUS | Lines of code: 51 | Source file: textspider.py


Note: The scrapy.loader examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets were selected from open-source projects contributed by various developers, and their copyright remains with the original authors. For redistribution and use, please refer to the corresponding project's license; do not reproduce without permission.