

Python log.INFO Attribute Code Examples

This article collects typical usage examples of the scrapy.log.INFO attribute in Python. If you are wondering what exactly log.INFO does, how to use it, or what it looks like in real code, the curated attribute examples below may help. You can also explore further usage examples from the scrapy.log module, where this attribute is defined.


A total of 10 code examples of the log.INFO attribute are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
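Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) showing the common pattern: log.INFO is passed as the level argument to scrapy.log.msg. Note that the scrapy.log module was deprecated in Scrapy 1.0 in favor of Python's standard logging module, so the snippets on this page target older Scrapy releases.

# A minimal sketch, assuming an old (pre-1.0) Scrapy installation where scrapy.log is available.
# The function name and message text are placeholders for illustration only.
from scrapy import log

def announce_start(spider_name):
    # log.msg accepts a level keyword; log.INFO marks the message as informational
    log.msg('Spider %s started' % spider_name, level=log.INFO)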

Example 1: request_seen

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def request_seen(self, request):
        is_seen = is_request_seen(request)

        if not is_seen:
            log.msg('New URL: %s. Adding it to seen database' % request.url, log.DEBUG)
            seen = Seen(fingerprint=request_fingerprint(request),
                        url=request.url,
                        last_crawl_time=datetime.now())
            try:
                session.add(seen)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        else:
            log.msg('[seen] "%s" is seen. Skipping.' % request.url, log.INFO)

        return is_seen 
Developer: eren, Project: sozlukcrawler, Lines: 22, Source: dupefilter.py

Example 2: info

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def info(msg):
    log.msg(str(msg), level=log.INFO) 
Developer: geekan, Project: google-scholar-crawler, Lines: 4, Source: log.py

Example 3: parse

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./@data-id').extract()[0]
            baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
            date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        # The sozluk site does its pagination with JavaScript, so we cannot grab the next-page link via XPath.
        # However, the current page number and the last page number are embedded in the HTML. Use these to build
        # the URL of the next page to crawl. Hopefully SSG won't change this :(
        current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
        page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])

        current_url = response.request.url.split('?p')[0]

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s?p=%s' % (current_url, next_page)) 
Developer: eren, Project: sozlukcrawler, Lines: 42, Source: eksisozluk.py

Example 4: parse

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
            baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
            date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.css('a.yazarlink').xpath('text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_url = response.request.url.split('/sayfa')[0]

        title_re = response.xpath('//title').re(r'sayfa (\d*)')
        current_page = int(title_re[0]) if title_re else 1

        page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 2:
            yield Request('%s/sayfa/%s' % (current_url, next_page)) 
Developer: eren, Project: sozlukcrawler, Lines: 41, Source: itusozluk.py

Example 5: parse

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
            baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re("'(\d*)'")[0]
            baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
            date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
            nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
        page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
        next_page = current_page + 1

        # To build the pagination link in the next step, we need the topic (baslik) URL without the page suffix.
        # The address has the form uludagsozluk.com/k/BASLIK/10, so take the part of the path before the page number.
        url_split = urlsplit(response.request.url)
        current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))

        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s/%s' % (current_baslik_url, next_page)) 
Developer: eren, Project: sozlukcrawler, Lines: 42, Source: uludagsozluk.py

Example 6: process_request

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def process_request(self, request, spider):
        user_agent = UserAgent()
        ua = user_agent.random
        if ua:
            log.msg('Current UserAgent: ' + ua, level=log.INFO)
            request.headers.setdefault('User-Agent', ua)
Developer: snjoer, Project: Douban_Crawler, Lines: 8, Source: middlewares.py

Example 7: parse_hansard_index_page

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse_hansard_index_page(self, response):
        """ Parse the handard record for a particular year

            Pages from 1998 onward
            Example: http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm

            Pages from 1994-1997
            Example: http://www.legco.gov.hk/yr95-96/english/lc_sitg/general/yr9596.htm
            Currently unsupported

            Pages from 1994 and before
            Example: http://www.legco.gov.hk/yr94-95/english/lc_sitg/yr9495.htm
        """
        sel = Selector(response)

        # First find out what format we are dealing with

        if sel.xpath("//table//td/strong[starts-with(text(),'Meetings')]"):
            self.log("%s: HANSARD - Post 1998 Hansard" % response.url, level=log.INFO)
            return self.parse_hansard_post_1998(response)
        elif sel.xpath("//table//td/strong[starts-with(text(),'Hansard')]"):
            self.log("%s: HANSARD - Pre 1995 Hansard" % response.url, level=log.INFO)
            return self.parse_hansard_pre_1995(response)
        elif sel.xpath("//h2[starts-with(text(),'LegCo Sittings')]"):
            self.log("%s: HANSARD - 1995 - 1997 Hansard" % response.url, level=log.INFO)
            self.log("%s: Page type not currently supported" % response.url, level=log.WARNING)
            return self.parse_hansard_1995_to_1997(response)
        else:
            raise Exception("Unknown Hansard page type") 
Developer: legco-watch, Project: legco-watch, Lines: 31, Source: hansard.py

Example 8: parse

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
        sel = Selector(response)
        body = sel.xpath('//div[@id="_content_"]')
        if len(body) != 1:
            self.log(u'Expected single body element, but found {} on {}'.format(len(body), response.url), level=log.WARNING)
            return
        body = body[0]
        if u'chinese' in response.url:
            language = 'C'
            matcher = self.HEADER_RE_C
        else:
            language = 'E'
            matcher = self.HEADER_RE_E
        # We'll need lxml to parse this
        parser = HTMLParser(encoding='utf-8')
        body_extract = body.extract().encode('utf-8')
        body_elements = lxml.html.fromstring(body_extract, parser=parser)
        # Iterate over the body elements, processing each h2-table pair for each meeting
        count_sessions = 0
        count_questions = 0
        for elem in body_elements:
            # Skip comments
            if elem.tag == lxml.etree.Comment:
                continue
            # Take the first 50 characters, so RE doesn't scan the whole body of text for large elements
            match = re.search(matcher, elem.text_content()[:50])
            if match is not None:
                this_date = match.groupdict()['date']
                self.log(u'Found table for date {}'.format(this_date))
                count_sessions += 1
                questions_table = elem.getnext()
                for row in questions_table.xpath('./tr'):
                    # We ignore the header row, which is indicated by ths
                    if row[0].tag == 'th':
                        continue
                    this_question = self.make_question(language, response, row, this_date)
                    count_questions += 1
                    yield Question(**this_question)

        self.log(u'Processed {} questions in {} sessions'.format(count_questions, count_sessions), level=log.INFO) 
Developer: legco-watch, Project: legco-watch, Lines: 42, Source: questions.py

Example 9: process_response

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def process_response(self, request, response, spider):
        url = response.url

        if response.status in [301, 307]:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'redirect %d' % response.status

            return self._retry(request, reason, spider) or response
        interval, redirect_url = get_meta_refresh(response)
        # handle meta redirect

        if redirect_url:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'meta'

            return self._retry(request, reason, spider) or response

        hxs = HtmlXPathSelector(response)
        # test for captcha page
        captcha = hxs.select(
            ".//input[contains(@id, 'captchacharacters')]").extract()

        if captcha:
            log.msg("captcha page %s" % url, level=log.INFO)
            reason = 'captcha'

            return self._retry(request, reason, spider) or response

        return response 
Developer: Karmenzind, Project: fp-server, Lines: 31, Source: middlewares.py

Example 10: do_scrape

# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response 
Developer: legco-watch, Project: legco-watch, Lines: 56, Source: tasks.py


Note: The scrapy.log.INFO attribute examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their developers; copyright remains with the original authors. Please refer to the corresponding project's license before distributing or using the code; do not reproduce without permission.