This article collects typical usage examples of the scrapy.log.INFO attribute in Python. If you are wondering what log.INFO does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of scrapy.log, the module in which this attribute is defined.
The following presents 10 code examples of the log.INFO attribute, sorted by popularity by default.
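Before the examples, here is a minimal sketch of the attribute itself: log.INFO is one of the numeric log-level constants exposed by the legacy scrapy.log module and is normally passed as the level argument of log.msg(). Note that scrapy.log wraps Twisted's logging and has been deprecated since Scrapy 1.0 in favour of Python's standard logging module, so this sketch assumes an older Scrapy release where the module is still available.

from scrapy import log

# Emit a message at INFO level through the legacy Scrapy logging API
# (Scrapy < 1.0; newer releases use the stdlib logging module instead).
log.start(loglevel=log.INFO)
log.msg("Spider bootstrap complete", level=log.INFO)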
Example 1: request_seen
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def request_seen(self, request):
    is_seen = is_request_seen(request)
    if not is_seen:
        log.msg('New URL: %s. Adding it to seen database' % request.url, log.DEBUG)
        seen = Seen(fingerprint=request_fingerprint(request),
                    url=request.url,
                    last_crawl_time=datetime.now())
        try:
            session.add(seen)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
    else:
        log.msg('[seen] "%s" is seen. Skipping.' % request.url, log.INFO)
    return is_seen
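The request_seen signature above matches Scrapy's duplicate-filter interface (BaseDupeFilter.request_seen), so the class containing it would normally be registered through the DUPEFILTER_CLASS setting. A hedged settings.py sketch follows; the module path and class name are illustrative assumptions, not taken from the example:

# settings.py (hypothetical project layout)
DUPEFILTER_CLASS = 'myproject.dupefilters.SeenDatabaseDupeFilter'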
Example 2: info
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def info(msg):
    log.msg(str(msg), level=log.INFO)
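Because the wrapper calls str() on its argument before handing it to log.msg, it can be used with non-string objects as well. A short usage sketch:

info('Crawl finished')
info({'items_scraped': 120})  # dicts and other objects are converted with str() first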
Example 3: parse
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./@data-id').extract()[0]
        baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
        date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    # The site paginates with JavaScript, so the next-page link cannot be extracted with
    # XPath; however, the current page number and the total page count are present in the
    # HTML. Use them to build the URL of the next page to crawl. Let's hope SSG does not
    # change the markup :(
    current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
    page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])
    current_url = response.request.url.split('?p')[0]
    next_page = current_page + 1
    if page_count >= next_page:
        # if current_page < 1:
        yield Request('%s?p=%s' % (current_url, next_page))
Example 4: parse
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
        baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
        date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.css('a.yazarlink').xpath('text()').extract()[0]
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    current_url = response.request.url.split('/sayfa')[0]
    title_re = response.xpath('//title').re(r'sayfa (\d*)')
    current_page = int(title_re[0]) if title_re else 1
    page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])
    next_page = current_page + 1
    if page_count >= next_page:
        # if current_page < 2:
        yield Request('%s/sayfa/%s' % (current_url, next_page))
Example 5: parse
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
        baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re(r"'(\d*)'")[0]
        baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
        date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
        nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()
        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item
    current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
    page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
    next_page = current_page + 1
    # To yield the next-page request in the following step we need the topic (baslik) URL
    # without the pagination suffix. URLs have the form uludagsozluk.com/k/BASLIK/10, so
    # take the part of the path that comes before the page number.
    url_split = urlsplit(response.request.url)
    current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))
    if page_count >= next_page:
        # if current_page < 1:
        yield Request('%s/%s' % (current_baslik_url, next_page))
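Examples 3, 4 and 5 share the same skeleton: they log the URL being parsed at INFO level, abort with CloseSpider when the expected markup is missing, build Girdi items, and finally yield a Request for the next page. For completeness, here is a hedged sketch of the imports such a spider needs; the standard-library and Scrapy paths are the usual ones for this Python 2-era code, while the location of the Girdi item class is an assumption:

from datetime import datetime
from urlparse import urlsplit            # Python 2; on Python 3 this lives in urllib.parse
from scrapy import log
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from myproject.items import Girdi        # hypothetical module path for the Girdi item class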
Example 6: process_request
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def process_request(self, request, spider):
    user_agent = UserAgent()
    ua = user_agent.random
    if ua:
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)
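process_request is the hook Scrapy calls on downloader middlewares, and the UserAgent class with its .random property comes from the third-party fake-useragent package. To activate a middleware like this one it has to be listed in the DOWNLOADER_MIDDLEWARES setting; the module path, class name and priority below are illustrative assumptions:

# settings.py (hypothetical)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
}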
Example 7: parse_hansard_index_page
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse_hansard_index_page(self, response):
    """Parse the hansard record for a particular year.

    Pages from 1998 onward
        Example: http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm

    Pages from 1994-1997
        Example: http://www.legco.gov.hk/yr95-96/english/lc_sitg/general/yr9596.htm
        Currently unsupported

    Pages from 1994 and before
        Example: http://www.legco.gov.hk/yr94-95/english/lc_sitg/yr9495.htm
    """
    sel = Selector(response)
    # First find out what format we are dealing with
    if sel.xpath("//table//td/strong[starts-with(text(),'Meetings')]"):
        self.log("%s: HANSARD - Post 1998 Hansard" % response.url, level=log.INFO)
        return self.parse_hansard_post_1998(response)
    elif sel.xpath("//table//td/strong[starts-with(text(),'Hansard')]"):
        self.log("%s: HANSARD - Pre 1995 Hansard" % response.url, level=log.INFO)
        return self.parse_hansard_pre_1995(response)
    elif sel.xpath("//h2[starts-with(text(),'LegCo Sittings')]"):
        self.log("%s: HANSARD - 1995 - 1997 Hansard" % response.url, level=log.INFO)
        self.log("%s: Page type not currently supported" % response.url, level=log.WARNING)
        return self.parse_hansard_1995_to_1997(response)
    else:
        raise Exception("Unknown Hansard page type")
Example 8: parse
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def parse(self, response):
    sel = Selector(response)
    body = sel.xpath('//div[@id="_content_"]')
    if len(body) != 1:
        self.log(u'Expected single body element, but found {} on {}'.format(len(body), response.url), level=log.WARNING)
        return
    body = body[0]
    if u'chinese' in response.url:
        language = 'C'
        matcher = self.HEADER_RE_C
    else:
        language = 'E'
        matcher = self.HEADER_RE_E
    # We'll need lxml to parse this
    parser = HTMLParser(encoding='utf-8')
    body_extract = body.extract().encode('utf-8')
    body_elements = lxml.html.fromstring(body_extract, parser=parser)
    # Iterate over the body elements, processing each h2-table pair for each meeting
    count_sessions = 0
    count_questions = 0
    for elem in body_elements:
        # Skip comments
        if elem.tag == lxml.etree.Comment:
            continue
        # Take only the first 50 characters, so the RE doesn't scan the whole body of text for large elements
        match = re.search(matcher, elem.text_content()[:50])
        if match is not None:
            this_date = match.groupdict()['date']
            self.log(u'Found table for date {}'.format(this_date))
            count_sessions += 1
            questions_table = elem.getnext()
            for row in questions_table.xpath('./tr'):
                # We ignore the header row, which is indicated by th cells
                if row[0].tag == 'th':
                    continue
                this_question = self.make_question(language, response, row, this_date)
                count_questions += 1
                yield Question(**this_question)
    self.log(u'Processed {} questions in {} sessions'.format(count_questions, count_sessions), level=log.INFO)
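Example 8 mixes Scrapy selectors with direct lxml parsing. A hedged sketch of the imports the snippet relies on; the exact import style in the original project may differ, but these names match the calls used above:

import re

import lxml.etree
import lxml.html
from lxml.html import HTMLParser   # an equivalent HTMLParser is also available from lxml.etree
from scrapy.selector import Selector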
Example 9: process_response
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def process_response(self, request, response, spider):
    url = response.url
    if response.status in [301, 307]:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response
    interval, redirect_url = get_meta_refresh(response)
    # handle meta redirect
    if redirect_url:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'meta'
        return self._retry(request, reason, spider) or response
    hxs = HtmlXPathSelector(response)
    # test for captcha page
    captcha = hxs.select(
        ".//input[contains(@id, 'captchacharacters')]").extract()
    if captcha:
        log.msg("captcha page %s" % url, level=log.INFO)
        reason = 'captcha'
        return self._retry(request, reason, spider) or response
    return response
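The _retry helper used throughout this method is provided by Scrapy's built-in RetryMiddleware, so the snippet above most plausibly lives in a subclass of it. A hedged sketch follows; the class name is hypothetical, and the import path shown is the modern one, while Scrapy versions old enough to ship HtmlXPathSelector used scrapy.contrib.downloadermiddleware.retry instead:

from scrapy.downloadermiddlewares.retry import RetryMiddleware

class CaptchaAwareRetryMiddleware(RetryMiddleware):  # hypothetical name
    # the process_response method from Example 9 would be defined here
    pass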
Example 10: do_scrape
# Required import: from scrapy import log [as alias]
# Or: from scrapy.log import INFO [as alias]
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # Configure the output. Technically this is only needed if we actually do the scrape,
    # but it must happen before the crawler is instantiated so the FEED_URI override is active.
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))
    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Called directly without Celery, so put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)
        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping
    return job.raw_response
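The docstring describes do_scrape as a Celery task, although the task decorator itself is not shown in the snippet. Assuming it is registered with the project's Celery app, it would typically be queued asynchronously; a hedged usage sketch:

# Assumes do_scrape is decorated with @app.task in the project's tasks module.
async_result = do_scrape.delay('my_spider')   # queue the scrape on a Celery worker
output_path = do_scrape('my_spider')          # or call synchronously; returns the jsonlines path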