This article collects typical usage examples of the Python class scrapy.crawler.CrawlerProcess. If you are unsure how to use crawler.CrawlerProcess in practice, or are looking for concrete examples of it, the hand-picked code samples below may help. You can also explore further examples from the module in which the class is defined, scrapy.crawler.
Fifteen code examples of crawler.CrawlerProcess are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
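Before the project-specific examples, here is a minimal sketch of the pattern they all share: build (or load) a Settings object, schedule one or more spiders with process.crawl(), and then call process.start(), which blocks until every scheduled crawl has finished. The QuotesSpider below is a hypothetical stand-in for any spider class, not taken from the examples that follow.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the pattern.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("div.quote span.text::text"):
            yield {"text": text.get()}


if __name__ == "__main__":
    # CrawlerProcess manages the Twisted reactor for you.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(QuotesSpider)   # schedule the spider
    process.start()               # block until crawling is finished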
Example 1: runspider
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
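A hypothetical invocation of the helper above; the spider name 'example' is an assumption and must match the name attribute of a spider registered in the Scrapy project whose settings get_project_settings() loads, and the script is assumed to run from the project root with a log/ directory already present.

# 'example' is a hypothetical spider name registered in the project.
runspider('example')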
Example 2: runspider
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering the spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example 3: runspider
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering the spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example 4: main
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the defined spider
    print("ENGINE STOPPED")

Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines of code: 25, Source file: EuropythonSpyder.py
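The dispatcher-based signal hookup above relies on the old scrapy.xlib.pydispatch shim, which has been removed from recent Scrapy releases. Below is a hedged sketch of the same idea using the current signals API; it assumes only that EuropythonSpyder is an ordinary spider class, and it uses signals.item_scraped, the modern name for item_passed.

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

def catch_item(item, response, spider):
    # Called for every item the spider yields.
    print("Item extracted:", item)

settings = Settings()
settings.set("LOG_ENABLED", False)
process = CrawlerProcess(settings)
# create_crawler() returns the Crawler object so its signals can be
# connected before the crawl is scheduled.
crawler = process.create_crawler(EuropythonSpyder)
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)
process.start()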
Example 5: __init__
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False
    })
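The constructor above only builds the CrawlerProcess and a Crawler wired for scrapy-splash; starting the crawl happens elsewhere in the original project. A hedged sketch of how such a class might be driven is shown below; the crawl() method name and the start_url spider argument are assumptions, not the project's actual API.

def crawl(self, start_url):
    # Schedule the pre-configured Crawler, then block until it finishes.
    # 'start_url' is a hypothetical keyword argument of TorSplashSpider.
    self.process.crawl(self.crawler, start_url=start_url)
    self.process.start()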
Example 6: __init__
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def __init__(self, accounts, loglevel, remote=False):
    self.accounts = settings.SCRAPY_ACCOUNTS
    if accounts:
        self.accounts.update(accounts)
    self.loglevel = loglevel
    self.settings = self._get_settings()
    # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
    self.settings.set('LOG_LEVEL', loglevel)
    if remote:
        # Configure remote logging and disable the scrapy logging.
        self.settings.set('LOG_ENABLED', False)
        logger = logging.getLogger()
        handler = ScrapySocketHandler(
            'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
        handler.setLevel(loglevel)
        logger.addHandler(handler)
    self.process = CrawlerProcess(self.settings)
Example 7: load_crawler
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def load_crawler(self, crawler, url, ignore_regex):
    """
    Loads the given crawler with the given url.

    :param class crawler: class of the crawler to load
    :param str url: url to start the crawler with
    :param regex ignore_regex: to be able to ignore urls that match this
                               regex code
    """
    self.process = CrawlerProcess(self.cfg.get_scrapy_options())
    self.process.crawl(
        crawler,
        self.helper,
        url=url,
        config=self.cfg,
        ignore_regex=ignore_regex)
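Note that load_crawler() only schedules the crawl; with CrawlerProcess nothing runs until start() is called. A hypothetical caller might therefore look like the sketch below, where manager is the object that owns load_crawler() and CrawlerClass stands in for whatever spider class the surrounding project defines.

# Hypothetical caller; names are placeholders, not the project's API.
manager.load_crawler(CrawlerClass, url='https://example.com', ignore_regex=r'.*\.pdf$')
manager.process.start()  # blocks until every scheduled crawl has finished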
Example 8: cleanup
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj["settings"]

    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool("HTTPCACHE_ENABLED"):
        logger.error("Cache is disabled, will not clean up cache dir.")
        return 1

    run_cleanup_cache(settings)
Example 9: fetch_url
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model class, contains info to build spiders.
    platform_id : int
        id of platform, bind fetched url with this id.
    purpose : {'update', 'archive'}
        indicate which url to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
Example 10: fetch_html
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def fetch_html(cls, session, url_tuples):
    """Actual method to do fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of url to fetch html is: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
Example 11: crawl
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls
    except KeyboardInterrupt:
        process.stop()
        raise
Example 12: collect
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Example 13: collect
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example 14: collect
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
Example 15: collect
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn,
                  http_user=conf['ICTRP_USER'],
                  http_pass=conf['ICTRP_PASS'])
    process.start()
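The collect() variants above all follow the same pattern: the Scrapy settings come from a configuration dict, and every extra keyword passed to process.crawl() (conn, date ranges, HTTP credentials) is forwarded to the Spider's constructor. A hedged sketch of a caller is shown below; the conf keys mirror the examples above, while make_db_connection() and the concrete settings values are hypothetical.

# Hypothetical caller; conf keys mirror the examples above.
conf = {
    'SCRAPY_SETTINGS': {'LOG_LEVEL': 'INFO'},
    'ICTRP_USER': 'user',        # placeholder credential
    'ICTRP_PASS': 'secret',      # placeholder credential
}
conn = make_db_connection()      # hypothetical database connection helper
collect(conf, conn)              # blocks until the Spider has finished crawling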