This article collects typical usage examples of scrapy.settings in Python. If you are wondering how to use scrapy.settings in practice, or are looking for concrete examples, the curated code samples below may help. You can also explore further usage examples of the module it belongs to, scrapy.
The following presents 15 code examples of scrapy.settings, sorted by popularity by default.
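
Before the examples, here is a minimal sketch (not part of the collected examples) of the scrapy.settings API that the snippets below rely on: creating a Settings object, overriding values, and reading them back with typed getters. The specific setting names and values are illustrative.

from scrapy.settings import Settings

settings = Settings()                              # starts from Scrapy's built-in defaults
settings.set('LOG_ENABLED', False)                 # override a single setting
settings.update({'CONCURRENT_REQUESTS': 8}, priority='cmdline')  # bulk update at a given priority
print(settings.getbool('LOG_ENABLED'))             # typed getters: getbool / getint / getfloat / getlist
print(settings.getint('CONCURRENT_REQUESTS'))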

Example 1: _get_handler
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def _get_handler(settings):
    """Return a log handler object according to settings"""
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()
    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler
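
A brief usage sketch for the handler factory above (an illustrative addition, not from the original source): build a Settings object, point it at a hypothetical log file, and attach the resulting handler to the 'scrapy' logger. LOG_SHORT_NAMES is disabled here so the sketch does not need the TopLevelFormatter import.

import logging
from scrapy.settings import Settings

settings = Settings()
settings.set('LOG_FILE', 'crawl.log')      # hypothetical log file path
settings.set('LOG_SHORT_NAMES', False)     # skip the TopLevelFormatter filter in this sketch
handler = _get_handler(settings)
logging.getLogger('scrapy').addHandler(handler)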

Example 2: populate_vars
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()

Example 3: main
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def main():
    """Main routine for the execution of the Spider"""
    # set up a signal to catch items as they are scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines of code: 25, Source file: EuropythonSpyder.py

Example 4: crawl_runner
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
            '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings)
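
A possible way to drive the runner above (an illustrative sketch; it assumes base_settings is a Settings-like object defined elsewhere in the same module, MySpider is a placeholder spider class, and a Splash instance is reachable at the given URL). CrawlerRunner leaves reactor management to the caller:

from twisted.internet import reactor

runner = crawl_runner({'SPLASH_URL': 'http://localhost:8050'})  # hypothetical local Splash endpoint
d = runner.crawl(MySpider)                                      # MySpider is a placeholder spider class
d.addBoth(lambda _: reactor.stop())
reactor.run()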

Example 5: start_requests
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def start_requests(self):
    self._finish_init()
    settings = self.crawler.settings
    self.solver = None
    try:
        import decaptcha
    except ImportError:
        self.logger.warning('Decaptcha not installed')
    else:
        from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
        if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
            self.solver = DeathbycaptchaSolver(self.crawler)
        else:
            self.logger.warning('DeathByCaptcha account not provided')
    self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
    request_kwargs = {}
    if self.using_splash:
        request_kwargs['args'] = {'full_render': True}
    yield self.request(self.start_url, **request_kwargs)

Example 6: configure_logging
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install the root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR levels to the Scrapy and Twisted loggers respectively
    - Route stdout to the log if the LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to the given settings
    (see :ref:`topics-logging-settings`). You can override the default options
    using the ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
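
A brief usage sketch (an illustrative addition): passing a plain dict of LOG_* options, which the function wraps in a Settings object before installing the root handler. The setting values shown are arbitrary:

import logging

configure_logging({'LOG_LEVEL': 'INFO', 'LOG_FORMAT': '%(levelname)s: %(message)s'})
logging.getLogger(__name__).info('Scrapy logging configured')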

Example 7: install_scrapy_root_handler
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)

Example 8: log_scrapy_info
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                                       for name, version in scrapy_components_versions()
                                       if name != "Scrapy")})

Example 9: __init__
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def __init__(self, crawler, update_vars=None, code=None):
    self.crawler = crawler
    self.update_vars = update_vars or (lambda x: None)
    self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
    self.spider = None
    self.inthread = not threadable.isInIOThread()
    self.code = code
    self.vars = {}

Example 10: start
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))
    else:
        """
        Detect the interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, with fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add the standard shell as a fallback
        shells += ['python']
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))

Example 11: get_scrapy_settings
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Get a Scrapy settings object to use for crawling web applications.
    :param item_pipeline: The item pipeline configuration to configure in the settings.
    :param hostname: The hostname to request by default in all Scrapy requests.
    :return: A Scrapy settings object to use for crawling web applications.
    """
    item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
    return scrapy.settings.Settings(values={
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "Host": hostname,
        },
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": {
            "scrapy.extensions.telnet.TelnetConsole": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    })
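
A hypothetical usage sketch (crawler_config stands for an instance of the class that defines this method; it is not part of the original source):

settings = crawler_config.get_scrapy_settings(hostname="app.example.com")
print(settings.getint("CONCURRENT_REQUESTS"))   # read back a typed value
print(settings.get("USER_AGENT"))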

Example 12: __crawl
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()

Example 13: crawling_config
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def crawling_config(self):
    """
    Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    """
    return self._crawling_config

# Representation and Comparison

Example 14: _finish_init
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def _finish_init(self):
    self.using_splash = bool(self.settings.get('SPLASH_URL'))
    if self.using_splash:
        with open(os.path.join(
                os.path.dirname(__file__), 'directives', self.lua_source),
                'rb') as f:
            lua_source = f.read().decode('utf-8')
        self.request = partial(
            splash_request, lua_source,
            extra_js=self.extra_js)
    else:
        if self.extra_js:
            raise ValueError(
                '"extra_js" not supported without "SPLASH_URL"')
        self.request = scrapy.Request

Example 15: crawl_endpoint_to_file
# Required import: import scrapy [as alias]
# Or: from scrapy import settings [as alias]
def crawl_endpoint_to_file(
    self,
    ip_address=None,
    port=None,
    hostname=None,
    use_ssl=False,
    use_sni=False,
    start_urls=[],
    in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results to
    a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl. This
        enables us to call this method multiple times in the same process, as a Twisted reactor can only
        be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents of the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)
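
A hypothetical usage sketch (service stands for an instance of the crawling class above; the address, port, and URLs are placeholders):

file_path, results = service.crawl_endpoint_to_file(
    ip_address="192.0.2.10",                 # documentation/test address, not a real target
    port=443,
    hostname="app.example.com",
    use_ssl=True,
    start_urls=["https://app.example.com/"],
)
print(file_path)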