This article collects typical usage examples of the Python class scrapy.settings.Settings. If you are wondering what scrapy.settings.Settings does, how it is used, or want to see it in real code, the curated examples below should help. You can also read more about the module it belongs to, scrapy.settings.
The following shows 15 code examples of settings.Settings, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code examples.
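Before the examples, here is a minimal sketch (not taken from any of the projects below) showing the basic Settings API: constructing from a dict, setting values with an explicit priority, and reading typed values. The keys used are standard Scrapy settings.

    from scrapy.settings import Settings

    settings = Settings({"DOWNLOAD_DELAY": 1.0})    # dict values enter at 'project' priority
    settings.set("LOG_ENABLED", False, priority="cmdline")
    settings.setdict({"CONCURRENT_REQUESTS": 8}, priority="project")

    print(settings.getfloat("DOWNLOAD_DELAY"))      # 1.0
    print(settings.getbool("LOG_ENABLED"))          # False
    print(settings.getint("CONCURRENT_REQUESTS"))   # 8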
Example 1: get_project_settings
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)

    settings = Settings()
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings.setmodule(settings_module_path, priority='project')

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    if pickled_settings:
        settings.setdict(pickle.loads(pickled_settings), priority='project')

    # XXX: deprecate and remove this functionality
    env_overrides = {k[7:]: v for k, v in os.environ.items() if
                     k.startswith('SCRAPY_')}
    if env_overrides:
        settings.setdict(env_overrides, priority='project')

    return settings
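A typical way to call this function (assuming it is available as scrapy.utils.project.get_project_settings and that you run it from inside a Scrapy project, so the settings module can be resolved):

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get("BOT_NAME"))         # value defined in the project's settings.py
    print(settings.get("SPIDER_MODULES"))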
Example 2: main
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by calling the defined spider
    print("ENGINE STOPPED")
Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines: 25, Source: EuropythonSpyder.py
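The same pattern works for any spider. A minimal, self-contained sketch (the spider class and target URL here are illustrative, not part of the book's example):

    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings

    class QuotesSpider(scrapy.Spider):
        name = "quotes"
        start_urls = ["http://quotes.toscrape.com/"]

        def parse(self, response):
            for text in response.css("span.text::text").getall():
                yield {"text": text}

    settings = Settings()
    settings.set("USER_AGENT", "my-crawler (+https://example.com)")
    settings.set("LOG_ENABLED", False)

    process = CrawlerProcess(settings)
    process.crawl(QuotesSpider)   # pass the spider class; the process instantiates it
    process.start()               # blocks until the crawl finishes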
Example 3: test_get_enabled_status
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_get_enabled_status():
    settings = Settings()
    # check for void settings
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled with settings
    settings.set('PAGE_STORAGE_ENABLED', True)
    assert _get_enabled_status(settings) == (True, False)
    settings.set('PAGE_STORAGE_ENABLED', None)
    # plugin enabled by spider_type
    for spider_type in ['auto', 'portia']:
        os.environ['SHUB_SPIDER_TYPE'] = spider_type
        assert _get_enabled_status(settings) == (True, False)
    os.environ['SHUB_SPIDER_TYPE'] = 'other_spider_type'
    assert _get_enabled_status(settings) == (False, False)
    # plugin enabled on error
    settings.set('PAGE_STORAGE_ON_ERROR_ENABLED', True)
    assert _get_enabled_status(settings) == (False, True)
Example 4: fetch_url
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do the fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model instances, containing the info needed to build spiders.
    platform_id : int
        id of the platform; fetched urls are bound to this id.
    purpose : {'update', 'archive'}
        indicates which urls to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
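The noteworthy detail here is that Settings accepts a plain dict (cls.conf['crawl']['scrapy']) and that ITEM_PIPELINES is overridden per run. A reduced sketch of that idea, with a hypothetical pipeline module path:

    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings

    base_conf = {"DOWNLOAD_DELAY": 0.5, "CONCURRENT_REQUESTS": 16}
    settings = Settings(base_conf)  # dict values are loaded at 'project' priority
    # enable only the pipeline this run needs (module path is hypothetical)
    settings.set("ITEM_PIPELINES", {"myproject.pipelines.UrlPipeline": 300})
    process = CrawlerProcess(settings)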
Example 5: fetch_html
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def fetch_html(cls, session, url_tuples):
    """Actual method to do the fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuples (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of urls to fetch html for: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
Example 6: start_job
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
        ==========================================================
        WARNING: callback_fn is {}
        ==========================================================
        Since start_job is called with callback_fn, make sure you end the reactor if you want the spider process to
        stop after the callback function is executed. By default callback_fn=None will close the reactor.

        To write a custom callback_fn:

            def callback_fn():
                print("Write your own callback logic")
                from twisted.internet import reactor
                reactor.stop()
        ==========================================================
        """.format(callback_fn))

    spider = Crawler(spider_cls, Settings(spider_settings))
    spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(spider, **spider_kwargs)
    """
    d = runner.crawl(spider, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
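When running under an explicit Twisted reactor like this, the documented alternative is to use CrawlerRunner and chain a callback on the deferred returned by crawl(). A minimal sketch (MySpider is a placeholder spider class, not defined here):

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.settings import Settings
    from scrapy.utils.log import configure_logging

    configure_logging()
    runner = CrawlerRunner(Settings({"LOG_LEVEL": "INFO"}))
    d = runner.crawl(MySpider)            # placeholder spider class
    d.addBoth(lambda _: reactor.stop())   # stop the reactor when the crawl ends
    reactor.run()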
Example 7: test_spidermon_aws_credentials_not_set
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_spidermon_aws_credentials_not_set():
    settings = Settings()
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id is None
    assert aws_secret_access_key is None
Example 8: test_spidermon_aws_credentials
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_spidermon_aws_credentials(mocker):
    warn_mock = mocker.patch("spidermon.utils.settings.warnings.warn")
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY": "aws_access_key",
            "SPIDERMON_AWS_SECRET_KEY": "aws_secret_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "aws_access_key"
    assert aws_secret_access_key == "aws_secret_key"
    warn_mock.assert_called_with(mocker.ANY, DeprecationWarning)
Example 9: test_spidermon_aws_credentials_scrapy_like
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_spidermon_aws_credentials_scrapy_like():
    settings = Settings(
        {
            "SPIDERMON_AWS_ACCESS_KEY_ID": "aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "aws_access_key_id"
    assert aws_secret_access_key == "aws_secret_access_key"
Example 10: test_spidermon_aws_credentials_fall_back_to_scrapy
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_spidermon_aws_credentials_fall_back_to_scrapy():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "scrapy_aws_access_key_id"
    assert aws_secret_access_key == "scrapy_aws_secret_access_key"
Example 11: test_spidermon_aws_credentials_are_preferred_over_scrapy_ones
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def test_spidermon_aws_credentials_are_preferred_over_scrapy_ones():
    settings = Settings(
        {
            "AWS_ACCESS_KEY_ID": "scrapy_aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "scrapy_aws_secret_access_key",
            "SPIDERMON_AWS_ACCESS_KEY_ID": "spidermon_aws_access_key_id",
            "SPIDERMON_AWS_SECRET_ACCESS_KEY": "spidermon_aws_secret_access_key",
        }
    )
    (aws_access_key_id, aws_secret_access_key) = get_aws_credentials(settings)
    assert aws_access_key_id == "spidermon_aws_access_key_id"
    assert aws_secret_access_key == "spidermon_aws_secret_access_key"
Example 12: settings
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def settings(request):
    """Default scrapy-poet settings"""
    s = dict(
        # collect scraped items to .collected_items attribute
        ITEM_PIPELINES={
            'tests.utils.CollectorPipeline': 100,
        },
        DOWNLOADER_MIDDLEWARES={
            'scrapy_poet.InjectionMiddleware': 543,
        },
    )
    return Settings(s)
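This function is presumably registered as a pytest fixture in the original test suite. A hypothetical test that requests it by name and reads the values back through the Settings API:

    def test_collector_pipeline_enabled(settings):
        pipelines = settings.getdict("ITEM_PIPELINES")
        assert pipelines.get("tests.utils.CollectorPipeline") == 100

        middlewares = settings.getdict("DOWNLOADER_MIDDLEWARES")
        assert "scrapy_poet.InjectionMiddleware" in middlewares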
Example 13: __init__
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def __init__(self, spidercls, settings=None):
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
Example 14: configure_logging
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
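Typical usage is to call this once at startup; passing install_root_handler=False keeps full control over the root handler, as in this short sketch:

    import logging
    from scrapy.utils.log import configure_logging

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename="log.txt",
        format="%(levelname)s: %(message)s",
        level=logging.INFO,
    )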
Example 15: __init__
# Required import: from scrapy import settings [as alias]
# Or: from scrapy.settings import Settings [as alias]
def __init__(self, store_uri, download_func=None, settings=None):
    if not store_uri:
        raise NotConfigured

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    cls_name = "FilesPipeline"
    self.store = self._get_store(store_uri)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name=cls_name,
                                settings=settings)
    self.expires = settings.getint(
        resolve('FILES_EXPIRES'), self.EXPIRES
    )
    if not hasattr(self, "FILES_URLS_FIELD"):
        self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
    if not hasattr(self, "FILES_RESULT_FIELD"):
        self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
    self.files_urls_field = settings.get(
        resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
    )
    self.files_result_field = settings.get(
        resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
    )

    super(FilesPipeline, self).__init__(download_func=download_func, settings=settings)
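On the settings side, a project usually enables this pipeline and supplies the keys read above. A sketch of the usual entries in a project's settings.py (the store path is an example value):

    ITEM_PIPELINES = {
        "scrapy.pipelines.files.FilesPipeline": 1,
    }
    FILES_STORE = "/path/to/valid/dir"   # becomes store_uri above
    FILES_EXPIRES = 90                   # days; read via resolve('FILES_EXPIRES')
    FILES_URLS_FIELD = "file_urls"
    FILES_RESULT_FIELD = "files"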