This article collects typical usage examples of the Python method scrapy.utils.project.get_project_settings. If you are wondering how exactly to use project.get_project_settings, or how it is typically called, the curated code examples below may help. You can also read further about the module scrapy.utils.project in which this method lives.
The following presents 15 code examples of project.get_project_settings, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
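Before diving into the examples, here is a minimal sketch (not taken from the snippets below) of what get_project_settings() does: it locates the project's settings module (via scrapy.cfg / the SCRAPY_SETTINGS_MODULE environment variable) and returns a scrapy.settings.Settings object, which is usually handed to CrawlerProcess or CrawlerRunner. The setting names printed here are just common defaults chosen for illustration.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Returns a Settings instance populated from the project's settings.py
settings = get_project_settings()
print(settings.get("BOT_NAME"))            # plain access
print(settings.getbool("ROBOTSTXT_OBEY"))  # typed accessors are available too

# Hand the settings to a crawler process so spiders run with the
# project's middlewares, pipelines and extensions.
process = CrawlerProcess(settings)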
Example 1: runspider
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))
    logging.debug('finish this spider:%s\n\n' % name)
Example 2: runspider
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example 3: runspider
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')
    if not os.path.exists('log'):
        os.makedirs('log')
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example 4: run
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last call
Example 5: __init__
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self, *args, **kwargs):
    super(YoutubeHistorySpider, self).__init__(*args, **kwargs)
    settings = get_project_settings()
    hf = settings.get("CHROME_HEADERS_FILE")
    cj = settings.get("COOKIES_JSON")
    if hf:
        ch = ChromeRequest.from_file(hf)
        self.init_cookies = ch.cookies
    elif cj:
        with open(cj, 'r') as fh:
            cookies = parse_cookies(fh.read())
        self.init_cookies = cookies
    if not hasattr(self, "init_cookies"):
        raise ValueError("Need to specify 'CHROME_HEADERS_FILE' "
                         "or 'COOKIES_JSON' in settings.")
Example 6: get_follow_requests_and_items
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follows
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)
Example 7: crawler_start
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
        if not spiders:
            # crawler_logger.warning('no spider starts up, please check your task input')
            return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 8: __init__
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self):
    scrapy.spiders.Spider.__init__(self)
    self.global_settings = get_project_settings()
    if self.global_settings['PLATFORM'] in ['win', 'mac']:
        self.driver = webdriver.PhantomJS(executable_path=self.global_settings['PHANTOMJS_PATH'])
    elif self.global_settings['PLATFORM'] in ['linux']:
        self.driver = webdriver.PhantomJS()
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)
    self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
    self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
    self.url_template = self.global_settings['CRAWLER']['url_template']
Example 9: setUp
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def setUp(self):
    self.settings = get_project_settings()
    self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")

    # set up redis
    self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                  port=self.settings['REDIS_PORT'],
                                  db=self.settings['REDIS_DB'])
    try:
        self.redis_conn.info()
    except ConnectionError:
        print("Could not connect to Redis")
        # plugin is essential to functionality
        sys.exit(1)

    # clear out older test keys if any
    keys = self.redis_conn.keys("test-spider:*")
    for key in keys:
        self.redis_conn.delete(key)

    # set up kafka to consume potential results
    self.consumer = KafkaConsumer(
        "demo_test.crawled_firehose",
        bootstrap_servers=self.settings['KAFKA_HOSTS'],
        group_id="demo-id",
        auto_commit_interval_ms=10,
        consumer_timeout_ms=5000,
        auto_offset_reset='earliest'
    )
    time.sleep(1)
Example 10: runspider
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # the script will block here until the last crawl call is finished

# scheduling analysis
Example 11: crawl
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls
    except KeyboardInterrupt:
        process.stop()
        raise
Example 12: import_settings
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def import_settings(self):
    settings = get_project_settings()
    self.password = settings['AUTH_PASSWORD']
    self.http_proxy = settings['HTTP_PROXY']
    self.control_port = settings['CONTROL_PORT']
    self.max_req_per_ip = settings['MAX_REQ_PER_IP']
    self.exit_nodes = settings['EXIT_NODES']
    if self.exit_nodes:
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate(self.password)
            controller.set_conf('ExitNodes', self.exit_nodes)
            controller.close()
Example 13: execute
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example 14: __init__
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self, uri, access_key=None, secret_key=None, acl=None):
    # BEGIN Backward compatibility for initialising without keys (and
    # without using from_crawler)
    no_defaults = access_key is None and secret_key is None
    if no_defaults:
        from scrapy.utils.project import get_project_settings
        settings = get_project_settings()
        if 'AWS_ACCESS_KEY_ID' in settings or 'AWS_SECRET_ACCESS_KEY' in settings:
            import warnings
            from scrapy.exceptions import ScrapyDeprecationWarning
            warnings.warn(
                "Initialising `scrapy.extensions.feedexport.S3FeedStorage` "
                "without AWS keys is deprecated. Please supply credentials or "
                "use the `from_crawler()` constructor.",
                category=ScrapyDeprecationWarning,
                stacklevel=2
            )
            access_key = settings['AWS_ACCESS_KEY_ID']
            secret_key = settings['AWS_SECRET_ACCESS_KEY']
    # END Backward compatibility
    u = urlparse(uri)
    self.bucketname = u.hostname
    self.access_key = u.username or access_key
    self.secret_key = u.password or secret_key
    self.is_botocore = is_botocore()
    self.keyname = u.path[1:]  # remove first "/"
    self.acl = acl
    if self.is_botocore:
        import botocore.session
        session = botocore.session.get_session()
        self.s3_client = session.create_client(
            's3', aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key)
    else:
        import boto
        self.connect_s3 = boto.connect_s3
Example 15: prepare_callback_replay
# Required import: from scrapy.utils import project [as alias]
# Alternatively: from scrapy.utils.project import get_project_settings [as alias]
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)

    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)
    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings