This article collects typical usage examples of the settings.get method from Python's scrapy.conf module. If you are unsure exactly how to use settings.get, how it is called in practice, or what real-world examples look like, the curated code samples below may help. You can also explore further usage examples of scrapy.conf.settings, the object this method belongs to.
A total of 15 code examples of the settings.get method are shown below, sorted by popularity by default.
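Before the examples, here is a minimal sketch of the pattern they all share: reading a value from the project settings, optionally with a default. Note that the module-level scrapy.conf.settings object is a legacy access point; recent Scrapy versions expose the same get API through crawler.settings or spider.settings instead. The setting name SOME_SETTING below is just a placeholder.

from scrapy.conf import settings  # legacy global settings object

# Returns the configured value, or the supplied default when the key is absent.
value = settings.get('SOME_SETTING', 'fallback-value')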
Example 1: __init__
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self):
    """Class constructor."""
    self._fields_to_check = ['description', 'name', 'summary', 'reviews']
    self._minimum_monthly_discount = int(settings.get('MINIMUM_MONTHLY_DISCOUNT', None))
    self._minimum_weekly_discount = int(settings.get('MINIMUM_WEEKLY_DISCOUNT', None))
    self._skip_list = settings.get('SKIP_LIST', None)
    self._cannot_have_regex = settings.get('CANNOT_HAVE', None)
    if self._cannot_have_regex:
        self._cannot_have_regex = re.compile(str(self._cannot_have_regex), re.IGNORECASE)
    self._must_have_regex = settings.get('MUST_HAVE', None)
    if self._must_have_regex:
        self._must_have_regex = re.compile(str(self._must_have_regex), re.IGNORECASE)
    self._web_browser = settings.get('WEB_BROWSER', None)
    if self._web_browser:
        self._web_browser += ' %s'  # append URL placeholder (%s)
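For context, a settings.py fragment like the following would satisfy the lookups above; the values are purely illustrative, not taken from the original project. Note that MINIMUM_MONTHLY_DISCOUNT and MINIMUM_WEEKLY_DISCOUNT must be set to something int() can convert, since the constructor wraps settings.get() in int().

# Illustrative settings.py values assumed by the constructor above
MINIMUM_MONTHLY_DISCOUNT = 20        # int() is applied to this value
MINIMUM_WEEKLY_DISCOUNT = 10
SKIP_LIST = ['12345', '67890']       # listing ids to ignore
CANNOT_HAVE = r'basement|couch'      # compiled with re.IGNORECASE
MUST_HAVE = r'wifi'
WEB_BROWSER = 'firefox'              # webbrowser.get() name; ' %s' is appended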
Example 2: process_request
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
    ua = random.choice(settings.get('USER_AGENT_LIST'))
    if ua:
        request.headers.setdefault('User-Agent', ua)
# class RetryChangeProxyMiddleware(RetryMiddleware):
# def _retry(self, request, reason, spider):
# log.msg('Changing proxy')
# tn = telnetlib.Telnet('127.0.0.1', 9051)
# tn.read_until("Escape character is '^]'.", 2)
# tn.write('AUTHENTICATE "267765"\r\n')
# tn.read_until("250 OK", 2)
# tn.write("signal NEWNYM\r\n")
# tn.read_until("250 OK", 2)
# tn.write("quit\r\n")
# tn.close()
# time.sleep(3)
# log.msg('Proxy changed')
# return RetryMiddleware._retry(self, request, reason, spider)
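A middleware like this only does anything if USER_AGENT_LIST is defined and the middleware class is enabled. The fragment below is a hypothetical settings.py sketch; the class path RandomUserAgentMiddleware and its module are placeholders, since the excerpt does not show the class name.

# Hypothetical settings.py sketch for the user-agent middleware above
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',   # placeholder user agents
    'Mozilla/5.0 (X11; Linux x86_64)',
]
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,  # placeholder path
}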
Example 3: find_item
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def find_item(self):
    '''
    Finds an item from the queue
    '''
    count = 0
    while count <= self.item_retries:
        item = self.queue.pop()
        if item:
            # very basic limiter
            time.sleep(1)
            return item
        # we want the spiders to get slightly out of sync
        # with each other for better performance
        time.sleep(random.random())
        count = count + 1
    return None
Example 4: get_link_extractor
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def get_link_extractor(self):
    return LinkExtractor(allow=r'^http://[a-z2-7]{16}.onion',
                         deny=[r'^https://blockchainbdgpzk.onion/address/',
                               r'^https://blockchainbdgpzk.onion/tx/'],
                         deny_domains=settings.get('FAKE_DOMAINS'))
Example 5: __init__
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self, *args, **kwargs):
    self.rules = [Rule(self.get_link_extractor(),
                       callback=self.parse_item,
                       process_links=self.limit_links,
                       follow=True)]
    super(WebSpider, self).__init__(*args, **kwargs)
    target_sites = settings.get('TARGET_SITES')
    if target_sites and os.path.isfile(target_sites):
        # Read the list of target URLs from the file
        with open(target_sites) as target_sites_file:
            # Turn the contents into a Python list
            self.start_urls = target_sites_file.read().splitlines()
            # Remove empty strings
            self.start_urls = [u for u in self.start_urls if u]
    else:
        self.start_urls = self.default_start_url
Example 6: main
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def main():
    """Run all spiders in their own threads."""
    # ensure we're in the root directory
    os.chdir(PROJECT_ROOT)
    # get the spider names
    spiders_module = settings.get('NEWSPIDER_MODULE').replace('.', os.sep)
    path = os.path.abspath(os.path.join(PROJECT_ROOT, spiders_module))
    spiders = glob('{path}/*.py'.format(**locals()))
    spiders = [
        os.path.basename(s)[:-3] for s in spiders
        if not s.endswith('__init__.py')]
    # start spiders in their own threads
    threads = []
    for spider in spiders:
        t = threading.Thread(target=worker, args=(spider,))
        threads.append(t)
        t.start()
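The worker callable referenced above is not shown in the excerpt. A plausible implementation, purely an assumption on our part, would shell out to the scrapy crawl command so that each spider runs in its own process while the thread simply waits for it:

import subprocess

def worker(spider_name):
    # Hypothetical helper: run one spider via the Scrapy CLI.
    # Each thread blocks on its own `scrapy crawl` subprocess.
    subprocess.check_call(['scrapy', 'crawl', spider_name])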
Example 7: __init__
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
    """Class constructor."""
    fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
    if fields_to_export:
        kwargs['fields_to_export'] = fields_to_export
    super().__init__(**kwargs)
    self.include_headers_line = include_headers_line
    self._workbook = openpyxl.workbook.Workbook()
    self._worksheet = self._workbook.active
    self._headers_not_written = True
    self._join_multivalued = join_multivalued
    self._filename = file.name
    file.close()
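An item exporter like this one (apparently an XLSX exporter built on openpyxl) is normally wired up through Scrapy's FEED_EXPORTERS setting, so that a feed with an xlsx format selects it. The class path below is a placeholder for illustration only; FIELDS_TO_EXPORT is the project-specific key read by the constructor above.

# Hypothetical settings.py sketch registering the exporter above
FEED_EXPORTERS = {
    'xlsx': 'myproject.exporters.XlsxItemExporter',  # placeholder path
}
FIELDS_TO_EXPORT = ['id', 'name', 'url']             # column order in the sheet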
Example 8: serialize_field
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def serialize_field(self, field, name, value):
    serializer = field.get('serializer', self._join_if_needed)
    return serializer(value)
Example 9: process_item
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_item(self, item, spider):
    """Drop items not fitting parameters. Open in browser if specified. Return accepted items."""
    if self._skip_list and str(item['id']) in self._skip_list:
        raise DropItem('Item in skip list: {}'.format(item['id']))
    if self._minimum_monthly_discount and 'monthly_discount' in item:
        if item['monthly_discount'] < self._minimum_monthly_discount:
            raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))
    if self._minimum_weekly_discount and 'weekly_discount' in item:
        if item['weekly_discount'] < self._minimum_weekly_discount:
            raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))
    # check regexes
    if self._cannot_have_regex:
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._cannot_have_regex.search(v):
                raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))
    if self._must_have_regex:
        has_must_haves = False
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._must_have_regex.search(v):
                has_must_haves = True
                break
        if not has_must_haves:
            raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))
    # open in browser
    if self._web_browser:
        webbrowser.get(self._web_browser).open(item['url'])
    return item
Example 10: process_request
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
    proxy = settings.get('PROXY')
    logger.info("process request %s using proxy %s" % (request, proxy))
    request.meta['proxy'] = proxy
Example 11: random_ua
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def random_ua(self):
    # randomise user-agent from list to reduce chance of being banned
    ua = random.choice(settings.get('USER_AGENT_LIST'))
    if not ua:
        # fall back to a fixed user-agent if the list yields nothing
        ua = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'
    return ua
Example 12: process_request
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
    request.meta['proxy'] = settings.get('HTTP_PROXY')
Example 13: from_settings
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def from_settings(cls, settings):
    server = redis.Redis(host=settings.get('REDIS_HOST'),
                         port=settings.get('REDIS_PORT'))
    persist = settings.get('SCHEDULER_PERSIST', True)
    timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
    retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)
    return cls(server, persist, timeout, retries)
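Note how the two-argument form of get() supplies a fallback when a key is missing (SCHEDULER_PERSIST, DUPEFILTER_TIMEOUT and SCHEDULER_ITEM_RETRIES above). When the component is driven by a scrapy.settings.Settings object, typed accessors are also available; a small sketch, assuming the same setting names:

# Typed lookups on a scrapy.settings.Settings instance (same keys as above)
persist = settings.getbool('SCHEDULER_PERSIST', True)   # coerces '0'/'False' correctly
timeout = settings.getint('DUPEFILTER_TIMEOUT', 600)
retries = settings.getint('SCHEDULER_ITEM_RETRIES', 3)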
Example 14: from_crawler
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(WebSpider, cls).from_crawler(crawler, *args, **kwargs)
    if settings.get('FULL_PAGERANK_COMPUTE', False):
        crawler.signals.connect(spider.on_idle, signals.spider_idle)
    return spider
Example 15: process_request
# Module to import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):  # pylint:disable=unused-argument
    """Process incoming request."""
    parsed_uri = urlparse(request.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if ".onion" in domain and ".onion." not in domain:
        tor_proxy_list = settings.get('HTTP_PROXY_TOR_PROXIES')
        request.meta['proxy'] = random.choice(tor_proxy_list)
    elif ".i2p" in domain and ".i2p." not in domain:
        if parsed_uri.scheme == "https":
            request.meta['proxy'] = settings.get('HTTPS_PROXY_I2P')
        else:
            request.meta['proxy'] = settings.get('HTTP_PROXY_I2P')
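The proxy settings read above are project-specific keys, so their values depend on the local Tor and I2P setup. A hypothetical settings.py sketch using commonly cited local defaults (an HTTP proxy such as Privoxy on 8118 in front of Tor, the I2P HTTP proxy on 4444 and its HTTPS/CONNECT counterpart on 4445) might look like this; adjust to whatever proxies actually run on the host:

# Hypothetical settings.py sketch for the onion/I2P routing middleware above
HTTP_PROXY_TOR_PROXIES = [
    'http://127.0.0.1:8118',   # e.g. Privoxy forwarding to a local Tor instance
]
HTTP_PROXY_I2P = 'http://127.0.0.1:4444'     # typical I2P HTTP proxy port
HTTPS_PROXY_I2P = 'http://127.0.0.1:4445'    # typical I2P HTTPS/CONNECT proxy port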