

Python settings.get Method Code Examples

This article collects typical usage examples of the Python method scrapy.conf.settings.get. If you are unsure what settings.get does, how to call it, or what real-world uses look like, the curated code examples below should help. You can also explore further usage examples of scrapy.conf.settings, the object this method belongs to.


The following presents 15 code examples of the settings.get method, sorted by popularity by default.
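Before diving into the examples, here is a minimal sketch of the pattern they all share. Note that scrapy.conf.settings is the legacy global settings object; newer Scrapy releases deprecate it in favour of the crawler.settings passed to from_crawler, but the .get() call itself works the same way. The setting names and defaults below are illustrative, not taken from any of the projects quoted here.

# Minimal sketch of the pattern shared by the examples below.
# scrapy.conf.settings is the legacy global settings object; recent Scrapy
# versions deprecate it in favour of the crawler.settings passed to
# from_crawler(), but .get() behaves the same on both.
from scrapy.conf import settings

# Return the value of a setting, or the supplied default when it is unset.
retry_times = settings.get('RETRY_TIMES', 3)

# The Settings object also offers typed accessors alongside .get().
download_delay = settings.getfloat('DOWNLOAD_DELAY', 0.0)
user_agents = settings.getlist('USER_AGENT_LIST', [])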

Example 1: __init__

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self):
        """Class constructor."""
        self._fields_to_check = ['description', 'name', 'summary', 'reviews']
        self._minimum_monthly_discount = int(settings.get('MINIMUM_MONTHLY_DISCOUNT', 0))  # default 0 disables the check; int(None) would raise TypeError
        self._minimum_weekly_discount = int(settings.get('MINIMUM_WEEKLY_DISCOUNT', 0))

        self._skip_list = settings.get('SKIP_LIST', None)

        self._cannot_have_regex = settings.get('CANNOT_HAVE', None)
        if self._cannot_have_regex:
            self._cannot_have_regex = re.compile(str(self._cannot_have_regex), re.IGNORECASE)

        self._must_have_regex = settings.get('MUST_HAVE', None)
        if self._must_have_regex:
            self._must_have_regex = re.compile(str(self._must_have_regex), re.IGNORECASE)

        self._web_browser = settings.get('WEB_BROWSER', None)
        if self._web_browser:
            self._web_browser += ' %s'  # append URL placeholder (%s) 
Developer: bashedev | Project: airbnb_scraper | Lines: 21 | Source: pipelines.py
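For context, the constructor above expects several project-specific keys to be present in the project's settings.py. The values below are illustrative assumptions, not the actual defaults of the airbnb_scraper project.

# Hypothetical settings.py entries read by the pipeline constructor above
# (values are illustrative assumptions, not the project's real defaults).
MINIMUM_MONTHLY_DISCOUNT = 10          # percent; 0 disables the check
MINIMUM_WEEKLY_DISCOUNT = 5            # percent; 0 disables the check
SKIP_LIST = ['12345678', '87654321']   # listing ids to drop outright
CANNOT_HAVE = 'basement|shared bathroom'   # regex of disqualifying words
MUST_HAVE = 'wifi'                         # regex that listings must match
WEB_BROWSER = 'firefox'                # the pipeline appends ' %s' for webbrowser.get()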

Example 2: process_request

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
        ua  = random.choice(settings.get('USER_AGENT_LIST'))
        if ua:
            request.headers.setdefault('User-Agent', ua)


# class RetryChangeProxyMiddleware(RetryMiddleware):
# 	def _retry(self, request, reason, spider):
# 		log.msg('Changing proxy')
# 		tn = telnetlib.Telnet('127.0.0.1', 9051)
# 		tn.read_until("Escape character is '^]'.", 2)
# 		tn.write('AUTHENTICATE "267765"\r\n')
# 		tn.read_until("250 OK", 2)
# 		tn.write("signal NEWNYM\r\n")
# 		tn.read_until("250 OK", 2)
# 		tn.write("quit\r\n")
# 		tn.close()
# 		time.sleep(3)
# 		log.msg('Proxy changed')
# 		return RetryMiddleware._retry(self, request, reason, spider) 
Developer: parul1931 | Project: amazon | Lines: 22 | Source: middlewares.py
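To take effect, a middleware like this has to be enabled in settings.py together with the USER_AGENT_LIST it reads. The module path and user-agent strings below are placeholders, not values from the amazon project.

# Hypothetical settings.py wiring for the user-agent middleware above
# (module path and UA strings are placeholders).
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    # disable Scrapy's built-in middleware so it cannot overwrite the random header
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}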

Example 3: find_item

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def find_item(self):
        '''
        Finds an item from the queue
        '''
        count = 0

        while count <= self.item_retries:
            item = self.queue.pop()
            if item:
                # very basic limiter
                time.sleep(1)
                return item
            # we want the spiders to get slightly out of sync
            # with each other for better performance
            time.sleep(random.random())
            count = count + 1

        return None 
Developer: WalnutATiie | Project: scrapy-cluster | Lines: 20 | Source: distributed_scheduler.py

Example 4: get_link_extractor

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def get_link_extractor(self):
        return LinkExtractor(allow=r'^http://[a-z2-7]{16}.onion',
                             deny=[r'^https://blockchainbdgpzk.onion/address/',
                                   r'^https://blockchainbdgpzk.onion/tx/'],
                             deny_domains=settings.get('FAKE_DOMAINS')) 
Developer: ahmia | Project: ahmia-crawler | Lines: 7 | Source: onionspider.py

Example 5: __init__

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self, *args, **kwargs):
        self.rules = [Rule(self.get_link_extractor(),
                           callback=self.parse_item,
                           process_links=self.limit_links,
                           follow=True)]
        super(WebSpider, self).__init__(*args, **kwargs)
        target_sites = settings.get('TARGET_SITES')
        if target_sites and os.path.isfile(target_sites):
            # Read a list of URLs from file
            # Create the target file list
            with open(target_sites) as target_sites_file:
                # Make it to Python list
                self.start_urls = target_sites_file.read().splitlines()
                # Remove empty strings
                self.start_urls = [u for u in self.start_urls if u]
        else:
            self.start_urls = self.default_start_url 
Developer: ahmia | Project: ahmia-crawler | Lines: 19 | Source: base.py

Example 6: main

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def main():
    """Run all spiders in their own threads.
    """
    # ensure we're in the root directory
    os.chdir(PROJECT_ROOT)
    # get the spider names
    spiders_module = settings.get('NEWSPIDER_MODULE').replace('.', os.sep)
    path = os.path.abspath(os.path.join(PROJECT_ROOT, spiders_module))
    spiders = glob('{path}/*.py'.format(**locals()))
    spiders = [
        os.path.basename(s)[:-3] for s in spiders
            if not s.endswith('__init__.py')]
    # start spiders in their own threads
    threads = []
    for spider in spiders:
        t = threading.Thread(target=worker, args=(spider,))
        threads.append(t)
        t.start() 
Developer: jamiebull1 | Project: remotor | Lines: 20 | Source: main.py
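The worker callable handed to each thread is not part of the excerpt above. A plausible sketch, assuming each thread simply shells out to the scrapy command-line client, would be:

# Sketch of the worker referenced above (not from the remotor source).
import subprocess

def worker(spider):
    """Run a single spider in a blocking subprocess."""
    subprocess.run(['scrapy', 'crawl', spider], check=False)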

Example 7: __init__

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        """Class constructor."""
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export

        super().__init__(**kwargs)

        self.include_headers_line = include_headers_line
        self._workbook = openpyxl.workbook.Workbook()
        self._worksheet = self._workbook.active
        self._headers_not_written = True
        self._join_multivalued = join_multivalued
        self._filename = file.name
        file.close() 
Developer: bashedev | Project: airbnb_scraper | Lines: 17 | Source: exporter.py
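An exporter like this is usually registered through the FEED_EXPORTERS setting, so Scrapy selects it from the output file extension. The module and class names below are placeholders; the excerpt does not show the exporter's actual class name.

# Hypothetical registration of the XLSX exporter above (names are placeholders).
FEED_EXPORTERS = {
    'xlsx': 'airbnb_scraper.exporter.XlsxItemExporter',
}
FIELDS_TO_EXPORT = ['id', 'name', 'price', 'url']   # custom setting read in __init__
# Then run, for example:  scrapy crawl listings -o listings.xlsx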

Example 8: serialize_field

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def serialize_field(self, field, name, value):
        serializer = field.get('serializer', self._join_if_needed)
        return serializer(value) 
Developer: bashedev | Project: airbnb_scraper | Lines: 5 | Source: exporter.py

Example 9: process_item

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_item(self, item, spider):
        """Drop items not fitting parameters. Open in browser if specified. Return accepted items."""

        if self._skip_list and str(item['id']) in self._skip_list:
            raise DropItem('Item in skip list: {}'.format(item['id']))

        if self._minimum_monthly_discount and 'monthly_discount' in item:
            if item['monthly_discount'] < self._minimum_monthly_discount:
                raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))

        if self._minimum_weekly_discount and 'weekly_discount' in item:
            if item['weekly_discount'] < self._minimum_weekly_discount:  # compare against the weekly minimum, not the monthly one
                raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))

        # check regexes
        if self._cannot_have_regex:
            for f in self._fields_to_check:
                v = str(item[f].encode('ASCII', 'replace'))
                if self._cannot_have_regex.search(v):
                    raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))

        if self._must_have_regex:
            has_must_haves = False
            for f in self._fields_to_check:
                v = str(item[f].encode('ASCII', 'replace'))
                if self._must_have_regex.search(v):
                    has_must_haves = True
                    break

            if not has_must_haves:
                raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))

        # open in browser
        if self._web_browser:
            webbrowser.get(self._web_browser).open(item['url'])

        return item 
Developer: bashedev | Project: airbnb_scraper | Lines: 39 | Source: pipelines.py
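For the pipeline to run at all it must be enabled in settings.py; the class name below is a placeholder, since the excerpt does not show it.

# Hypothetical pipeline registration (class name is a placeholder).
ITEM_PIPELINES = {
    'airbnb_scraper.pipelines.FilterPipeline': 300,
}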

Example 10: process_request

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
        proxy = settings.get('PROXY')
        logger.info("process request %s using proxy %s" % (request, proxy))
        request.meta['proxy'] = proxy
        pass 
Developer: JiangFeng07 | Project: feng-python-apply | Lines: 7 | Source: Proxy.py

Example 11: random_ua

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def random_ua(self):
        # Randomise the user-agent from the list to reduce the chance of being banned;
        # fall back to a fixed Chrome UA if the random choice comes back empty.
        ua = random.choice(settings.get('USER_AGENT_LIST'))
        if not ua:
            ua = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36'
        return ua 
Developer: HashirZahir | Project: FIFA-Player-Ratings | Lines: 8 | Source: fifa_spider.py

Example 12: process_request

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider):
        request.meta['proxy'] = settings.get('HTTP_PROXY')

# 
Developer: keepCodingDream | Project: finTech | Lines: 6 | Source: middlewares.py

Example 13: from_settings

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def from_settings(cls, settings):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)

        return cls(server, persist, timeout, retries) 
Developer: WalnutATiie | Project: scrapy-cluster | Lines: 10 | Source: distributed_scheduler.py
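The factory above reads its connection and tuning parameters from the project settings. The values below are illustrative assumptions rather than scrapy-cluster's shipped defaults, and the SCHEDULER path is a placeholder.

# Illustrative settings consumed by from_settings() above (values are assumptions).
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
SCHEDULER_PERSIST = True
DUPEFILTER_TIMEOUT = 600
SCHEDULER_ITEM_RETRIES = 3
# The scheduler class itself is wired in through the standard SCHEDULER setting:
SCHEDULER = 'crawling.distributed_scheduler.DistributedScheduler'   # path is a placeholder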

Example 14: from_crawler

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(WebSpider, cls).from_crawler(crawler, *args, **kwargs)
        if settings.get('FULL_PAGERANK_COMPUTE', False):
            crawler.signals.connect(spider.on_idle, signals.spider_idle)
        return spider 
Developer: ahmia | Project: ahmia-crawler | Lines: 7 | Source: base.py

Example 15: process_request

# Required import: from scrapy.conf import settings [as alias]
# Or: from scrapy.conf.settings import get [as alias]
def process_request(self, request, spider): # pylint:disable=unused-argument
        """Process incoming request."""
        parsed_uri = urlparse(request.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        if ".onion" in domain and ".onion." not in domain:
            tor_proxy_list = settings.get('HTTP_PROXY_TOR_PROXIES')
            request.meta['proxy'] = random.choice(tor_proxy_list)
        elif ".i2p" in domain and ".i2p." not in domain:
            if parsed_uri.scheme == "https":
                request.meta['proxy'] = settings.get('HTTPS_PROXY_I2P')
            else:
                request.meta['proxy'] = settings.get('HTTP_PROXY_I2P') 
Developer: ahmia | Project: ahmia-crawler | Lines: 14 | Source: middleware.py
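The middleware above expects the proxy endpoints to be configured in settings.py. The addresses below follow common local defaults (an HTTP proxy such as Privoxy in front of Tor, and the I2P HTTP/HTTPS proxies), but they are assumptions rather than values from the ahmia-crawler project.

# Illustrative proxy settings read by the middleware above (addresses are assumptions).
HTTP_PROXY_TOR_PROXIES = ['http://127.0.0.1:8118']   # e.g. Privoxy instances chained to Tor
HTTP_PROXY_I2P = 'http://127.0.0.1:4444'             # I2P HTTP proxy
HTTPS_PROXY_I2P = 'http://127.0.0.1:4445'            # I2P HTTPS (CONNECT) proxy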


Note: The scrapy.conf.settings.get examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective authors, who retain copyright; please consult each project's license before distributing or reusing the code, and do not reproduce this compilation without permission.