This article collects typical usage examples of the Python method scrapy.spiders.CrawlSpider.__init__. If you are unsure what CrawlSpider.__init__ does, how to call it, or what real-world uses look like, the curated examples below should help. You can also read further about the containing class, scrapy.spiders.CrawlSpider.
The following shows 15 code examples of CrawlSpider.__init__, sorted by popularity by default.
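Before the examples, here is a minimal sketch of the usual pattern (not taken from the list below; the class name, rule and URL are hypothetical): the subclass does its own setup inside __init__ and then delegates to CrawlSpider.__init__, which runs the base Spider initializer and compiles the crawling rules.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    name = "example"

    def __init__(self, *args, **kwargs):
        # spider-specific setup first
        self.start_urls = ["http://www.example.com/"]
        self.rules = (Rule(LinkExtractor(allow=r"/items/"), callback="parse_item", follow=True),)
        # delegate to the base class last, so it sees the attributes set above
        CrawlSpider.__init__(self, *args, **kwargs)

    def parse_item(self, response):
        yield {"url": response.url}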
Example 1: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    # general-purpose data dictionary / list
    self.generalData_dict = dict()
    self.generalData_list = list()
    self.setup_hooks()    # set up hooks
    self.setup_domains()  # set up domain names
    # initialize the CrawlSpider (doing this last is the key point)
    CrawlSpider.__init__(self)
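The ordering stressed in the comment above matters: CrawlSpider.__init__ runs the base Spider initializer and then compiles the crawling rules, so any instance attributes the spider depends on (here, the hooks and domain setup) should already be in place before the base initializer is invoked.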
Example 2: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    self.start_urls = ["http://www.gsmarena.com/", "http://www.gsmarena.com/makers.php3"]
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0
    self.real_count = 0
Example 3: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    self.start_urls = ["http://www.gsmarena.com/makers.php3",
                       "http://www.gsmarena.com/acer-phones-59.php",
                       "http://www.gsmarena.com/alcatel-phones-5.php"]
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0
    self.real_count = 0
    self.batch_size = 300
    self.mobile_product = []
Example 4: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, **kwargs):
    '''
    :param kwargs:
    Read user arguments and initialize variables
    '''
    CrawlSpider.__init__(self)
    self.outDir = kwargs['outDir']
    self.startYear = kwargs['startYear']
    self.endYear = kwargs['endYear']
    print('startYear: ', self.startYear)
    print('self.endYear: ', self.endYear)
    print('self.outDir: ', self.outDir)
    self.headers = ({'User-Agent': 'Mozilla/5.0',
                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                     'X-Requested-With': 'XMLHttpRequest'})
    self.payload = {'username': '[user name for The Globe and Mail]', 'password': '[password for The Globe and Mail]'}
    self.apikey = '[API Key for Gigya]'
    self.categoryID = 'Production'
Example 5: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, rule, worksheet, logging):
    CrawlSpider.__init__(self)
    # use any browser you wish
    self.browser = webdriver.Firefox()
    self.logging = logging
    self.rule = rule
    self.name = self.rule["ranking_name"]
    self.logging.info("==============================")
    self.logging.info("self.rule[start_urls]: %s" % self.rule["start_urls"])
    self.start_urls = self.rule["start_urls"]
    # self.next_page is a predefined array
    self.next_page = self.rule["next_page"] \
        if ("next_page" in self.rule) else ["NONE"]
    self.logging.info("#### self.next_page %s" % self.next_page)
    self.flag = self.rule["flag"] \
        if ("flag" in self.rule) else ["NONE"]
    self.logging.info("#### self.flag %s" % self.flag)
    self.worksheet = worksheet
    self.logging.info("Finish the __init__ method ... ")
Example 6: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    CrawlSpider.__init__(self)
    # create the database, archiving any existing database file first
    conn = None
    try:
        dbfile = '%s/%s' % (conf.PROJECT_PATH['data'], conf.SQLITE['file'])
        if os.path.exists(dbfile):
            moveto = '%s.%d' % (dbfile, int(time.time()))
            shutil.move(dbfile, moveto)
            print('old db file %s is moved to %s.' % (dbfile, moveto))
        conn = sqlite3.connect(dbfile)
        cursor = conn.cursor()
        for table in conf.SQLITE['tables']:
            cursor.execute(table['sql'])
        conn.commit()
        print('db initialization complete!')
    finally:
        if conn is not None:
            conn.close()
Example 7: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # legacy Selenium RC client pointed at a local Selenium server
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.start()
Example 8: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
    self.proxy_pool = proxy_list
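Unlike most of the other examples, this one forwards *args and **kwargs on to CrawlSpider.__init__, which preserves Scrapy's standard behaviour of turning -a name=value command-line arguments into spider attributes; proxy_list is assumed to be defined elsewhere in the module.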
Example 9: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # self.selenium = selenium('localhost', 4444, "*chrome")
    self.driver = webdriver.Firefox()
Example 10: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
Example 11: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = webdriver.Firefox()
Example 12: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self):
    CrawlSpider.__init__(self)
    # use any browser you wish
    self.browser = webdriver.Firefox()
Example 13: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, *arg, **karg):
    self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
    CrawlSpider.__init__(self, *arg)
Example 14: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, *arg, **karg):
    self.name = karg['name']
    self.init_yaml('scrapy_service/templates/product.yaml', self.name)
    CrawlSpider.__init__(self, *arg)
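In this example and the previous one, only *arg is forwarded to CrawlSpider.__init__; the keyword arguments (including the name consumed above) are handled locally and never reach the base class.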
Example 15: __init__
# Required imports: from scrapy.spiders import CrawlSpider [as alias]
# Or alternatively: from scrapy.spiders.CrawlSpider import __init__ [as alias]
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self)
    if 'mining_job_id' in kwargs:
        self.mining_job_id = kwargs['mining_job_id']
    if 'site_id' in kwargs:
        self.site_id = kwargs['site_id']
    if 'preview' in kwargs:
        self.preview = 1
    if 'iteration' in kwargs:
        self.iteration = kwargs['iteration']
    if 'management_node' in kwargs:
        self.management_node = kwargs['management_node']
    if 'username' in kwargs:
        self.username = kwargs['username']
    if 'password' in kwargs:
        self.password = kwargs['password']
    if 'proxy' in kwargs:
        self.proxy = kwargs['proxy']
    if 'robots_obey' in kwargs:
        settings.set('ROBOTSTXT_OBEY', int(kwargs['robots_obey']), priority='cmdline')
    if 'url' in kwargs:
        self.start_urls.append(kwargs['url'] + self.url_fragmentanchor)
    if 'extract' in kwargs:
        self.extract = kwargs['extract']
    if 'maxjobs' in kwargs:
        self.maxjobs = int(kwargs['maxjobs'])
    if 'protocol' in kwargs:
        self.protocol = kwargs['protocol']
    if 'maximum_try' in kwargs:
        self.maximum_try = kwargs['maximum_try']
    if 'on_demand' in kwargs:
        self.on_demand = kwargs['on_demand']
    if 'debug_id' in kwargs:
        self.debug_id = kwargs['debug_id']
    if 'stale_limit_seconds' in kwargs:
        self.stale_limit = int(kwargs['stale_limit_seconds'])
    if 'subspider_detector' in kwargs:
        self.subspider_detector = True
        self.required_fields = self.subspider_detect_fields
    # Sending max items to be scraped.
    if 'max_items_count' in kwargs:
        self.max_items_count = int(kwargs['max_items_count'])
        # set spider_valid_cutoff, default 80 percent of max_items_count
        spider_valid_cutoff = kwargs.get("valid_cutoff")
        if spider_valid_cutoff:
            self.spider_valid_cutoff = int(spider_valid_cutoff)
        else:
            self.spider_valid_cutoff = int(self.max_items_count * 0.8)
        # this will reduce extra requests after a close_spider call
        settings.overrides['CONCURRENT_REQUESTS'] = 1
    self.debug = int(kwargs.get('debug', '0'))
    if 'download_delay' in kwargs or hasattr(self, 'download_delay'):
        download_delay = float(kwargs.get('download_delay', getattr(self, 'download_delay', 0)))
        settings.set('DOWNLOAD_DELAY', download_delay, priority='cmdline')
        if download_delay > 0:
            settings.set('AUTOTHROTTLE_ENABLED', True, priority='cmdline')
    if self.allowed_domain_bynetloc:
        self.allowed_domains.append(urlparse.urlparse(kwargs['url']).netloc)  # add the domain that is allowed to be crawled
    self.default_job_field_getters.update({
        'url': lambda self, response, item: response.url,
        'date': lambda self, response, item: datetime.now().strftime('%Y/%m/%d'),
        'language': lambda self, response, item: self.language if hasattr(self, 'language') else None
    })
    if self.extract_logo:
        self.default_job_field_getters.update({'autoextracted_logo_urls': self.get_logos})
    if self.extract_email:
        self.default_job_field_getters.update({'autoextracted_emails': self.get_emails})
    if self.extract_salary:
        self.default_job_field_getters.update({'autoextracted_salaries': self.get_salaries})
    if self.extract_website:
        self.default_job_field_getters.update({'autoextracted_company_websites': self.get_websites})
    self.default_fields = self.default_job_field_getters.keys()
    self.validate_parse_job_wrapper = validate(fields_to_check=self.required_fields)(type(self).parse_job_wrapper)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
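The keyword arguments checked above normally arrive either from the command line (scrapy crawl <spider> -a key=value) or from code that schedules the crawl programmatically. A minimal, hypothetical sketch of the latter, assuming the spider class above is importable as MiningSpider:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
# keyword arguments passed to crawl() are forwarded to MiningSpider.__init__ as **kwargs
process.crawl(MiningSpider, mining_job_id='42', site_id='7',
              url='http://www.example.com/jobs', robots_obey='0', download_delay='1.5')
process.start()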