This page collects typical code examples for the Python method scrapy.spider.BaseSpider.__init__. If you are asking yourself what BaseSpider.__init__ does, how to call it, or what it looks like in real projects, the curated examples below should help. You can also explore further usage examples of the method's enclosing class, scrapy.spider.BaseSpider.
The following shows 15 code examples of BaseSpider.__init__, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
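All fifteen examples follow the same pattern: a subclass overrides __init__, delegates to BaseSpider.__init__ first so the base class is set up, and then configures its own state. A minimal sketch of that pattern (the spider name and URL are hypothetical, not taken from any example below):

from scrapy.spider import BaseSpider

class ExampleSpider(BaseSpider):
    name = 'example'  # hypothetical spider name

    def __init__(self, *args, **kwargs):
        # delegate to the base class first, then add instance state
        BaseSpider.__init__(self, *args, **kwargs)
        self.start_urls = ['http://example.com/']  # hypothetical URL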
Example 1: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    # accept every HTTP status code so no response gets filtered out
    self.handle_httpstatus_list = range(0, 1000)
    self.requestCount = 0
    print 'Opening Alexa URL CSV, please wait.'
    maxSites = 200000
    selectionInterval = 5   # include every nth site
    skipSites = 861010      # skip the first n sites
    csv_file = open('top-1m.csv', 'r')  # assumes: import csv
    alexaReader = csv.reader(csv_file)
    rank = 1
    queuedCount = 0
    for line in alexaReader:
        domain = line[1]
        if (rank % selectionInterval) == 0 and rank > skipSites:
            self.allowed_domains.append(domain)
            # start_urls entries need a scheme; the original appended the
            # bare domain, which Scrapy's Request would reject
            self.start_urls.append('http://' + domain)
            queuedCount += 1
            if queuedCount >= maxSites:
                break
        rank += 1
    csv_file.close()
    print 'Done opening URLs, starting crawler....'
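The loop only assumes the standard top-1m.csv layout of rank,domain pairs; two illustrative rows (not from the real file):

1,example.com
2,example.org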
Example 2: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    # settings (settings.overrides is the old pre-1.0 Scrapy settings API)
    settings.overrides["DOWNLOAD_DELAY"] = 0
    settings.overrides["LOG_FILE"] = "scrapy.log"
    settings.overrides["LOG_STDOUT"] = True
    settings.overrides["DOWNLOAD_TIMEOUT"] = 180
    settings.overrides["RETRY_TIMES"] = 10
    # base url of all the pages
    self.base_url = "http://www.365zn.com/fyc/"
    # regex objects
    # example: <a href="fyc_h.htm">  (dot escaped; the original "fyc_\w+.htm" let "." match any character)
    self.reobj_word_list_page = re.compile(r"fyc_\w+\.htm")
    # example: <a href=htm/11474.htm title='把持'>
    self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'")
    # matches synonym lines like: 【同义词】 <font color=blue>胸有成竹 心中有数 稳操胜券</font>
    self.reobj_synonym = re.compile(r"【同义词】\W+<font color=blue>([^<]*)</font>")
    # matches antonym lines like: 【反义词】 <font color=red>心中无数 手忙脚乱</font>
    self.reobj_antonym = re.compile(r"【反义词】\W+<font color=red>([^<]*)</font>")
    # Chinese character(s), matched as high bytes in the page's encoding
    # self.reobj_chinese = re.compile(r"[\u4e00-\u9fa5]+")
    self.reobj_chinese = re.compile(r"[\x80-\xff]+")
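A sketch of how these patterns might be applied in the spider's parse callback; the method body below is an assumption for illustration, not part of the excerpt:

def parse(self, response):
    # response.body holds the raw page; the captured group is a
    # space-separated word list inside the <font> tag
    match = self.reobj_synonym.search(response.body)
    if match:
        synonyms = match.group(1).split()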
Example 3: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    # settings
    settings.overrides['DOWNLOAD_DELAY'] = 0.1
    # regex object for extracting image urls (dots escaped; the original
    # "\S+.gstatic.com" let "." match any character)
    self.reobj_image = re.compile(r"http://\S+\.gstatic\.com[^\"\s]+")
    self.num_images_per_page = 20
    self.num_images = 200
    # initialize word searching url list
    self.base_url = "http://images.google.com/search?tbm=isch&safe=off"
    f_word_dict = open(r'SogouLabDic_tab_utf8_linux.dic')  # open(), not the deprecated file()
    # f_word_dict = open(r'test_dict')
    word_lines = f_word_dict.readlines()
    f_word_dict.close()
    print "initialize image searching urls"
    for word_line in word_lines:
        word = word_line[:word_line.index("\t")]  # the word precedes the first tab
        start = 0
        while start < self.num_images:
            self.start_urls.append(self.base_url +
                                   "&q=" + word +
                                   "&start=" + str(start))
            start += self.num_images_per_page
    print "created " + str(len(self.start_urls)) + " image searching urls."
Example 4: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/': self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        assert startYear <= endYear
    except (KeyError, ValueError, AssertionError):  # narrowed from a bare except
        print >>sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
        sys.exit(1)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.start_urls = []
    # before 500 BC the crawl steps by decade
    if startYear < -500:
        for i in range(startYear, min(-499, endYear), 10):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        if endYear > -500: startYear = -499
    # 499 BC to 1 BC: one page per year
    if startYear > -500 and startYear < 0:
        for i in range(max(startYear, -499), min(0, endYear), 1):
            add = startingAdd + str(-i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        if endYear > 0: startYear = 1
    # AD years: one page per year
    if startYear > 0:
        for i in range(startYear, endYear + 1):
            add = startingAdd + str(i)
            self.start_urls.append(add)
            path = self.outDir + str(i) + "/"
            if not os.path.exists(path): os.makedirs(path)
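Spider kwargs like these are normally supplied on the command line with scrapy's -a option; a hypothetical invocation (the registered spider name is assumed to be eventSpider, matching the error message above):

scrapy crawl eventSpider -a outDir=out/ -a startYear=-1000 -a endYear=2000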
Example 5: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/': self.outDir += '/'
        self.endYear = int(kwargs['endYear'])
    except (KeyError, ValueError):  # narrowed from a bare except
        # the original message listed three arguments, but only two are read
        print >>sys.stderr, "eventSpider needs 2 arguments: outDir, endYear"
        sys.exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
    # self.start_urls = [startingAdd+"2011"]
    # if not os.path.exists(self.outDir+"2011"): os.makedirs(self.outDir+"2011")
    # 1500 BC down to 500 BC, stepping by decade
    for i in range(1500, 499, -10):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path): os.makedirs(path)
    # 499 BC to 1 BC, one page per year
    for i in range(499, 0, -1):
        add = startingAdd + str(i) + "_BC"
        self.start_urls.append(add)
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path): os.makedirs(path)
    # AD 1 through endYear
    for i in range(1, self.endYear + 1):
        add = startingAdd + str(i)
        self.start_urls.append(add)
        path = self.outDir + str(i) + "/"
        if not os.path.exists(path): os.makedirs(path)
Example 6: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, *arg1, **arg2):
    # old scrapy.log API; the keyword is level, not _level
    log.msg("man_spider, __init__", level=log.INFO)
    BaseSpider.__init__(self, *arg1, **arg2)
    # dispatch table mapping an action name to its handler
    self.man_spider_callback = {
        'list': self.callback_list,
        'parse': self.callback_parse,
        'all': self.callback_all,
    }
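A hedged sketch of how such a dispatch table might be consulted; where the action name comes from (here, the request meta) is an assumption for illustration:

def parse(self, response):
    action = response.meta.get('action', 'all')  # hypothetical meta key
    return self.man_spider_callback[action](response)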
Example 7: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    # start the virtual display (a module-level object, not shown in the excerpt);
    # comment this line out if you are running on a desktop
    display.start()
    # set up the browser
    self.browser = webdriver.Firefox()
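The module-level display is presumably a pyvirtualdisplay Display; a minimal sketch of the setup the snippet assumes (size and flags are illustrative):

from pyvirtualdisplay import Display

display = Display(visible=0, size=(800, 600))  # headless X display for Firefox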
Example 8: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # reuse an existing Firefox profile (path kept verbatim from the source)
    self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme.")
    self.browser = webdriver.Firefox(self.profile)
    self.duplicatesurl = {}
    # hook spider lifecycle signals to handlers defined on the spider
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
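The connected handlers are not part of the excerpt; a minimal sketch of the spider_closed handler that such Selenium-backed spiders typically use (an assumption, not the original code):

def spider_closed(self, spider):
    self.browser.quit()  # shut the browser down when the crawl ends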
Example 9: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme")
    # self.browser = webdriver.Firefox(self.profile)
    # raw string so the backslashes in the Windows path are not treated as escapes
    self.browser = webdriver.Chrome(r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')
    self.duplicatesurl = {}
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
Example 10: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFile = kwargs['outfile']
    self.start_urls = []
    self.url2locDic = {}
    self.readFile(self.inFile)
    # truncate/create the output file; close() needs parentheses
    # (the original wrote fout.close, which only references the method)
    fout = codecs.open(self.outFile, "w", encoding='utf-8')
    fout.close()
Example 11: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, domain_name=None):
    BaseSpider.__init__(self, domain_name)
    # read OAuth credentials from the config (old python-oauth library)
    consumer_key = config.get('yammer', 'consumer_key')
    consumer_secret = config.get('yammer', 'consumer_secret')
    app_token = config.get('yammer', 'app_token')
    self.consumer = OAuthConsumer(consumer_key, consumer_secret)
    self.signature = OAuthSignatureMethod_PLAINTEXT()
    self.token = OAuthToken.from_string(app_token)
Example 12: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # self.contactsDataFile is a class attribute defined outside the excerpt
    with open(self.contactsDataFile, 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        self.log('Initializing with contact urls from file : ' + self.contactsDataFile + ' ...')
        for row in csvreader:
            if row[1].startswith('https'):  # no need to compare against True
                self.start_urls.append(row[1])
    self.log('Total contacts loaded : %d' % len(self.start_urls))
Example 13: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, name=None, **kwargs):
    if not hasattr(self, 'start_urls'):
        self.start_urls = []
    # self.result_path is a class attribute defined outside the excerpt;
    # '?' must be percent-encoded in file:// URLs
    file_list = [i for i in os.listdir(
        self.result_path) if i.endswith('.html')]
    for i in file_list:
        path = os.path.join(self.result_path, i).replace('?', '%3F')
        url = 'file://%s' % path
        self.start_urls.append(url)
    # forward name and keyword arguments properly; the original passed
    # a single keyword argument literally named "kwargs"
    BaseSpider.__init__(self, name, **kwargs)
    self.item = Commonshop()
Example 14: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self):
    BaseSpider.__init__(self)  # initialize the base class with the original method (we are overriding '__init__()')
    self.verificationErrors = []
    # --- Suppress opening a visible browser window --------------------------------
    # Works only on Linux, because of the graphical dependencies...
    # self.display = Display(visible=0, backend='xvnb', size=(800, 600))
    # self.display = Display(visible=0, size=(800, 600))
    # self.display.start()
    # -------------------------------------------------------------------------------
    self.driver = webdriver.Firefox(self.disableImages())  # load the webdriver with the profile built by 'disableImages()'
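disableImages() is not part of the excerpt; per the trailing comment it returns a Firefox profile with image loading turned off. A sketch of what it might look like (an assumption, using a standard Firefox preference):

def disableImages(self):
    profile = webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # 2 = block all images
    return profile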
Example 15: __init__
# Required import: from scrapy.spider import BaseSpider [as alias]
# Or: from scrapy.spider.BaseSpider import __init__ [as alias]
def __init__(self, **kwargs):
    BaseSpider.__init__(self)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFileLoc = kwargs['outfileLoc']
    self.outFilePer = kwargs['outfilePer']
    self.start_urls = []
    self.url2locDic = {}
    self.url2urlDic = {}
    self.readFile(self.inFile)
    # truncate/create both output files; the original reassigned fout
    # without closing the first handle and wrote fout.close without ()
    fout = open(self.outFileLoc, "w")
    fout.close()
    fout = open(self.outFilePer, "w")
    fout.close()