本文整理匯總了Python中scrapy.spider.BaseSpider類的典型用法代碼示例。如果您正苦於以下問題:Python BaseSpider類的具體用法?Python BaseSpider怎麽用?Python BaseSpider使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了BaseSpider類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: __init__
def __init__(self, *arg1, **arg2):
    """Initialise the spider and register its named callbacks.

    All positional and keyword arguments are forwarded unchanged to
    ``BaseSpider.__init__``.
    """
    log.msg(message="man_spider, __init__", _level=log.INFO)
    BaseSpider.__init__(self, *arg1, **arg2)
    # Lookup table: callback key -> bound handler method of this spider.
    self.man_spider_callback = {
        'list': self.callback_list,
        'parse': self.callback_parse,
        'all': self.callback_all,
    }
示例2: __init__
def __init__(self):
    """Initialise the spider: override crawl settings and compile regexes.

    Sets the download delay, log file, stdout logging, download timeout
    and retry count through ``settings.overrides``, stores the base URL
    of the site, and pre-compiles the regular expressions kept on the
    instance as ``reobj_*`` attributes.
    """
    BaseSpider.__init__(self)

    # settings
    settings.overrides["DOWNLOAD_DELAY"] = 0
    settings.overrides["LOG_FILE"] = "scrapy.log"
    settings.overrides["LOG_STDOUT"] = True
    settings.overrides["DOWNLOAD_TIMEOUT"] = 180
    settings.overrides["RETRY_TIMES"] = 10

    # base url of all the pages
    self.base_url = "http://www.365zn.com/fyc/"

    # regex objects
    # example: <a href="fyc_h.htm"
    # The dot is escaped so that only a literal ".htm" suffix matches
    # (an unescaped "." would accept any character in that position).
    self.reobj_word_list_page = re.compile(r"fyc_\w+\.htm")
    # example: <a href=htm/11474.htm title='把持'>
    self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'")
    # synonyms line, example:
    # 【同義詞】 <font color=blue>胸有成竹 心中有數 穩操勝券</font>
    self.reobj_synonym = re.compile(r"【同義詞】\W+<font color=blue>([^<]*)</font>")
    # antonyms line, example:
    # 【反義詞】 <font color=red>心中無數 手忙腳亂</font>
    self.reobj_antonym = re.compile(r"【反義詞】\W+<font color=red>([^<]*)</font>")
    # chinese character(s): matched as runs of characters in \x80-\xff
    # (earlier alternative, kept for reference: r"[\u4e00-\u9fa5]+")
    self.reobj_chinese = re.compile(r"[\x80-\xff]+")
示例3: __init__
def __init__(self):
    """Initialise the spider and build the image-search start URLs.

    Reads the word dictionary file (one entry per line, the word being
    the text before the first tab) and appends, for every word, one
    search URL per result page to ``self.start_urls``: offsets
    0, 20, 40, ... below ``self.num_images``.
    """
    BaseSpider.__init__(self)

    # settings
    settings.overrides['DOWNLOAD_DELAY'] = 0.1

    # regex object for extracting image url (dots escaped so that they
    # only match a literal "." in the host name)
    self.reobj_image = re.compile(r"http://\S+\.gstatic\.com[^\"\s]+")
    self.num_images_per_page = 20
    self.num_images = 200

    # initialize word searching url list
    self.base_url = "http://images.google.com/search?tbm=isch&safe=off"
    # The context manager guarantees the dictionary file is closed, and
    # iterating the handle avoids loading every line into memory first.
    with open(r'SogouLabDic_tab_utf8_linux.dic') as f_word_dict:
        print("initialize image searching urls")
        for word_line in f_word_dict:
            word, tab, _rest = word_line.partition("\t")
            if not tab:
                # Line without a tab separator (e.g. a blank line):
                # there is no word field to search for, so skip it.
                continue
            for start in range(0, self.num_images, self.num_images_per_page):
                self.start_urls.append(
                    self.base_url + "&q=" + word + "&start=" + str(start)
                )
    print("created " + str(len(self.start_urls)) + " image searching urls.")
示例4: __init__
def __init__(self, **kwargs):
    """Initialise the spider with one Wikipedia year page per start URL.

    Keyword arguments:
        outDir: output directory; a trailing '/' is appended if missing.
        startYear: first year to crawl (negative values are years BC).
        endYear: last year to crawl, inclusive; must be >= startYear.

    Years up to and including 500 BC are visited in steps of ten starting
    from startYear; 499 BC to 1 BC and all AD years are visited one by
    one. There is no year 0. For every page a directory of the same name
    is created under outDir.

    If an argument is missing or invalid, a usage message is written to
    stderr and the process exits with status 1.
    """
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        # Explicit check instead of `assert`, which is stripped under -O.
        if startYear > endYear:
            raise ValueError("startYear must not be after endYear")
    except (KeyError, IndexError, TypeError, ValueError):
        # Only argument problems are handled here, so SystemExit and
        # KeyboardInterrupt are no longer swallowed by a bare except.
        sys.stderr.write("eventSpider needs 3 arguments: outDir, startYear, endYear\n")
        sys.exit(1)

    startingAdd = "https://en.wikipedia.org/wiki/"

    # Clamp the requested span to each of the three eras. Every upper
    # bound is inclusive, so endYear itself is crawled in all eras and
    # the boundary years (-500 and 0 as startYear) are handled.
    bc_sparse = range(startYear, min(endYear, -500) + 1, 10)
    bc_yearly = range(max(startYear, -499), min(endYear, -1) + 1)
    ad_yearly = range(max(startYear, 1), endYear + 1)

    pages = [str(-year) + "_BC" for year in bc_sparse]
    pages += [str(-year) + "_BC" for year in bc_yearly]
    pages += [str(year) for year in ad_yearly]

    self.start_urls = []
    for page in pages:
        self.start_urls.append(startingAdd + page)
        path = self.outDir + page + "/"
        if not os.path.exists(path):
            os.makedirs(path)
示例5: __init__
def __init__(self):
    """Initialise the spider with a sample of domains from 'top-1m.csv'.

    Reads the CSV row by row, taking the second column as the domain and
    the 1-based row position as its rank. Every ``selectionInterval``-th
    row whose rank is greater than ``skipSites`` is appended to both
    ``self.allowed_domains`` and ``self.start_urls``, until ``maxSites``
    entries have been queued.
    """
    BaseSpider.__init__(self)
    self.handle_httpstatus_list = range(0, 1000)
    self.requestCount = 0
    print('Opening Alexa URL CSV, please wait.')
    maxSites = 200000
    selectionInterval = 5  # Include every nth site
    skipSites = 861010  # Skip the first n sites
    queuedCount = 0
    # The context manager closes the file even if a row fails to parse.
    with open('top-1m.csv', 'r') as csv_file:
        for rank, line in enumerate(csv.reader(csv_file), 1):
            domain = line[1]
            if rank % selectionInterval == 0 and rank > skipSites:
                self.allowed_domains.append(domain)
                self.start_urls.append(domain)
                queuedCount += 1
                if queuedCount >= maxSites:
                    break
    print('Done opening URLs, starting crawler....')
示例6: get_defaults_spider_mw
def get_defaults_spider_mw(self):
    """Return a ``(defaults, spider, middleware)`` triple.

    ``defaults`` maps every key of the crawler's DEFAULT_REQUEST_HEADERS
    setting to a one-element list holding its value; ``spider`` is a
    ``BaseSpider`` named 'foo' bound to the crawler; ``middleware`` is a
    fresh ``DefaultHeadersMiddleware`` instance.
    """
    crawler = get_crawler()
    spider = BaseSpider('foo')
    spider.set_crawler(crawler)
    header_settings = crawler.settings.get('DEFAULT_REQUEST_HEADERS')
    defaults = {}
    for name, value in header_settings.iteritems():
        defaults[name] = [value]
    return defaults, spider, DefaultHeadersMiddleware()
示例7: test_rules_manager_callbacks
def test_rules_manager_callbacks(self):
    """Check how RulesManager resolves rule callbacks.

    A callback given as a callable is kept as is, a callback given as a
    string resolves to the spider attribute of that name, and an unknown
    or non-callable attribute name raises AttributeError.
    """
    mycallback = lambda: True

    spider = BaseSpider('foo')
    spider.parse_item = lambda: True

    response1 = HtmlResponse('http://example.org')
    response2 = HtmlResponse('http://othersite.org')

    rulesman = RulesManager([
        Rule('example', mycallback),
        Rule('othersite', 'parse_item'),
    ], spider, default_matcher=UrlRegexMatcher)

    rule1 = rulesman.get_rule_from_response(response1)
    rule2 = rulesman.get_rule_from_response(response2)

    # assertEqual replaces the deprecated failUnlessEqual alias.
    self.assertEqual(rule1.callback, mycallback)
    self.assertEqual(rule2.callback, spider.parse_item)

    # fail unknown callback
    self.assertRaises(AttributeError, RulesManager, [
        Rule(BaseMatcher(), 'mycallback')
    ], spider)

    # fail not callable
    spider.not_callable = True
    self.assertRaises(AttributeError, RulesManager, [
        Rule(BaseMatcher(), 'not_callable')
    ], spider)
示例8: __init__
def __init__(self, **kwargs):
    """Initialise the spider with Wikipedia year pages from 1500 BC on.

    Keyword arguments:
        outDir: output directory; a trailing '/' is appended if missing.
        endYear: last AD year to crawl, inclusive.

    Start URLs are, in order: every tenth year from 1500 BC down to
    500 BC, every year from 499 BC down to 1 BC, then every year from
    1 AD to endYear. For every page a directory of the same name is
    created under outDir.

    If an argument is missing or invalid, a usage message is written to
    stderr and the process exits with status 1.
    """
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        self.endYear = int(kwargs['endYear'])
    except (KeyError, IndexError, TypeError, ValueError):
        # Only argument problems are handled here, so SystemExit and
        # KeyboardInterrupt are no longer swallowed by a bare except.
        # NOTE(review): the message mentions outFile, which this method
        # does not read - confirm against the rest of the class.
        sys.stderr.write("eventSpider needs 3 arguments: outDir, outFile, endYear\n")
        sys.exit(1)

    startingAdd = "http://en.wikipedia.org/wiki/"

    # Page names in crawl order; the three eras differ only in step
    # size and in the "_BC" suffix.
    pages = [str(year) + "_BC" for year in range(1500, 499, -10)]
    pages += [str(year) + "_BC" for year in range(499, 0, -1)]
    pages += [str(year) for year in range(1, self.endYear + 1)]

    self.start_urls = []
    for page in pages:
        self.start_urls.append(startingAdd + page)
        path = self.outDir + page + "/"
        if not os.path.exists(path):
            os.makedirs(path)
示例9: test_scheduler_persistent
def test_scheduler_persistent(self):
    """Check that a persistent scheduler keeps its queue across close/open.

    With ``persist`` enabled, requests enqueued before ``close`` are
    still counted after reopening and a resume message is logged; once
    ``persist`` is disabled, closing leaves the scheduler empty.
    """
    logged = []
    spider = BaseSpider('myspider')
    # Capture every spider.log call as [args, kwargs].
    spider.log = lambda *args, **kwargs: logged.append([args, kwargs])

    scheduler = self.scheduler
    scheduler.persist = True
    scheduler.open(spider)
    self.assertEqual(logged, [])

    for url in ('http://example.com/page1', 'http://example.com/page2'):
        scheduler.enqueue_request(Request(url))
    self.assertTrue(scheduler.has_pending_requests())
    scheduler.close('finish')

    # Reopening must report the two requests left over from before.
    scheduler.open(spider)
    expected_log = [
        [('Resuming crawl (2 requests scheduled)',), {}],
    ]
    self.assertEqual(logged, expected_log)
    self.assertEqual(len(scheduler), 2)

    scheduler.persist = False
    scheduler.close('finish')
    self.assertEqual(len(scheduler), 0)
示例10: __init__
def __init__(self):
    """Initialise the spider, start the display and open a Firefox browser."""
    BaseSpider.__init__(self)

    # Start the virtual display first
    # (comment this line out when running on a desktop).
    display.start()

    # Establish the browser session and keep it on the instance.
    firefox = webdriver.Firefox()
    self.browser = firefox
示例11: __init__
def __init__(self):
    """Initialise the spider with a profile-backed Firefox browser.

    Creates the Firefox profile and browser, resets the bookkeeping
    containers, and connects ``spider_opened`` / ``spider_closed`` to
    the signals of the same name.
    """
    BaseSpider.__init__(self)
    profile_dir = "C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme."

    self.verificationErrors = []
    self.profile = webdriver.FirefoxProfile(profile_dir)
    self.browser = webdriver.Firefox(self.profile)
    self.duplicatesurl = {}

    # Hook the lifecycle handlers up to their signals.
    handlers = (
        (self.spider_opened, signals.spider_opened),
        (self.spider_closed, signals.spider_closed),
    )
    for handler, signal in handlers:
        dispatcher.connect(handler, signal)
示例12: __init__
def __init__(self):
    """Initialise the spider with a Chrome browser driven by chromedriver.

    Creates the browser, resets the bookkeeping containers, and connects
    ``spider_opened`` / ``spider_closed`` to the signals of the same name.
    """
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # Firefox alternative, kept for reference:
    # self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme")
    # self.browser = webdriver.Firefox(self.profile)
    # Raw string: the Windows path contains backslashes (notably "\U"),
    # which must not be interpreted as escape sequences.
    self.browser = webdriver.Chrome(r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')
    self.duplicatesurl = {}
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
示例13: __init__
def __init__(self, domain_name=None):
    """Initialise the spider and its OAuth credentials.

    Reads ``consumer_key``, ``consumer_secret`` and ``app_token`` from
    the 'yammer' section of the configuration and builds the OAuth
    consumer, the PLAINTEXT signature method and the access token.
    """
    BaseSpider.__init__(self, domain_name)

    option_names = ('consumer_key', 'consumer_secret', 'app_token')
    key, secret, token_string = [
        config.get('yammer', option) for option in option_names
    ]

    self.consumer = OAuthConsumer(key, secret)
    self.signature = OAuthSignatureMethod_PLAINTEXT()
    self.token = OAuthToken.from_string(token_string)
示例14: __init__
def __init__(self, **kwargs):
    """Initialise the spider from an input file and reset the output file.

    Keyword arguments:
        infile: path handed to ``self.readFile``.
        outfile: path of the output file; it is created or truncated here.

    Raises:
        KeyError: if 'infile' or 'outfile' is not supplied.
    """
    BaseSpider.__init__(self)
    self.inFile = kwargs['infile']
    self.outFile = kwargs['outfile']
    self.start_urls = []
    self.url2locDic = {}
    self.readFile(self.inFile)
    # Open in "w" mode only to create/truncate the output file, and close
    # the handle right away (the call parentheses were missing before, so
    # the file was never explicitly closed).
    codecs.open(self.outFile, "w", encoding='utf-8').close()
示例15: __init__
def __init__(self):
    """Initialise the spider with the contact URLs listed in a CSV file.

    Reads ``self.contactsDataFile`` and appends the second column of
    every row to ``self.start_urls`` when it starts with 'https'. Rows
    with fewer than two columns (e.g. blank lines) are skipped.
    """
    BaseSpider.__init__(self)
    self.verificationErrors = []
    with open(self.contactsDataFile, 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        self.log('Initialing with contact urls from file : ' + self.contactsDataFile + ' ...')
        for row in csvreader:
            # Guard against short rows instead of failing with IndexError.
            if len(row) > 1 and row[1].startswith('https'):
                self.start_urls.append(row[1])
    self.log('Total contacts loaded : %d' % len(self.start_urls))