

Python spider.BaseSpider Class Code Examples

This article collects typical usage examples of the scrapy.spider.BaseSpider class in Python. If you are wondering what BaseSpider is for, how to use it, or what idiomatic usage looks like, the curated class examples below may help.


Fifteen code examples of the BaseSpider class are shown below, sorted by popularity by default.

Example 1: __init__

    def __init__(self, *args, **kwargs):
        # scrapy's old log API takes 'level', not '_level'
        log.msg("man_spider, __init__", level=log.INFO)
        BaseSpider.__init__(self, *args, **kwargs)
        self.man_spider_callback = {}
        self.man_spider_callback['list'] = self.callback_list
        self.man_spider_callback['parse'] = self.callback_parse
        self.man_spider_callback['all'] = self.callback_all
Developer: smart--petea, Project: scrapy_manning, Lines: 7, Source: man_spider.py
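
The callback map built above suggests a dispatch pattern where each page type routes to its own handler. A minimal sketch of how `parse` might consume such a map follows; the `page_type` meta key and the default are assumptions, not part of the original project:

    def parse(self, response):
        # 'page_type' is a hypothetical key set on the originating Request's
        # meta dict; fall back to the catch-all handler.
        page_type = response.meta.get('page_type', 'all')
        return self.man_spider_callback[page_type](response)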

Example 2: __init__

    def __init__(self):
        BaseSpider.__init__(self)

        # settings
        settings.overrides["DOWNLOAD_DELAY"] = 0
        settings.overrides["LOG_FILE"] = "scrapy.log"
        settings.overrides["LOG_STDOUT"] = True
        settings.overrides["DOWNLOAD_TIMEOUT"] = 180
        settings.overrides["RETRY_TIMES"] = 10

        # base url of all the pages
        self.base_url = "http://www.365zn.com/fyc/"

        # regex objects

        # example: <a href="fyc_h.htm"
        self.reobj_word_list_page = re.compile(r"fyc_\w+\.htm")

        # example: <a href=htm/11474.htm title='把持'>
        self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'")

        # 【同義詞】 <font color=blue>胸有成竹&nbsp;&nbsp;心中有數&nbsp;&nbsp;穩操勝券</font>
        self.reobj_synonym = re.compile(r"【同義詞】\W+<font color=blue>([^<]*)</font>")

        # 【反義詞】 <font color=red>心中無數&nbsp;&nbsp;手忙腳亂</font>
        self.reobj_antonym = re.compile(r"【反義詞】\W+<font color=red>([^<]*)</font>")

        # chinese character(s)
        #        self.reobj_chinese = re.compile(r"[\u4e00-\u9fa5]+")
        self.reobj_chinese = re.compile(r"[\x80-\xff]+")
Developer: czhedu, Project: MAM, Lines: 30, Source: 365zn_chinese_treasure.py
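
A sketch of how the synonym regex above might be applied in a callback, working on the raw page body as the rest of the snippet does (the method body is illustrative, not from the original project):

    def parse(self, response):
        # Pull the <font color=blue>...</font> group captured by reobj_synonym;
        # entries on the page are separated by &nbsp;&nbsp;
        match = self.reobj_synonym.search(response.body)
        if match:
            synonyms = match.group(1).split("&nbsp;&nbsp;")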

Example 3: __init__

    def __init__(self):
        BaseSpider.__init__(self)

        # settings 
        settings.overrides['DOWNLOAD_DELAY'] = 0.1

        # regex object for extracting image url
        self.reobj_image = re.compile(r"http://\S+\.gstatic\.com[^\"\s]+")

        self.num_images_per_page = 20
        self.num_images = 200

        # initialize word searching url list
        self.base_url = "http://images.google.com/search?tbm=isch&safe=off"

        f_word_dict = file(r'SogouLabDic_tab_utf8_linux.dic')
#        f_word_dict = file(r'test_dict') 
        word_lines = f_word_dict.readlines()

        print "initialize image searching urls"
        for word_line in word_lines:
            word = word_line[ : word_line.index("\t")]

            start = 0 
            while start < self.num_images:
                self.start_urls.append( self.base_url + 
                                        "&q=" + word + 
                                        "&start=" + str(start)
                                      )
                start += self.num_images_per_page
        print "created " + str( len(self.start_urls) ) + " image searching urls."
Developer: czhedu, Project: MAM, Lines: 31, Source: win_chinese_word_image_spider.py
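
The loader above takes everything before the first tab as the word, so it assumes one dictionary entry per line with the word in the first tab-separated field. An illustrative line (the fields after the word are a guess at the Sogou format):

    <word>	<frequency>	<other fields>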

Example 4: __init__

    def __init__(self, **kwargs):
        BaseSpider.__init__(self)
        try:
            self.outDir = kwargs['outDir']
            if self.outDir[-1] != '/': self.outDir += '/'
            startYear = int(kwargs['startYear'])
            endYear = int(kwargs['endYear'])
            assert startYear <= endYear
        except:
            print >>sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
            exit(1)
        startingAdd = "https://en.wikipedia.org/wiki/"
        self.start_urls = []
        if startYear < -500:
            for i in range(startYear, min(-499, endYear), 10):
                add = startingAdd + str(-i) + "_BC"
                self.start_urls.append(add)
                path = self.outDir + str(-i) + "_BC/"
                if not os.path.exists(path): os.makedirs(path)
            if endYear > -500: startYear = -499
        if startYear > -500 and startYear < 0:
            for i in range(max(startYear, -499), min(0, endYear), 1):
                add = startingAdd + str(-i) + "_BC"
                self.start_urls.append(add)
                path = self.outDir + str(-i) + "_BC/"
                if not os.path.exists(path): os.makedirs(path)
            if endYear > 0: startYear = 1
        if startYear > 0:
            for i in range(startYear, endYear + 1):
                add = startingAdd + str(i)
                self.start_urls.append(add)
                path = self.outDir + str(i) + "/"
                if not os.path.exists(path): os.makedirs(path)
Developer: avacariu, Project: lensingwikipedia, Lines: 33, Source: eventSpider.py
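
Spider keyword arguments like these are normally supplied on the command line with scrapy's `-a` option, which forwards each `key=value` pair to `__init__`. The spider name and paths below are placeholders:

    scrapy crawl eventSpider -a outDir=/tmp/events/ -a startYear=-1000 -a endYear=2000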

Example 5: __init__

    def __init__(self):
        BaseSpider.__init__(self)
        
        self.handle_httpstatus_list = range(0,1000)
        self.requestCount = 0
        
        print 'Opening Alexa URL CSV, please wait.'
        maxSites = 200000
        selectionInterval = 5   #Include every nth site
        skipSites = 861010      #Skip the first n sites
        
        csv_file = open('top-1m.csv','r') 
        alexaReader = csv.reader(csv_file)
        
        rank=1
        queuedCount = 0
        for line in alexaReader :
            domain = line[1]
            if (rank % selectionInterval) == 0 and rank > skipSites:
                self.allowed_domains.append( domain )
                self.start_urls.append('http://' + domain)  # scrapy requires an absolute URL with a scheme
                queuedCount = queuedCount + 1
                if (queuedCount >= maxSites) :
                    break

            rank += 1
        
        csv_file.close()
        print 'Done opening URLs, starting crawler....'
Developer: tbook, Project: comp527-serversurvey, Lines: 29, Source: SurveySpider.py
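
Because `handle_httpstatus_list` spans every status code, error responses are delivered to the callback instead of being filtered out. A minimal callback for a survey like this might look as follows (the body is an assumption, not the project's actual parse method):

    def parse(self, response):
        # 4xx/5xx responses arrive here too, thanks to handle_httpstatus_list
        self.requestCount += 1
        self.log('%s responded with %d' % (response.url, response.status))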

Example 6: get_defaults_spider_mw

    def get_defaults_spider_mw(self):
        crawler = get_crawler()
        spider = BaseSpider('foo')
        spider.set_crawler(crawler)
        defaults = dict([(k, [v]) for k, v in
                         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
        return defaults, spider, DefaultHeadersMiddleware()
Developer: Aaron1011, Project: oh-mainline, Lines: 7, Source: test_downloadermiddleware_defaultheaders.py
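
This helper feeds the middleware tests; the companion test typically constructs a request, runs it through the middleware, and checks that the default headers were applied (a sketch along the lines of scrapy's own test suite):

    def test_process_request(self):
        defaults, spider, mw = self.get_defaults_spider_mw()
        req = Request('http://www.scrapytest.org')
        mw.process_request(req, spider)
        self.assertEquals(req.headers, defaults)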

Example 7: test_rules_manager_callbacks

    def test_rules_manager_callbacks(self):
        mycallback = lambda: True

        spider = BaseSpider('foo')
        spider.parse_item = lambda: True

        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager([
            Rule('example', mycallback),
            Rule('othersite', 'parse_item'),
                ], spider, default_matcher=UrlRegexMatcher)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # fail unknown callback
        self.assertRaises(AttributeError, RulesManager, [
                            Rule(BaseMatcher(), 'mycallback')
                            ], spider)
        # fail not callable
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager, [
                            Rule(BaseMatcher(), 'not_callable')
                            ], spider)
Developer: bihicheng, Project: scrapy, Lines: 29, Source: test_contrib_exp_crawlspider_rules.py

Example 8: __init__

    def __init__(self, **kwargs):
        BaseSpider.__init__(self)
        try:
            self.outDir = kwargs['outDir']
            if self.outDir[-1] != '/': self.outDir += '/'
            self.endYear = int(kwargs['endYear'])
        except:
            # only outDir and endYear are actually read above
            print >>sys.stderr, "eventSpider needs 2 arguments: outDir, endYear"
            exit(1)
        startingAdd = "http://en.wikipedia.org/wiki/"
        self.start_urls = []
#       self.start_urls = [startingAdd+"2011"]
#       if not os.path.exists(self.outDir+"2011"):   os.makedirs(self.outDir+"2011")
        for i in range(1500, 499, -10):
            add = startingAdd + str(i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        for i in range(499, 0, -1):
            add = startingAdd + str(i) + "_BC"
            self.start_urls.append(add)
            path = self.outDir + str(i) + "_BC/"
            if not os.path.exists(path): os.makedirs(path)
        for i in range(1, self.endYear + 1):
            add = startingAdd + str(i)
            self.start_urls.append(add)
            path = self.outDir + str(i) + "/"
            if not os.path.exists(path): os.makedirs(path)
Developer: mhscientist, Project: lensingwikipedia, Lines: 28, Source: eventSpider.py

Example 9: test_scheduler_persistent

    def test_scheduler_persistent(self):
        messages = []
        spider = BaseSpider('myspider')
        spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

        self.scheduler.persist = True
        self.scheduler.open(spider)

        self.assertEqual(messages, [])

        self.scheduler.enqueue_request(Request('http://example.com/page1'))
        self.scheduler.enqueue_request(Request('http://example.com/page2'))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.scheduler.close('finish')

        self.scheduler.open(spider)
        self.assertEqual(messages, [
            [('Resuming crawl (2 requests scheduled)',), {}],
        ])
        self.assertEqual(len(self.scheduler), 2)

        self.scheduler.persist = False
        self.scheduler.close('finish')

        self.assertEqual(len(self.scheduler), 0)
Developer: Mondego, Project: pyreco, Lines: 26, Source: allPythonContent.py

Example 10: __init__

    def __init__(self):
        BaseSpider.__init__(self)
        # start the virtual display
        # (comment this out if you are running on a desktop session)
        display.start()

        # launch the browser
        self.browser = webdriver.Firefox()
Developer: MTN111, Project: AWO121-Full, Lines: 8, Source: twitterscrape.py
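
The module-level `display` object is not shown in the excerpt; with pyvirtualdisplay it would typically be created along these lines (visibility and size are assumptions):

    from pyvirtualdisplay import Display

    # virtual X display so Firefox can run on a headless server
    display = Display(visible=0, size=(1024, 768))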

Example 11: __init__

    def __init__(self):
        BaseSpider.__init__(self)
        self.verificationErrors = []
        self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme.")
        self.browser = webdriver.Firefox(self.profile)

        self.duplicatesurl = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Developer: Suluo, Project: UGC-profile-identify, Lines: 9, Source: pachong.py

Example 12: __init__

    def __init__(self):
        BaseSpider.__init__(self)
        self.verificationErrors = []
        # self.profile = webdriver.FirefoxProfile("C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme")
        # self.browser = webdriver.Firefox(self.profile)
        self.browser = webdriver.Chrome(r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')  # raw string so the backslashes are not treated as escapes

        self.duplicatesurl = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Developer: Suluo, Project: UGC-profile-identify, Lines: 10, Source: pachong.py
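
Both selenium-based examples connect a `spider_closed` handler through the dispatcher; a typical implementation simply releases the browser when the crawl ends (the body is assumed, only the signal wiring appears in the excerpts):

    def spider_closed(self, spider):
        # release the WebDriver session once the spider finishes
        self.browser.quit()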

Example 13: __init__

    def __init__(self, domain_name=None):
        BaseSpider.__init__(self, domain_name)

        consumer_key    = config.get('yammer', 'consumer_key')
        consumer_secret = config.get('yammer', 'consumer_secret')
        app_token       = config.get('yammer', 'app_token')

        self.consumer  = OAuthConsumer(consumer_key, consumer_secret)
        self.signature = OAuthSignatureMethod_PLAINTEXT()
        self.token     = OAuthToken.from_string(app_token)
Developer: c0ffee, Project: couch-crawler, Lines: 10, Source: yammer.py
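
The `config.get('yammer', ...)` calls point to a ConfigParser-style INI file; a plausible layout, with placeholder values (the actual file name is not shown in the excerpt):

    [yammer]
    consumer_key    = YOUR_CONSUMER_KEY
    consumer_secret = YOUR_CONSUMER_SECRET
    app_token       = oauth_token=YOUR_TOKEN&oauth_token_secret=YOUR_SECRET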

Example 14: __init__

    def __init__(self, **kwargs):
        BaseSpider.__init__(self)
        startingAdd = "http://en.wikipedia.org/wiki/"
        self.inFile = kwargs['infile']
        self.outFile = kwargs['outfile']
        self.start_urls = []
        self.url2locDic = {}
        self.readFile(self.inFile)
        # truncate/create the output file
        fout = codecs.open(self.outFile, "w", encoding='utf-8')
        fout.close()
Developer: mhscientist, Project: lensingwikipedia, Lines: 10, Source: nerLocSpider.py

Example 15: __init__

    def __init__(self):
        BaseSpider.__init__(self)
        self.verificationErrors = []
        with open(self.contactsDataFile, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            self.log('Initializing with contact urls from file: ' + self.contactsDataFile + ' ...')
            for row in csvreader:
                if row[1].startswith('https'):
                    self.start_urls.append(row[1])

        self.log('Total contacts loaded : %d' % len(self.start_urls))
Developer: venkatra, Project: JigsawScrapper, Lines: 11, Source: RetrieveContactDetails.py
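
The loader reads the contact URL from the second column of each CSV row; an illustrative input line (both values are placeholders):

    John Doe,https://www.example.com/contacts/12345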


Note: The scrapy.spider.BaseSpider class examples in this article were compiled by 純淨天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, so consult each project's License before redistributing or reusing the code. Do not reproduce this article without permission.