當前位置: 首頁>>代碼示例>>Python>>正文


Python spider.Spider類代碼示例

本文整理匯總了Python中spider.Spider的典型用法代碼示例。如果您正苦於以下問題:Python Spider類的具體用法?Python Spider怎麽用?Python Spider使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了Spider類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: main

def main():
    """Program entry point.

    Parse and validate the command-line arguments, configure the logger
    accordingly, create the thread pool and the spider, add worker
    threads to the pool, and let the spider feed crawl tasks into it.
    """
    # Fetch and validate the command-line arguments.
    args = base.get_arg()
    if not base.check_args(args):
        print 'Args error!'
        sys.exit()
    base.handle_args(args)

    # Configure the logger; bail out if the log file/level are unusable.
    if not base.set_logger(args.log_file, args.log_level):
        print 'Set logger error'
        sys.exit()
    logger.debug('Get args :%s' % args)

    # Self-test mode: run the built-in checks and exit.
    if args.test_self:
        base.test_self()
        sys.exit()

    database =  Sqlite3DB(args.db_file)

    # Create the spider and the thread pool. thread_num worker threads
    # consume tasks; the spider produces tasks into the pool.
    spider = Spider(args.url, args.depth, args.thread_num, args.key_word,
                    args.down_file, database)
    main_thread = MainThread(spider)
    main_thread.start()
    spider.start()
開發者ID:micheal-xudb,項目名稱:py-spider,代碼行數:32,代碼來源:main.py

示例2: work

def work():
    """Worker loop: consume crawl tasks from thread_queue forever.

    Each task is a dict carrying 'url' and 'distance'; it is handed to
    Spider.crawl_page tagged with this thread's name, then marked done
    so that queue joins can complete.
    """
    worker_name = threading.current_thread().name
    while True:
        task = thread_queue.get()
        Spider.crawl_page(worker_name, task['url'], task['distance'])
        thread_queue.task_done()
開發者ID:nenad1001,項目名稱:CrazyS,代碼行數:7,代碼來源:main.py

示例3: walk

    def walk(self, url, outfile):
        """Crawl *url* and write a sitemaps.org XML sitemap to *outfile*.

        Delegates the traversal to Spider.walk (which fills
        self.pageinfo and self.errors), then serializes the collected
        per-page lastmod/changefreq/priority data into a <urlset>
        document. Pages that produced errors are reported afterwards.
        """
        self.pageinfo = {}
        self.errors = []

        Spider.walk(self, url, self.iswebpage)

        print("\r[ ] Processed %i urls" % (len(self.pageinfo)))

        root = ET.Element(
            'urlset', {'xmlns': "http://www.sitemaps.org/schemas/sitemap/0.9"})
        for page, info in self.pageinfo.items():
            entry = ET.SubElement(root, 'url')
            # Child order matters for a stable sitemap: loc, lastmod,
            # changefreq, priority.
            ET.SubElement(entry, 'loc').text = page
            ET.SubElement(entry, 'lastmod').text = info['lastmod']
            ET.SubElement(entry, 'changefreq').text = info['change']
            ET.SubElement(entry, 'priority').text = '%0.1f' % info['pri']

        ET.ElementTree(root).write(outfile, encoding='utf-8',
                                   xml_declaration=True)

        if len(self.errors) > 0:
            print("[!] The following pages produced errors:")
            for e in self.errors:
                print("    %i %s" % (e[1], e[0]))

示例4: work

def work():
    """Worker loop: take URLs from the shared queue and crawl them.

    Runs forever; each URL is passed to Spider.crawl_page together with
    the current thread's name, then marked done on the queue.
    """
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
    # NOTE(review): the original had a trailing print after this
    # infinite loop; it was unreachable dead code and has been removed.
開發者ID:AllenDrake2016,項目名稱:Readme,代碼行數:7,代碼來源:main.py

示例5: downloadArchivesList

def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Set up downloader'''
    queue = initDownloader(numThreads)

    import csv
    f = open(aList, 'rb')
    reader = csv.reader(f)
    for row in reader:
        startURL = row[0]
        
        mlName = startURL.split('/')[-2]
        
        spider = Spider(startURL)
        spider.process_page(startURL)
            
        '''Only the links to archive files are interesting:
        mailing list archive file names end with '.txt.gz' '''
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        if len(urlList):
            print '%s: %d archives' % (mlName, len(urlList))
            store = os.path.join(container, mlName)
            if not (os.path.isdir(store)):
                    os.system("mkdir %s" % store)
                
            '''Download each archive'''
            addToQ(queue, urlList, store)
                        
    '''If here, download finished. Stop threads'''
    stopDownloader(queue, numThreads)
開發者ID:cupescapone,項目名稱:miningTools,代碼行數:29,代碼來源:mlArchivesDownloader.py

示例6: __init__

 def __init__(self, **kwargs):
     """Force a few spider options, open the output file, then
     initialise the Spider base class with the adjusted keyword args.
     """
     # Rebirth on, proxying off, and allow up to 8 login attempts.
     kwargs['enable_reborn'] = True
     kwargs['enable_proxy'] = False
     kwargs['max_login_tries'] = 8
     #kwargs['ips_obj'] = self.ips_obj
     # NOTE(review): out.txt is opened here and never explicitly closed;
     # presumably it lives for the lifetime of the spider — confirm.
     self.out = open('out.txt', 'w+')
     self.login_status = False
     Spider.__init__(self, **kwargs)
開發者ID:vv1133,項目名稱:spider_engine,代碼行數:8,代碼來源:example_brand.py

示例7: run

 def run(self):
     """Thread body: poll the weibo spider forever.

     Stops the thread if the spider failed to log in; otherwise fetches
     updates every crawl_interval seconds and pushes any new items to
     the news-wall callbacks.
     """
     sp = Spider()
     if not sp.login_succeed:
         self.stop()
     else:
         while True:
             new_stuff = sp.update()
             if len(new_stuff) > 0:
                 print str(len(new_stuff)) + " weibos to update"
                 glob.newswall.notifyCallbacks(new_stuff)
             time.sleep(crawl_interval)

示例8: spider

 def spider(self):
     """Fetch self.url and return it parsed as an lxml HTML tree.

     Returns None when the page does not exist (HTTP "Not Found").
     """
     # Add a Cache-Control header to the request headers.
     s = Spider(additional_headers={'Cache-Control': 'max-age=0'})
     try:
         s.fetch(self.url)
     except HTTPError as e:
         # Check whether the movie-related page exists at all.
         if e.msg == 'Not Found':
             return
     # Decode before parsing: once Chinese text is utf-8 encoded it
     # appears as '/u2541'-style escapes, and lxml treats any "/" as
     # closing the current tag.
     return etree.HTML(s.content.decode('utf-8'))
開發者ID:StevenLOL,項目名稱:Mtime,代碼行數:11,代碼來源:parse.py

示例9: main

def main():
    """Entry point: parse command-line arguments and start the spider.

    NOTE(review): log_level, log_file, key, db_file and test_self are
    parsed and converted here but never passed to Spider — either dead
    code or an unfinished feature; confirm against command_parser().
    """
    args = command_parser()
    target_url = args.target_url[0]
    depth = int(args.depth[0])
    log_level = int(args.log_level)
    log_file = args.log_file
    thread_number = int(args.thread_number)
    key = args.key
    db_file = args.db_file
    test_self = args.test_self
    # Only the URL, depth and thread count are actually used.
    spider = Spider(target_url, depth=depth, thread_number=thread_number)
    spider.start()
開發者ID:franciumzh,項目名稱:spider,代碼行數:12,代碼來源:main.py

示例10: __init__

    def __init__(self, master):
        """Build the NBA teams GUI: conference/division frames plus the
        east/west ranking lists, then populate them with data scraped
        by Spider and structured by Analyzer.
        """
        self.master = master

        # Top-level frames for the two conferences (east / west).
        east_group = LabelFrame(master, text='東部')
        east_group.grid(row=0, column=0, padx=5, pady=5)
        west_group = LabelFrame(master, text='西部')
        west_group.grid(row=1, column=0, padx=5, pady=5)

        # Eastern-conference ranking list.
        east_ranking = LabelFrame(master, text='東部排名')
        east_ranking.grid(row=0, column=1, rowspan=2, padx=5, pady=5, sticky=N)
        self.east_ranking_list = self.creat_teams_ranking_list(east_ranking)

        # Western-conference ranking list.
        west_ranking = LabelFrame(master, text='西部排名')
        west_ranking.grid(row=0, column=2, rowspan=2, padx=5, pady=5, sticky=N)
        self.west_ranking_list = self.creat_teams_ranking_list(west_ranking)

        # Eastern-conference divisions (Atlantic / Central / Southeast).
        atlantic_group = LabelFrame(east_group, text='大西洋區')
        atlantic_group.grid(row=0, column=0, padx=5, pady=5)
        central_group = LabelFrame(east_group, text='中部區')
        central_group.grid(row=0, column=1, padx=5, pady=5)
        southeast_group = LabelFrame(east_group, text='東南區')
        southeast_group.grid(row=0, column=2, padx=5, pady=5)

        # Western-conference divisions (Pacific / Southwest / Northwest).
        pacific_group = LabelFrame(west_group, text='太平洋區')
        pacific_group.grid(row=1, column=0, padx=5, pady=5)
        southwest_group = LabelFrame(west_group, text='西南區')
        southwest_group.grid(row=1, column=1, padx=5, pady=5)
        northwest_group = LabelFrame(west_group, text='西北區')
        northwest_group.grid(row=1, column=2, padx=5, pady=5)


        # Scrape the raw team index and ranking data.
        spider = Spider()
        index_data = spider.load_teams_index()
        teams_ranking_data = spider.load_teams_ranking()

        # Turn the raw scrape results into structured team data.
        analyzer = Analyzer()
        teams_data = analyzer.analyze_teams_data(index_data)
        self.teams_ranking = analyzer.analyze_teams_ranking(teams_ranking_data)

        self.load_teams_ranking()

        # Five teams per division, sliced from the flat 30-team list.
        # NOTE(review): the slice-to-division mapping is assumed to
        # match the scraped index ordering — confirm upstream.
        self.teams_logo = utils.load_teams_logos()
        self.load_group(atlantic_group, teams_data[0:5])
        self.load_group(pacific_group, teams_data[5:10])
        self.load_group(central_group, teams_data[10:15])
        self.load_group(southwest_group, teams_data[15:20])
        self.load_group(southeast_group, teams_data[20:25])
        self.load_group(northwest_group, teams_data[25:30])
開發者ID:Yuanlimakefun,項目名稱:NBATeams,代碼行數:52,代碼來源:app_frame.py

示例11: work

def work():
    """Worker loop: crawl queued URLs and record url/title pairs.

    Pulls a URL from the shared queue, lets Spider.crawl_page store its
    title in the 'url_title_rel' table of the database at DB_FILE_PATH,
    then marks the task done.
    """
    table_name = 'url_title_rel'
    thread_name = threading.current_thread().name
    while True:
        page_url = queue.get()
        Spider.crawl_page(thread_name, page_url, DB_FILE_PATH, table_name)
        queue.task_done()
開發者ID:Changjinxing,項目名稱:titleCrawler,代碼行數:7,代碼來源:main.py

示例12: grab_crawler

def grab_crawler(data):
    """Configure a Spider from the *data* dict and run it.

    data must provide 'site_url', 'image_count' and 'image_type'.
    """
    crawler = Spider()
    crawler.initial_urls = [data['site_url']]
    crawler.total = data['image_count']
    crawler.result_status = 'inprogress'
    crawler.image_type = data['image_type']
    crawler.run()
開發者ID:NewOldMax,項目名稱:image-grabber,代碼行數:7,代碼來源:app.py

示例13: create_spider

    def create_spider(self):
        """Build a Spider from the XML config file self._filename.

        Reads start pages, allowed domains and the maximum crawl depth
        from the elements named by self._parameters / self._page /
        self._domain / self._depth.

        Returns the configured Spider (left unconfigured when the
        parameters element is absent).
        """
        spider = Spider()

        xml = parse(self._filename)
        # getElementsByTagName returns a (possibly empty) NodeList,
        # never None — the original tested "is not None", which always
        # passed and then crashed with IndexError on an empty list.
        params = xml.getElementsByTagName(self._parameters)
        if params:
            params = params[0]

            for page in params.getElementsByTagName(self._page):
                print(page.firstChild.data)
                spider.add_url(page.firstChild.data)

            for domain in params.getElementsByTagName(self._domain):
                print(domain.firstChild.data)
                spider.add_domain(domain.firstChild.data)

            depth = params.getElementsByTagName(self._depth)
            if depth:  # same NodeList fix as above
                depth = depth[0]
                print(depth.firstChild.data)
                spider.set_max_depth(depth.firstChild.data)

        return spider
開發者ID:2gisprojectT,項目名稱:terehov-soundcloud,代碼行數:25,代碼來源:spider_xml_factory.py

示例14: work

def work():
    """Worker loop: crawl URLs from the shared queue forever.

    Fixes two defects in the original: queue.task.done() was a typo for
    queue.task_done(), and it sat after the infinite loop where it could
    never run — it must be called once per processed item.
    """
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        # Mark the item done inside the loop so queue.join() can return.
        queue.task_done()





    
    
    
    
    
    
    
    
    
開發者ID:99sbr,項目名稱:Scrapy-Spider,代碼行數:5,代碼來源:main.py

示例15: TestSpider

class TestSpider(unittest.TestCase):
    """Unit tests for spider.Spider using the aladinfoods.bg domain."""

    def setUp(self):
        # Fresh spider bound to the test domain before every test.
        self.test_spider = Spider("aladinfoods.bg")

    def test_spider_init(self):
        # A new spider starts with no scanned URLs and keeps its domain.
        self.assertEqual(self.test_spider.scaned_url, [])
        self.assertEqual(self.test_spider.domain, "aladinfoods.bg")

    def test_is_outgoing(self):
        # A URL on the spider's own domain is not outgoing.
        self.assertFalse(self.test_spider.is_outgoing("http://aladinfoods.bg"))

    def test_is_not_outgoing(self):
        # A URL on a foreign domain is outgoing.
        self.assertTrue(self.test_spider.is_outgoing("http://hackbulgaria.com"))

    def test_is_valid(self):
        # In-domain URLs are valid crawl targets.
        self.assertTrue(self.test_spider.is_valid("http://aladinfoods.bg/menu"))

    def test_is_not_valid(self):
        # Foreign-domain URLs are not valid crawl targets.
        self.assertFalse(self.test_spider.is_valid("http://hackbulgaria.com"))


注:本文中的spider.Spider類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。