

Python Crawler.Crawler Class Code Examples


This article collects and organizes typical usage examples of the Crawler.Crawler class in Python. If you are unsure how to use the Python Crawler class, what it is for, or what working code looks like, the curated class code examples below should help.


The following presents 15 code examples of the Crawler class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.

Example 1: get_article

    def get_article(self, url):
        crawler = Crawler()
        # get html data from url
        web_data = crawler.get_page(url)
        soup = BeautifulSoup(web_data, 'html.parser')

        # remove link news 
        [e.extract() for e in soup('div', {'class':'link_news'})]

        # article title
        self.title = soup('h3', {'id':'articleTitle'})[0].text

        # create date and time of article
        date_time = soup('span', {'class':'t11'})[0].text.split()
        self.date = date_time[0]
        self.time = date_time[1]

        # press name
        press_logo = soup('div', {'class':'press_logo'})[0]
        self.press = press_logo.find('img')['alt']
        del press_logo

        # article contents
        self.contents = soup('div', {'id':'articleBodyContents'})[0].text
        self.contents = re.sub('[\n\r]', '', self.contents)
Developer: ByeongkiJeong, Project: naverNewsCrawler, Lines of code: 25, Source file: navernews.py
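The snippet above is a method fragment: its imports and enclosing class are not shown. As a minimal, hedged usage sketch (the NewsArticle wrapper class and the URL are hypothetical stand-ins; only Crawler.get_page and the parsing calls mirror the example), wiring it up might look like this:

import re
from bs4 import BeautifulSoup
from Crawler import Crawler

class NewsArticle:
    """Hypothetical wrapper class; only get_page and the parsing calls come from Example 1."""
    def get_article(self, url):
        crawler = Crawler()
        web_data = crawler.get_page(url)                      # fetch the raw HTML
        soup = BeautifulSoup(web_data, 'html.parser')
        self.title = soup('h3', {'id': 'articleTitle'})[0].text
        body = soup('div', {'id': 'articleBodyContents'})[0].text
        self.contents = re.sub('[\n\r]', '', body)            # strip newlines

article = NewsArticle()
article.get_article('https://news.naver.com/...')            # placeholder URL
print(article.title)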

Example 2: __init__

    def __init__(self):
        myCrawler = Crawler(self.LINKS)
        crawledURLs = myCrawler.getVisited()
        linkStructure = myCrawler.getLinkStructure()
        print("Link structure:\n")
        myCrawler.printLinkStructure()

        myPageRank = PageRank(linkStructure)
        pageRanks = myPageRank.getPageRank()
        print("\n\nPageRanks:\n")
        myPageRank.printPageRank()

        myIndex = Index(self.STOPWORDS, crawledURLs)
        index = myIndex.getIndex()
        print("\n\nIndex:\n")
        myIndex.printIndex()

        myScorer = Scorer(pageRanks, index, linkStructure)
        #myScorer.usePageRank(True)
        print("\n\nDocument lengths:\n")
        myScorer.printDocumentLengths()
        print("\n\nSearch results:\n")
        myScorer.calculateScores(["tokens"])
        myScorer.calculateScores(["index"])
        myScorer.calculateScores(["classification"])
        myScorer.calculateScores(["tokens", "classification"])
Developer: Sophiameta, Project: SimpleSearchEngine, Lines of code: 26, Source file: SearchEngine.py

Example 3: run

 def run(self):
     robot_url = "http://allrecipes.com/"
     root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
     depth_limit = 5
     confine_reg = [r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page=[0-9]*$', r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
     c = Crawler(root, depth_limit,confine_reg,robot_url)  
     c.crawl()     
Developer: ggppwx, Project: webCrawler, Lines of code: 7, Source file: Thread.py
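The two confine_reg patterns above restrict the crawl to paginated listing pages and individual recipe detail pages. As a quick, self-contained check of what those patterns accept (the sample URLs are illustrative, and whether the crawler matches with re.match or re.search internally is not shown in the snippet), one can run:

import re

confine_reg = [r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page=[0-9]*$',
               r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']

def in_scope(url):
    """True if the URL matches one of the confinement patterns."""
    return any(re.match(pattern, url) for pattern in confine_reg)

print(in_scope('http://allrecipes.com/Recipes/ViewAll.aspx?Page=3'))       # True
print(in_scope('http://allrecipes.com/Recipe/easy-pancakes/Detail.aspx'))  # True
print(in_scope('http://allrecipes.com/cooks/12345/'))                      # False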

Example 4: main

def main():
	t1 = task("http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/")
	t2 = task("http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy")
	t3 = task("http://bbs.byr.cn/")
	event = Event()
	tasks = TaskQueue(event)
	pages = TaskQueue(None)
	tasks.add(t1)
	tasks.add(t2)
	tasks.add(t3)

	taskLock = BoundedSemaphore(tasks.numOfNewTasks)
	pageLock = BoundedSemaphore(1)
	f = open("test.txt",'w')
	Connector0 = Connector(tasks,taskLock,pages,pageLock,event,'',f, 3000)
	Connector1 = Connector(tasks,taskLock,pages,pageLock,event,'',f, 3001)
	Connector0.start()
	Connector1.start()

	Crawler0 = Crawler('',3000)
	Crawler1 = Crawler('',3001)

	Crawler0.start()
	Crawler1.start()

	Connector1.join()
	Connector0.join()
	Crawler0.join()
	Crawler1.join()
	f.close()
Developer: royzxq, Project: Programs-for-Fun, Lines of code: 30, Source file: test.py
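Example 4 coordinates two Connector threads and two Crawler threads through shared TaskQueue objects, a BoundedSemaphore, and an Event, all of which are classes defined in that project. As a hedged, self-contained illustration of the same producer/consumer idea using only the standard library (the worker function and URLs below are hypothetical and not the project's API):

import queue
import threading

tasks = queue.Queue()      # stands in for the project's TaskQueue of URLs
pages = queue.Queue()      # collects the fetched results

def connector():
    """Take URLs off the task queue until a None sentinel arrives."""
    while True:
        url = tasks.get()
        if url is None:
            tasks.task_done()
            break
        pages.put('page for ' + url)   # placeholder for a real fetch
        tasks.task_done()

for url in ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']:
    tasks.put(url)

workers = [threading.Thread(target=connector) for _ in range(2)]
for w in workers:
    w.start()
for _ in workers:
    tasks.put(None)        # one shutdown sentinel per worker
for w in workers:
    w.join()

print('%d pages collected' % pages.qsize())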

Example 5: start

def start():
	'''Start one crawl pass: pop one node from the pending set and crawl it.'''
	# initialization
	mongo_peoples , redis_client = Init()

	# wait while the set of pending nodes is still empty
	while redis_client.scard(info_success_set) == 0: # empty
		# sleep for wait_time seconds
		time.sleep(wait_time)

	# pop one node at random from the set of pending nodes
	node = redis_client.spop(info_success_set)
	urlToken = node

	# crawl the profile page of the user this node represents
	# printx('Preparing proxy...')
	printx('Crawling profile of user %s ...'%urlToken)
	try_cnt = try_limit
	while try_cnt > 0:
		try:
			c = Crawler(isCookie=False,timeout=socket_timeout)
			# set the proxy IP manually
			ip = proxyip.get()
			c.set_proxyip(ip)

			people = get_Info(c,urlToken)
			if people==None:
				raise Exception,'fetched user info is empty'
		except Exception,e:
			try_cnt -= 1
			print e
			printx('Failed to fetch profile of user %s, %d attempts left'%(urlToken,try_cnt))
		else:
			break
Developer: flygodfish, Project: zhihu-crawler-people, Lines of code: 34, Source file: list_crawler.py
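The retry loop above (Python 2 code) creates a fresh Crawler and proxy IP on every attempt and gives up after try_limit failures. A hedged, self-contained sketch of the same retry pattern in Python 3 follows; the fetch callable and limits are placeholders, and the project's get_Info, proxyip, and printx helpers are not reproduced here:

import time

def fetch_with_retries(fetch, try_limit=3, wait_seconds=1.0):
    """Call fetch() until it returns a non-None result or the attempts run out."""
    for attempt in range(1, try_limit + 1):
        try:
            result = fetch()
            if result is None:
                raise ValueError('fetched user info is empty')
            return result
        except Exception as e:
            print('attempt %d/%d failed: %s' % (attempt, try_limit, e))
            time.sleep(wait_seconds)
    return None

# Usage sketch (names from Example 5, shown only for orientation):
# people = fetch_with_retries(lambda: get_Info(Crawler(isCookie=False), urlToken))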

Example 6: __init__

    def __init__(self):
        # call the parent constructor
        Crawler.__init__(self)

        self.crawl_cookie = {}
        self.status_code = ''
        self.history = ''
Developer: xzhoutxd, Project: tb, Lines of code: 7, Source file: TBCrawler.py

Example 7: main

def main():
    try:
        spider = Crawler()
        spider.go()
    
    except KeyboardInterrupt:
        print("Stopped!")
Developer: maddogfyg, Project: iCrawler, Lines of code: 7, Source file: iCrawler.py

Example 8: test_crawl_multithread_mmcoreAsync

 def test_crawl_multithread_mmcoreAsync(self):
     parameter_data = FileOperations.get_from_JSON_file("./test/search_async.json")
     crawlers = parameter_data["crawlers"]
     crawlerName = "dotAsync"
     c = Crawler(crawlerName, parameters=crawlers[crawlerName])
     data = c.crawl_native(threads=None)
     self.assertTrue(len(data) > 0)
     c.save_crawler_data(data, crawlers[crawlerName]["output"])
Developer: glebedel, Project: FileCrawler, Lines of code: 8, Source file: test_crawler.py

Example 9: test_crawl_clientIntegrations

 def test_crawl_clientIntegrations(self):
     parameter_data = FileOperations.get_from_JSON_file("./test/search_integration.json")
     crawlers = parameter_data["crawlers"]
     crawlerName = "Integration"
     c = Crawler(crawlerName, parameters=crawlers[crawlerName])
     data = c.crawl_native()
     self.assertTrue(len(data) > 0)
     c.save_crawler_data(data, crawlers[crawlerName]["output"])
Developer: glebedel, Project: FileCrawler, Lines of code: 8, Source file: test_crawler.py

Example 10: test_crawl_native_fakeCrawler

 def test_crawl_native_fakeCrawler(self):
     parameter_file = "./test/search_parameters.json"
     c = Crawler("SimpleTest", parameters=FileOperations.get_from_JSON_file(parameter_file))
     self.assertEqual(c.name, "SimpleTest")
     c.crawl_native()
     self.assertTrue(os.path.isfile(parameter_file))
     result_from_file = FileOperations.get_from_JSON_file(c.output["path"])
     self.assertEqual(len(result_from_file), 3)
Developer: glebedel, Project: FileCrawler, Lines of code: 8, Source file: test_crawler.py

Example 11: __init__

    def __init__(self, forced=False):
        Crawler.__init__(self)
        self.results = set()
        self.forced = forced
        self.success_count = None
        self.failure_count = None

        self.blacklist = []
        self.name_exceptions = ["http://www.cplusplus.com/reference/string/swap/"]
Developer: Mondego, Project: pyreco, Lines of code: 9, Source file: allPythonContent.py

Example 12: test__process_html

    def test__process_html(self):
        soup = BeautifulSoup(self.html_test_string)
        c = Crawler("http://test.com")
        c._process_html_asset = mock.Mock()
        c._process_html_link = mock.Mock()

        c._process_html(soup)
        self.assertEqual(c._process_html_asset.call_count, 3)
        self.assertEqual(c._process_html_link.call_count, 4)
Developer: vhamid, Project: crawler-example, Lines of code: 9, Source file: test_crawler.py

Example 13: test__process_html_link

    def test__process_html_link(self):
        c = Crawler("http://test.com")
        soup = BeautifulSoup(self.html_test_string)

        for link in soup.find_all("a"):
            c._process_html_link(link, "/")

        self.assertEqual(len(c.sitemap.nodes()), 3)
        self.assertEqual(len(c.sitemap.edges()), 2)
        self.assertEqual(len(c.process_q), 3)
Developer: vhamid, Project: crawler-example, Lines of code: 10, Source file: test_crawler.py

Example 14: test_render_sitemap

    def test_render_sitemap(self):
        try:
            os.remove("sitemap.pdf")
        except OSError:
            pass

        self.assertEqual(os.path.exists("sitemap.pdf"), False)
        c = Crawler("http://a.com")
        c.render_sitemap()
        self.assertEqual(os.path.exists("sitemap.pdf"), True)
Developer: vhamid, Project: crawler-example, Lines of code: 10, Source file: test_crawler.py

Example 15: test__process_html_good_asset

    def test__process_html_good_asset(self):
        c = Crawler("http://test.com")
        soup = BeautifulSoup(self.html_test_string)

        c._does_static_file_exist = mock.Mock(return_value=True)
        for asset in soup.find_all(True, src=True):
            c._process_html_asset(asset, "/")

        self.assertEqual(c._does_static_file_exist.call_count, 2)
        self.assertEqual(len(c.sitemap.nodes()), 3)
        self.assertEqual(len(c.sitemap.edges()), 2)
Developer: vhamid, Project: crawler-example, Lines of code: 11, Source file: test_crawler.py
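Examples 12 through 15 all isolate one Crawler method at a time by replacing its collaborators with unittest.mock objects and asserting on call counts afterwards. As a hedged, fully self-contained illustration of that testing pattern (PageProcessor and its methods are hypothetical stand-ins, not part of the crawler-example project):

import unittest
from unittest import mock

class PageProcessor:
    """Hypothetical class under test; the real methods would hit the network."""
    def process(self, assets):
        for asset in assets:
            if self._asset_exists(asset):
                self._record(asset)

    def _asset_exists(self, asset):
        raise NotImplementedError

    def _record(self, asset):
        raise NotImplementedError

class ProcessorTest(unittest.TestCase):
    def test_process_counts_calls(self):
        p = PageProcessor()
        p._asset_exists = mock.Mock(return_value=True)   # stub out the slow collaborator
        p._record = mock.Mock()
        p.process(['a.js', 'b.css'])
        self.assertEqual(p._asset_exists.call_count, 2)
        self.assertEqual(p._record.call_count, 2)

if __name__ == '__main__':
    unittest.main()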


Note: The Crawler.Crawler class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors; please consult the corresponding project's license before distributing or using it. Do not reproduce this article without permission.