This article collects typical usage examples of the Python Crawler.Crawler class. If you have been wondering what the Crawler class is for, how to use it, or what real code that uses it looks like, the curated class examples here may help.
The following sections show 15 code examples for the Crawler class, ordered by popularity by default.
Example 1: get_article
def get_article(self, url):
    crawler = Crawler()
    # get html data from url
    web_data = crawler.get_page(url)
    soup = BeautifulSoup(web_data, 'html.parser')
    # remove related-news link blocks
    [e.extract() for e in soup('div', {'class': 'link_news'})]
    # article title
    self.title = soup('h3', {'id': 'articleTitle'})[0].text
    # creation date and time of the article
    date_time = soup('span', {'class': 't11'})[0].text.split()
    self.date = date_time[0]
    self.time = date_time[1]
    # press name
    press_logo = soup('div', {'class': 'press_logo'})[0]
    self.press = press_logo.find('img')['alt']
    del press_logo
    # article contents
    self.contents = soup('div', {'id': 'articleBodyContents'})[0].text
    self.contents = re.sub('[\n\r]', '', self.contents)
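The selectors above pull the title, timestamp, press name, and body text out of a news article page and store them as instance attributes. A self-contained sketch of the same BeautifulSoup calls against a tiny inline HTML fragment (the fragment is made up for demonstration and only mimics the element ids and classes the method looks for):

import re
from bs4 import BeautifulSoup

html = """
<h3 id="articleTitle">Sample headline</h3>
<span class="t11">2016-05-01 10:30</span>
<div class="press_logo"><img alt="Sample Press" src="logo.png"></div>
<div id="articleBodyContents">First line.
Second line.</div>
"""

soup = BeautifulSoup(html, 'html.parser')
title = soup('h3', {'id': 'articleTitle'})[0].text
date, time = soup('span', {'class': 't11'})[0].text.split()
press = soup('div', {'class': 'press_logo'})[0].find('img')['alt']
contents = re.sub('[\n\r]', '', soup('div', {'id': 'articleBodyContents'})[0].text)
print(title, date, time, press)
print(contents)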
Example 2: __init__
def __init__(self):
    # crawl the configured seed links and collect the link structure
    myCrawler = Crawler(self.LINKS)
    crawledURLs = myCrawler.getVisited()
    linkStructure = myCrawler.getLinkStructure()
    print("Link-Struktur:\n")
    myCrawler.printLinkStructure()
    # compute PageRank over the crawled link structure
    myPageRank = PageRank(linkStructure)
    pageRanks = myPageRank.getPageRank()
    print("\n\nPageRanks:\n")
    myPageRank.printPageRank()
    # build the index over the crawled pages
    myIndex = Index(self.STOPWORDS, crawledURLs)
    index = myIndex.getIndex()
    print("\n\nIndex:\n")
    myIndex.printIndex()
    # score some sample queries
    myScorer = Scorer(pageRanks, index, linkStructure)
    #myScorer.usePageRank(True)
    print("\n\nDokumentenlängen:\n")
    myScorer.printDocumentLengths()
    print("\n\nSuchergebnisse:\n")
    myScorer.calculateScores(["tokens"])
    myScorer.calculateScores(["index"])
    myScorer.calculateScores(["classification"])
    myScorer.calculateScores(["tokens", "classification"])
Example 3: run
def run(self):
    robot_url = "http://allrecipes.com/"
    root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
    depth_limit = 5
    # confine the crawl to recipe listing pages and recipe detail pages
    confine_reg = ['http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
                   'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
    c = Crawler(root, depth_limit, confine_reg, robot_url)
    c.crawl()
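The two confine_reg patterns restrict the crawl to paginated listing URLs and recipe detail URLs. A quick standalone check of what they admit, assuming the crawler applies them with anchored re.match (how the real Crawler class applies the patterns is not shown in this excerpt):

import re

confine_reg = ['http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
               'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']

for url in ["http://allrecipes.com/Recipes/ViewAll.aspx?Page=2",
            "http://allrecipes.com/Recipe/banana-bread/Detail.aspx",
            "http://allrecipes.com/cooks/"]:
    allowed = any(re.match(p, url) for p in confine_reg)
    print(url, "->", "crawl" if allowed else "skip")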
Example 4: main
def main():
    t1 = task("http://www.laurentluce.com/posts/python-threads-synchronization-locks-rlocks-semaphores-conditions-events-and-queues/")
    t2 = task("http://stackoverflow.com/questions/15651128/in-this-semaphore-example-is-it-necessary-to-lock-for-refill-and-buy")
    t3 = task("http://bbs.byr.cn/")
    event = Event()
    # shared queues: tasks to fetch and fetched pages
    tasks = TaskQueue(event)
    pages = TaskQueue(None)
    tasks.add(t1)
    tasks.add(t2)
    tasks.add(t3)
    # semaphores guarding access to the two queues
    taskLock = BoundedSemaphore(tasks.numOfNewTasks)
    pageLock = BoundedSemaphore(1)
    f = open("test.txt", 'w')
    # two connector threads and two crawler threads, paired via ports 3000/3001
    Connector0 = Connector(tasks, taskLock, pages, pageLock, event, '', f, 3000)
    Connector1 = Connector(tasks, taskLock, pages, pageLock, event, '', f, 3001)
    Connector0.start()
    Connector1.start()
    Crawler0 = Crawler('', 3000)
    Crawler1 = Crawler('', 3001)
    Crawler0.start()
    Crawler1.start()
    Connector1.join()
    Connector0.join()
    Crawler0.join()
    Crawler1.join()
    f.close()
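Here Crawler is a thread that cooperates with a Connector over a local port, and access to the two queues is guarded by bounded semaphores. A minimal self-contained sketch of that coordination pattern using only the standard library (the Connector/Crawler classes, task objects, and port numbers are specific to the example project and are not reproduced here):

import threading
import queue

# stand-in work items; the real example enqueues task objects built from URLs
tasks = queue.Queue()
for url in ["http://example.com/a", "http://example.com/b", "http://example.com/c"]:
    tasks.put(url)

page_lock = threading.BoundedSemaphore(1)   # serializes writers, like pageLock above
pages = []

def worker():
    while True:
        try:
            url = tasks.get_nowait()
        except queue.Empty:
            return
        page = "<html>fetched %s</html>" % url   # placeholder for the real fetch
        with page_lock:                          # one writer at a time
            pages.append(page)

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("%d pages collected" % len(pages))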
Example 5: start
def start():
    '''Entry point of the crawl process: take one node at a time and crawl it.'''
    # initialization
    mongo_peoples, redis_client = Init()
    # wait while the set of nodes to crawl is still empty
    while redis_client.scard(info_success_set) == 0:
        # wait wait_time seconds
        time.sleep(wait_time)
    # pop one node at random from the set of nodes to crawl
    node = redis_client.spop(info_success_set)
    urlToken = node
    # crawl the profile page of the user this node represents
    # printx('准备代理……')
    printx('正在抓取用户 %s 的个人信息……'%urlToken)
    try_cnt = try_limit
    while try_cnt > 0:
        try:
            c = Crawler(isCookie=False, timeout=socket_timeout)
            # set the proxy IP manually
            ip = proxyip.get()
            c.set_proxyip(ip)
            people = get_Info(c, urlToken)
            if people == None:
                raise Exception, '抓取的用户信息为空'
        except Exception, e:
            try_cnt -= 1
            print e
            printx('用户 %s 个人信息抓取出错,还可以尝试抓取 %d 次'%(urlToken,try_cnt))
        else:
            break
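The inner loop is a bounded-retry pattern written in Python 2 syntax. A standalone Python 3 sketch of the same idea (the fetch callable and the default retry limit are placeholders, not part of the original project):

def fetch_with_retry(fetch, url_token, try_limit=3):
    """Call fetch(url_token) up to try_limit times, as in the loop above."""
    attempts = try_limit
    while attempts > 0:
        try:
            people = fetch(url_token)
            if people is None:
                raise ValueError("fetched user profile is empty: %s" % url_token)
        except Exception as e:
            attempts -= 1
            print("fetch failed (%s), %d attempts left" % (e, attempts))
        else:
            return people
    return None

print(fetch_with_retry(lambda tok: {"name": tok}, "alice"))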
Example 6: __init__
def __init__(self):
    # call the parent constructor
    Crawler.__init__(self)
    self.crawl_cookie = {}
    self.status_code = ''
    self.history = ''
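The explicit Crawler.__init__(self) call is the classic way to chain to the parent constructor. In Python 3 the same subclass would usually be written with super(); a sketch under the assumption that Crawler is importable as shown and is a new-style class (the subclass name is a placeholder):

from Crawler import Crawler

class MyCrawler(Crawler):
    def __init__(self):
        super().__init__()          # equivalent to Crawler.__init__(self)
        self.crawl_cookie = {}
        self.status_code = ''
        self.history = ''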
Example 7: main
def main():
    try:
        spider = Crawler()
        spider.go()
    except KeyboardInterrupt:
        print("Stopped!")
Example 8: test_crawl_multithread_mmcoreAsync
def test_crawl_multithread_mmcoreAsync(self):
    parameter_data = FileOperations.get_from_JSON_file("./test/search_async.json")
    crawlers = parameter_data["crawlers"]
    crawlerName = "dotAsync"
    c = Crawler(crawlerName, parameters=crawlers[crawlerName])
    data = c.crawl_native(threads=None)
    self.assertTrue(len(data) > 0)
    c.save_crawler_data(data, crawlers[crawlerName]["output"])
Example 9: test_crawl_clientIntegrations
def test_crawl_clientIntegrations(self):
    parameter_data = FileOperations.get_from_JSON_file("./test/search_integration.json")
    crawlers = parameter_data["crawlers"]
    crawlerName = "Integration"
    c = Crawler(crawlerName, parameters=crawlers[crawlerName])
    data = c.crawl_native()
    self.assertTrue(len(data) > 0)
    c.save_crawler_data(data, crawlers[crawlerName]["output"])
Example 10: test_crawl_native_fakeCrawler
def test_crawl_native_fakeCrawler(self):
    parameter_file = "./test/search_parameters.json"
    c = Crawler("SimpleTest", parameters=FileOperations.get_from_JSON_file(parameter_file))
    self.assertEqual(c.name, "SimpleTest")
    c.crawl_native()
    self.assertTrue(os.path.isfile(parameter_file))
    result_from_file = FileOperations.get_from_JSON_file(c.output["path"])
    self.assertEqual(len(result_from_file), 3)
Example 11: __init__
def __init__(self, forced=False):
    Crawler.__init__(self)
    self.results = set()
    self.forced = forced
    self.success_count = None
    self.failure_count = None
    self.blacklist = []
    self.name_exceptions = ["http://www.cplusplus.com/reference/string/swap/"]
Example 12: test__process_html
def test__process_html(self):
    soup = BeautifulSoup(self.html_test_string)
    c = Crawler("http://test.com")
    c._process_html_asset = mock.Mock()
    c._process_html_link = mock.Mock()
    c._process_html(soup)
    self.assertEqual(c._process_html_asset.call_count, 3)
    self.assertEqual(c._process_html_link.call_count, 4)
Example 13: test__process_html_link
def test__process_html_link(self):
    c = Crawler("http://test.com")
    soup = BeautifulSoup(self.html_test_string)
    for link in soup.find_all("a"):
        c._process_html_link(link, "/")
    self.assertEqual(len(c.sitemap.nodes()), 3)
    self.assertEqual(len(c.sitemap.edges()), 2)
    self.assertEqual(len(c.process_q), 3)
Example 14: test_render_sitemap
def test_render_sitemap(self):
    # make sure no stale sitemap.pdf is left over from a previous run
    try:
        os.remove("sitemap.pdf")
    except OSError:
        pass
    self.assertEqual(os.path.exists("sitemap.pdf"), False)
    c = Crawler("http://a.com")
    c.render_sitemap()
    self.assertEqual(os.path.exists("sitemap.pdf"), True)
Example 15: test__process_html_good_asset
def test__process_html_good_asset(self):
    c = Crawler("http://test.com")
    soup = BeautifulSoup(self.html_test_string)
    c._does_static_file_exist = mock.Mock(return_value=True)
    for asset in soup.find_all(True, src=True):
        c._process_html_asset(asset, "/")
    self.assertEqual(c._does_static_file_exist.call_count, 2)
    self.assertEqual(len(c.sitemap.nodes()), 3)
    self.assertEqual(len(c.sitemap.edges()), 2)