本文整理汇总了Python中webpage.WebPage.parseLinks方法的典型用法代码示例。如果您正苦于以下问题:Python WebPage.parseLinks方法的具体用法?Python WebPage.parseLinks怎么用?Python WebPage.parseLinks使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类webpage.WebPage
的用法示例。
在下文中一共展示了WebPage.parseLinks方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Crawler
# 需要导入模块: from webpage import WebPage [as 别名]
# 或者: from webpage.WebPage import parseLinks [as 别名]
class Crawler():
    """A simple polite web crawler.

    Pops URLs from a DB-backed todo queue, downloads each page, stores the
    HTML, extracts links matching user-supplied regex rules, and pushes the
    new (deduplicated) links back onto the queue.
    """

    def __init__(self):
        self.downloader = DownloadManager()  # object that fetches pages
        self.webpage = None                  # parser for the current page
        self.initDatabase()
        self.rules = {}                      # compiled url-pattern -> [link-patterns]

    def initDatabase(self):
        """Initialize the database-backed collaborators."""
        self.queue = QueueDB()          # todo table: URLs waiting to be crawled
        self.webpagedb = WebpageDB()    # stores downloaded HTML
        self.duplcheck = DuplCheckDB()  # remembers URLs already seen

    def addSeeds(self, links):
        """Add seed URLs to the crawl queue.

        :param links: list of URL strings; URLs already seen are dropped.
        """
        new_links = self.duplcheck.filterDuplUrls(links)  # drop already-seen URLs
        self.duplcheck.addUrls(new_links)                 # mark the rest as seen
        self.queue.pushUrls(new_links)                    # enqueue them in the todo table

    def addRules(self, rules):
        """Compile and install crawl rules.

        :param rules: dict mapping a page-URL regex string to a list of
            regex strings; links on a matching page are kept only if they
            match one of the listed patterns.
        """
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = [re.compile(u) for u in inurls]
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        """Return the deduplicated link patterns of every rule whose page
        pattern matches *url* (empty list if none match)."""
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        """Run the crawl loop until the todo queue is exhausted."""
        while True:
            url = self.queue.popUrl()
            print(url)
            if url is None:  # queue empty -> crawl finished
                print("crawling task is done.")
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            if html is not None:
                self.webpagedb.storeHtmlToDb(url, html)  # persist the raw page
                self.webpage = WebPage(url, html)        # parse the page
                self.webpage.parseLinks()                # extract all hyperlinks
                ruptn = self.get_patterns_from_rules(url)
                print(ruptn)
                # filter_links may return None when nothing matches
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                if links:
                    self.addSeeds(links)
            # politeness pause between fetches (source indentation was
            # flattened; loop-level placement assumed — TODO confirm)
            self.mysleep(3)

    def mysleep(self, n):
        """Sleep *n* seconds, printing progress once per second."""
        for i in range(1, n + 1):
            time.sleep(1)
            print("sleep %d of %d" % (i, n))