This article collects typical code examples for the Python method webpage.WebPage.filter_links. If you are unsure what WebPage.filter_links does, how to call it, or what real uses of it look like, the curated examples below should help. You can also read more about the class the method belongs to, webpage.WebPage.
The following presents 5 code examples of the WebPage.filter_links method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
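Before the full examples, here is a minimal sketch of the usual call pattern, pieced together from the examples below: a WebPage is built from a URL and its HTML, parse_links() gathers every link on the page, and filter_links() then narrows them down by tag and by regular-expression patterns. The URL, file name, and pattern here are placeholders, and the keyword used for the patterns (patterns vs. str_patterns) differs between the examples, so check the version of WebPage you have installed.
import re
from webpage import WebPage

# Minimal sketch of the call pattern; the HTML would normally come from a
# downloader (see DownloadManager in the examples below), here a local file stands in.
html = open("page.html").read()
page = WebPage("http://www.cnbeta.com/", html)
page.parse_links()                                   # collect every link on the page
patn = re.compile(r'^https?://.*\.cnbeta\.com/.+$')  # keep only same-site links
links = page.filter_links(tags=['a'], patterns=[patn])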
Example 1: Crawler
# Required import: from webpage import WebPage [as alias]
# Alternatively: from webpage.WebPage import filter_links [as alias]
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example 2: Crawler
# Required import: from webpage import WebPage [as alias]
# Alternatively: from webpage.WebPage import filter_links [as alias]
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url is None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception, err:
                print "!!error!! Exception happened! %s %s" % (url, err)
        self.dbop.close()
Example 3: Crawler
# Required import: from webpage import WebPage [as alias]
# Alternatively: from webpage.WebPage import filter_links [as alias]
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def download_files(self, files):
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            # print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO:
                self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example 4: Crawler
# Required import: from webpage import WebPage [as alias]
# Alternatively: from webpage.WebPage import filter_links [as alias]
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()  # object that downloads web pages
        self.webpage = None  # object that parses a page
        self.initDatabase()
        self.rules = {}

    # Initialize the databases
    def initDatabase(self):
        self.queue = QueueDB()  # TODO table
        self.webpagedb = WebpageDB()
        self.duplcheck = DuplCheckDB()

    # Add seed URLs
    # Argument: links, a list of URLs
    def addSeeds(self, links):
        new_links = self.duplcheck.filterDuplUrls(links)  # drop URLs already seen
        self.duplcheck.addUrls(new_links)  # record the URLs as visited
        self.queue.pushUrls(new_links)  # push the new URLs onto the TODO table

    def addRules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    # Run the crawl
    def start(self):
        while 1:
            url = self.queue.popUrl()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.storeHtmlToDb(url, html)  # persist the raw page
                self.webpage = WebPage(url, html)  # start parsing the page
                self.webpage.parseLinks()  # collect all hyperlinks
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)  # may return None
                if links:
                    self.addSeeds(links)
            self.mysleep(3)  # pause before crawling the next page

    def mysleep(self, n):
        for i in range(1, n + 1):
            time.sleep(1)
            print "sleep", i, "of", n
Example 5: DownloadManager
# Required import: from webpage import WebPage [as alias]
# Alternatively: from webpage.WebPage import filter_links [as alias]
url = "http://www.cnbeta.com/"
downloader = DownloadManager()
error_msg, url, redirected_url, html = downloader.download(url)
print "error_msg=%s" % error_msg
print "url=%s" % url
print "redirected_url=%s" % redirected_url

f = open("www.cnbeta.com.html", 'w')
f.write(html)
f.close()

webpage = WebPage(url, html)
webpage.parse_links()

website = 'cnbeta\.com'
patnstr = '^(http|https)://(.*\.' + website + ')(.+)$'
links = webpage.filter_links(tags=['a'], str_patterns=[patnstr])
links.sort()

f_filter_links = open('filter_links_cnbeta.txt', 'w')
#print links
f = open('links_regged_cnbeta.txt', 'w')
for link in links:
    f_filter_links.write('%s\n' % link)
    f.write('%s\n' % link)
    for elem, attr, lnk, pos in webpage.doc.iterlinks():
        absolute = urlparse.urljoin(webpage.url, lnk.strip())
        if absolute == link and elem.text:
            f.write('%s\n' % elem.text.encode('utf-8'))
f.close()
f_filter_links.close()
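Note that Examples 1, 3, and 4 pass pre-compiled regular expressions through a patterns keyword, while Examples 2 and 5 pass raw pattern strings through str_patterns; which keyword your copy of WebPage accepts depends on its version. Assuming the patterns variant, the filtering call in the script above could equivalently be written as:
import re

# Equivalent filtering call with a pre-compiled pattern (assumes the WebPage
# variant that takes `patterns`, as in Examples 1 and 3; continues the script above).
patn = re.compile(r'^(http|https)://(.*\.cnbeta\.com)(.+)$')
links = webpage.filter_links(tags=['a'], patterns=[patn])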