This page collects typical usage examples of the Dictionary.all method from the Python module dictionary. Wondering what Dictionary.all does, how to call it, or what it looks like in practice? The curated example below may help. You can also look further into the containing class, dictionary.Dictionary, for more context.
One code example of Dictionary.all is shown below. Examples are ranked by popularity by default; upvoting the ones you like or find useful helps the system recommend better Python samples.
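Since the Dictionary class itself is not shown on this page, here is a minimal, hypothetical sketch of the interface the example relies on; the bounded constructor and the insert/size/all methods are assumptions inferred from how the Crawler below calls them, not the project's actual implementation.
class Dictionary:
    # Hypothetical stand-in for the project's dictionary module (assumed API).
    def __init__(self, max_size):
        self.__max_size = max_size
        self.__entries = {}

    def insert(self, key, value):
        # Reject inserts once the cap is reached; the crawler below treats a
        # False return as "link budget exhausted" (assumed semantics).
        if len(self.__entries) >= self.__max_size:
            return False
        self.__entries[key] = value
        return True

    def size(self):
        return len(self.__entries)

    def all(self):
        # The method this page documents: return every stored entry.
        return self.__entries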
Example 1: __init__
# Required import: from dictionary import Dictionary [as alias]
# Or: from dictionary.Dictionary import all [as alias]
import sys
import time
import zlib
from datetime import datetime
# Parser, Queue, Link, and lib (PyURLOpener) are project-local modules implied
# by the example; their import paths are not shown on this page.

class Crawler:
    fileno = 0
    total_data_downloaded = 0
    total_status_200 = 0
    total_status_401 = 0
    total_status_404 = 0
    total_status_500 = 0

    def __init__(self, max_links_allowed, compress_status):
        self.__html_parser = Parser()
        self.__bfs_tree = Queue()
        self.__unique_links = Dictionary(max_links_allowed)
        self.__compress = compress_status
        self.__pyurlopener = lib.PyURLOpener()
        self.__start_time = datetime.now()

    def format_validate_and_save_links(self, links_to_crawl, base_link, depth):
        links_to_crawl = self.__remove_duplicates(links_to_crawl)
        for link in links_to_crawl:
            link_to_process = Link(link, depth)
            link_to_process.fix_relative_link(base_link)
            link_to_process.set_link_attrs()
            link_to_process.remove_duplicate_link(self.__unique_links)
            link_to_process.remove_invalid_link()
            link_to_process.remove_uncrawlable_link()
            self.__save_link(link_to_process)

    def crawl(self):
        next_link = self.__next_url()
        links_to_crawl = self.__html_parser.get_links(next_link.redirected)
        return {'links_to_crawl': links_to_crawl,
                'next_link': next_link.redirected,
                'depth': (next_link.depth + 1)}

    def clear(self):
        self.__html_parser.clear()

    def display(self):
        # Dictionary.all() returns every unique link collected so far.
        print(self.__unique_links.all())

    def get_num_links_saved(self):
        return self.__unique_links.size()

    # PRIVATE

    def __remove_duplicates(self, links_to_crawl):
        return list(set(links_to_crawl))

    def __save_link(self, link):
        if link.redirected is not None:
            insert_status = self.__unique_links.insert(link.normalized, link.redirected)
            self.__bfs_tree.enqueue(link)
            crawled_urls = open("data/crawled_urls.txt", "a+")
            if insert_status:
                file_size = self.__save_page(link)
                self.__save_url(link, file_size, crawled_urls)
            else:
                # Dictionary is full: log the final stats and stop the crawl.
                crawled_urls.write(self.__stats())
                crawled_urls.close()
                sys.exit()
            crawled_urls.close()

    def __save_page(self, link):
        # Binary mode so both compressed and plain output can be written.
        new_file = open("data/pages/" + str(self.fileno) + ".html", "wb")
        # page_html is assumed to be text (str); decode here if the opener
        # yields bytes.
        page_html = link.opened.read()
        if page_html is None:
            page_html = "Link [" + link.redirected + "]: HTML not retrieved"
        else:
            page_html = link.redirected + "\n\n" + page_html
        if self.__compress:
            new_file.write(zlib.compress(page_html.encode("utf-8")))
        else:
            new_file.write(page_html.encode("utf-8"))
        new_file_size = new_file.tell()
        self.__update_stats(new_file_size, link.code)
        link.opened.close()
        new_file.close()
        return new_file_size

    def __save_url(self, link, file_size, crawled_urls):
        crawled_urls.write("[" + time.asctime(time.localtime(time.time())) + "]["
                           + self.__to_mb(file_size) + " MB][" + str(link.code) + "]["
                           + str(link.depth) + "] " + link.redirected + "\n")

    def __next_url(self):
        return self.__bfs_tree.dequeue()

    def __stats(self):
        time_taken = datetime.now() - self.__start_time
        stats = "\nCrawl Stats:\n"
        stats += "-------------------------------------\n"
        stats += "- Pages crawled: " + str(self.__unique_links.size()) + " pages\n"
        stats += "- Data downloaded: " + self.__to_mb(self.total_data_downloaded) + " MB\n"
        stats += "- Time Taken: " + str(time_taken.seconds) + " seconds\n"
#......... part of the code omitted here .........
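A hedged usage sketch follows; the constructor arguments are illustrative, and it assumes the crawl frontier has already been seeded with a start URL (that step is part of the omitted portion of the class).
# Hypothetical driver for the Crawler above (assumed setup).
crawler = Crawler(max_links_allowed=100, compress_status=False)
result = crawler.crawl()
crawler.format_validate_and_save_links(result['links_to_crawl'],
                                       result['next_link'],
                                       result['depth'])
crawler.display()  # prints Dictionary.all(): every unique link saved so far
print(crawler.get_num_links_saved())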