

Python Dictionary.all Method Code Examples

This article collects representative usage examples of the Python method dictionary.Dictionary.all from open-source projects. If you are unsure what Dictionary.all does or how to call it, the selected examples below should help. You can also browse the other usage examples for the dictionary.Dictionary class.


The following shows 1 code example of the Dictionary.all method.
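For orientation before the example: the Dictionary class is project-local to pycrawler, and its source is not shown on this page. Judging only from how the Crawler below uses it, a minimal sketch of its interface might look like the following. This is an illustrative reconstruction under those assumptions, not the project's actual implementation:

# Hypothetical sketch of the Dictionary interface assumed by the example below;
# reconstructed from its usage in Crawler, not taken from pycrawler itself.
class Dictionary:
    def __init__(self, max_size):
        self.__max_size = max_size   # capacity limit (max_links_allowed)
        self.__entries = {}          # normalized URL -> redirected URL

    def insert(self, key, value):
        # Assumed to return False once the capacity limit is hit, which is
        # how Crawler.__save_link below decides to stop the crawl.
        if len(self.__entries) >= self.__max_size:
            return False
        self.__entries[key] = value
        return True

    def size(self):
        return len(self.__entries)

    def all(self):
        # The method this page documents: return every stored entry.
        return self.__entries

d = Dictionary(2)
d.insert("http://example.com/", "http://example.com/")
print(d.all())  # {'http://example.com/': 'http://example.com/'}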

Example 1: __init__

# Required import: from dictionary import Dictionary
# Method under test: dictionary.Dictionary.all
import sys
import time
import zlib
from datetime import datetime

from dictionary import Dictionary
# Parser, Queue, Link, and lib (providing PyURLOpener) are project-local
# modules in pycrawler; they are assumed to be importable alongside this class.
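# Breadth-first web crawler: parses pages for links, deduplicates them via a
# Dictionary of unique URLs, and saves each crawled page and URL to disk.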
class Crawler:
    fileno = 0
    total_data_downloaded = 0
    total_status_200 = 0
    total_status_401 = 0
    total_status_404 = 0
    total_status_500 = 0

    def __init__(self, max_links_allowed, compress_status):
        self.__html_parser = Parser()
        self.__bfs_tree = Queue()
        self.__unique_links = Dictionary(max_links_allowed)
        self.__compress = compress_status
        self.__pyurlopener = lib.PyURLOpener()
        self.__start_time = datetime.now()

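    # Resolve relative URLs, then drop duplicate, invalid, and uncrawlable
    # links before persisting the survivors.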
    def format_validate_and_save_links(self, links_to_crawl, base_link, depth):
        links_to_crawl = self.__remove_duplicates(links_to_crawl)

        for link in links_to_crawl:
            link_to_process = Link(link, depth)
            link_to_process.fix_relative_link(base_link)
            link_to_process.set_link_attrs()
            link_to_process.remove_duplicate_link(self.__unique_links)
            link_to_process.remove_invalid_link()
            link_to_process.remove_uncrawlable_link()
            self.__save_link(link_to_process)

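    # Dequeue the next URL in BFS order and extract the links on that page.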
    def crawl(self):
        next_link = self.__next_url()
        links_to_crawl = self.__html_parser.get_links(next_link.redirected)
        return {'links_to_crawl': links_to_crawl, 'next_link': next_link.redirected, 'depth': (next_link.depth + 1)}

    def clear(self):
        self.__html_parser.clear()

    def display(self):
        print(self.__unique_links.all())

    def get_num_links_saved(self):
        return self.__unique_links.size()

    # PRIVATE

    def __remove_duplicates(self, links_to_crawl):
        return list(set(links_to_crawl))

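    # Record a link in the unique-URL Dictionary and the BFS queue, log it,
    # and end the crawl once the Dictionary refuses new entries.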
    def __save_link(self, link):
        if link.redirected is not None:
            insert_status = self.__unique_links.insert(link.normalized, link.redirected)
            self.__bfs_tree.enqueue(link)
    
            crawled_urls = open("data/crawled_urls.txt", "a+")
            
            if insert_status:
                file_size = self.__save_page(link)
                self.__save_url(link, file_size, crawled_urls)
            else:
                crawled_urls.write(self.__stats())
                crawled_urls.close()
                sys.exit()
            
            crawled_urls.close()

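    # Write the page HTML (optionally zlib-compressed) to a numbered file and
    # return its size in bytes.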
    def __save_page(self, link):
        new_file = open("data/pages/" + str(self.fileno) + ".html", "w")
        page_html = link.opened.read()

        if page_html is None:
            page_html = "Link [" + link.redirected + "]: HTML not retrieved"
        else:
            page_html = link.redirected + "\n\n" + page_html
        
        if self.__compress:
            new_file.write(zlib.compress(page_html))
        else:
            new_file.write(page_html)

        new_file_size = new_file.tell()
        self.__update_stats(new_file_size, link.code)

        link.opened.close()
        new_file.close()

        return new_file_size

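    # Append a timestamped entry for a crawled URL to the log file.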
    def __save_url(self, link, file_size, crawled_urls):
        timestamp = time.asctime(time.localtime(time.time()))
        crawled_urls.write("[" + timestamp + "][" + self.__to_mb(file_size) + " MB][" +
                           str(link.code) + "][" + str(link.depth) + "] " + link.redirected + "\n")

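    # Next URL in breadth-first order.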
    def __next_url(self):
        return self.__bfs_tree.dequeue()

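    # Summary written to the crawl log when the crawl ends.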
    def __stats(self):
        time_taken = datetime.now() - self.__start_time
        
        stats = "\nCrawl Stats:\n"
        stats += "-------------------------------------\n"
        stats += "- Pages crawled:   " + str(self.__unique_links.size()) + " pages\n" 
        stats += "- Data downloaded: " + self.__to_mb(self.total_data_downloaded) + " MB\n"
        stats += "- Time Taken:      " + str(time_taken.seconds) + " seconds\n"
#.........remainder of the code omitted.........
Developer: nitindhar7 | Project: pycrawler | Lines: 103 | Source: crawler.py

