

Python Page.scrap Method Code Examples

This article collects typical usage examples of the Python method page.Page.scrap, gathered from open-source code. If you have been wondering what Page.scrap does, how to call it, or what real code that uses it looks like, the selected examples below should help. You can also explore further usage examples from the containing class, page.Page.


The following shows 2 code examples of the Page.scrap method, both taken from the same open-source project and listed with their source attribution.
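
The examples call four methods on Page (set_data, set_url, set_out_file, scrap), but the class itself is not reproduced here. If you want to run the excerpts in isolation, a minimal stand-in like the following is enough; the method bodies are assumptions for illustration, since the real implementation in amazon-search-scrapper extracts product data from the downloaded HTML.

# Minimal sketch of the Page interface the examples rely on.
# Only the methods actually called below are stubbed; the real class
# parses Amazon listing HTML and writes results under the out_file prefix.
class Page(object):
    def __init__(self):
        self.data = None      # raw HTML of the fetched page
        self.url = None       # URL the HTML came from
        self.out_file = None  # filename prefix for scraped output

    def set_data(self, data):
        self.data = data

    def set_url(self, url):
        self.url = url

    def set_out_file(self, out_file):
        self.out_file = out_file

    def scrap(self):
        # Placeholder: the real method parses self.data for product entries.
        return []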

Example 1: visit_page

# Required import: from page import Page
# Alternatively: from page.Page import scrap
# This excerpt also needs: import urllib2  (Python 2 only)
    def visit_page(self, product_list_url, page_num):
        print "Page-{}, {}".format(page_num, product_list_url)

        # Fetch the product listing page.
        response = urllib2.urlopen(product_list_url)

        # Compare status codes with ==; "is" only works by accident for
        # small integers cached by CPython.
        if response.code == self.HTTP_STATUS_OK:
            # Hand the HTML to a Page object and scrape it to an output file.
            page = Page()
            page.set_data(response.read())
            page.set_url(product_list_url)
            page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
            page.scrap()

            print "{} - Completed".format(product_list_url)
Developer: samundra, Project: amazon-search-scrapper, Lines: 15, Source: scraper.py
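
The project targets Python 2 (print statements, urllib2). If you need the same fetch-and-scrap step under Python 3, a rough equivalent, untested against the original project, would swap in urllib.request:

import urllib.request

def visit_page(self, product_list_url, page_num):
    print("Page-{}, {}".format(page_num, product_list_url))

    response = urllib.request.urlopen(product_list_url)

    # HTTPResponse exposes the status code as .status in Python 3.
    if response.status == self.HTTP_STATUS_OK:
        page = Page()
        page.set_data(response.read())
        page.set_url(product_list_url)
        page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
        page.scrap()

        print("{} - Completed".format(product_list_url))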

Example 2: __init__

# Required import: from page import Page
# Alternatively: from page.Page import scrap
# The full class below additionally needs (module paths assumed from the
# project layout): import re; import urllib2; from product import Product
class Scraper:
    ROOT_DOMAIN = 'http://www.amazon.com'
    HTTP_STATUS_OK = 200
    THREAD_NUM = 5

    def __init__(self, url):
        self.url = None
        self.set_url(url)
        self.page = Page()
        self.product = Product()
        self.page_links = []
        self.output_filename = "out_"
        self.threads = []

    def set_url(self, url):
        self.url = url
        return self

    def set_output_file(self, keyword):
        self.output_filename = keyword.replace(" ", "_").lower()

    def get_page_links(self):
        return self.page_links

    def get_url(self):
        return self.url

    def build_page_links(self, max_page_num, pagination_format):

        if pagination_format is not None:
            query_string = pagination_format.split("&")
            # Assumes the pagination fragment always carries a page=N parameter.
            match = re.search(r"page=(\d+)", pagination_format)

            page_num = int(match.group(1))

            page_dict = {
                "page": str(page_num),
                "url": self.ROOT_DOMAIN + pagination_format
            }

            self.page_links.append(page_dict)

            while page_num < int(max_page_num):
                page_num += 1

                query_string[1] = 'page=' + str(page_num)
                pack_query_string = '&'.join(query_string)

                page_dict = {
                    "page": str(page_num),
                    "url": self.ROOT_DOMAIN + pack_query_string
                }

                self.page_links.append(page_dict)

    def visit_page(self, product_list_url, page_num):
        print "Page-{}, {}".format(page_num, product_list_url)

        response = urllib2.urlopen(product_list_url)

        if response.code == self.HTTP_STATUS_OK:
            page = Page()
            page.set_data(response.read())
            page.set_url(product_list_url)
            page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
            page.scrap()

            print "{} - Completed".format(product_list_url)
            # list_page.set_response(response)

            # products = self.page.scrap()

    def scrap_all_products(self):

        # total_links = len(self.page_links)
        # print "Total Links : " + str(total_links)
        #
        # chunks = total_links / self.THREAD_NUM
        # remaining = total_links % self.THREAD_NUM
        #
        # print "Loop Required : " + str(chunks)
        # print "Starting Thread Count :" + str(self.THREAD_NUM)
        # print "Remaining Count : " + str(remaining)
        #
        # for i in range(0, chunks):
        #     for idx in range(self.THREAD_NUM):
        #         # print idx
        #         link = self.page_links[idx]
        #         page_num = link["page"]
        #         page_url = link["url"]
        #         #
        #         print "Starting Thread for url: " + page_url
        #         # t = threading.Thread(target=self.visit_page, args=(page_url, page_num,))
        #         # self.threads.append(t)
        #         # t.start()

        #
        # start_index = self.THREAD_NUM * chunks
        # #
        pass  # keeps the method syntactically valid; the real body is elided here

#......... part of the code omitted here .........
Developer: samundra, Project: amazon-search-scrapper, Lines: 103, Source: scraper.py
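
Neither example shows the driver code. Judging from build_page_links, the pagination_format argument is a relative URL fragment whose second &-separated component is page=N. Under that assumption, and assuming Product is importable from the project's product module, wiring the class together might look like this hypothetical snippet; the search URL and pagination fragment are made up for illustration and do not come from the original project.

# Hypothetical driver, built only on the methods shown above.
scraper = Scraper("http://www.amazon.com/s?field-keywords=laptop+stand")
scraper.set_output_file("laptop stand")

# Build links for pages 1..3 from an assumed pagination fragment.
scraper.build_page_links(3, "/s?field-keywords=laptop+stand&page=1")

# Visit each listing page sequentially and scrape it.
for link in scraper.get_page_links():
    scraper.visit_page(link["url"], link["page"])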


Note: the page.Page.scrap examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets were selected from open-source projects and copyright remains with their original authors; consult each project's license before using or redistributing the code. Do not repost without permission.