This article collects typical usage examples of the Python method page.Page.scrap. If you have been asking yourself what Page.scrap does, how to call it, or where to find working examples, the curated samples below may help. You can also read further into the enclosing class, page.Page.
The section below presents 2 code examples of the Page.scrap method, sorted by popularity by default.
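Before the full examples, here is a minimal sketch of the call pattern both examples share. It is inferred from the code below, not from the page module's documentation; the setter names come from the examples, while the URL and output prefix are illustrative placeholders:

from page import Page
import urllib2

response = urllib2.urlopen("http://www.example.com/listing")  # placeholder URL
if response.code == 200:
    page = Page()
    page.set_data(response.read())                  # raw HTML to parse
    page.set_url("http://www.example.com/listing")  # source URL, kept for reference
    page.set_out_file("out-page-1-")                # output file prefix (illustrative)
    page.scrap()                                    # parse the HTML and write results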
Example 1: visit_page
# Required import: from page import Page [as alias]
# Or: from page.Page import scrap [as alias]
# Note: these examples are Python 2 code (urllib2, print statements)
def visit_page(self, product_list_url, page_num):
    print "Page-{}, {}".format(page_num, product_list_url)
    response = urllib2.urlopen(product_list_url)
    # Compare status codes with ==; "is" tests object identity and only
    # appears to work for small integers by a CPython caching accident.
    if response.code == self.HTTP_STATUS_OK:
        page = Page()
        page.set_data(response.read())
        page.set_url(product_list_url)
        page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
        page.scrap()
        print "{} - Completed".format(product_list_url)
Example 2: __init__
# Required import: from page import Page [as alias]
# Or: from page.Page import scrap [as alias]
# Also used below (not shown in the snippet): import re, import urllib2
class Scraper:
    ROOT_DOMAIN = 'http://www.amazon.com'
    HTTP_STATUS_OK = 200
    THREAD_NUM = 5

    def __init__(self, url):
        self.url = None
        self.set_url(url)
        self.page = Page()
        self.product = Product()
        self.page_links = []
        self.output_filename = "out_"
        self.threads = []

    def set_url(self, url):
        self.url = url
        return self

    def set_output_file(self, keyword):
        self.output_filename = keyword.replace(" ", "_").lower()

    def get_page_links(self):
        return self.page_links

    def get_url(self):
        return self.url
    def build_page_links(self, max_page_num, pagination_format):
        if pagination_format is not None:
            query_string = pagination_format.split("&")
            # pages = query_string[1].split("=")
            match = re.search(r"page=(\d+)", pagination_format)
            page_num = int(match.group(1))
            page_dict = {
                "page": str(page_num),
                "url": self.ROOT_DOMAIN + pagination_format
            }
            self.page_links.append(page_dict)
            while page_num < int(max_page_num):
                page_num += 1
                query_string[1] = 'page=' + str(page_num)
                pack_query_string = '&'.join(query_string)
                page_dict = {
                    "page": str(page_num),
                    "url": self.ROOT_DOMAIN + pack_query_string
                }
                self.page_links.append(page_dict)
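    # Worked example (illustrative values, not from the original source):
    # with pagination_format = "/s?k=ssd&page=2" and max_page_num = 4,
    # query_string splits to ["/s?k=ssd", "page=2"], so the loop rewrites
    # the second element and self.page_links ends up with three entries:
    #   {"page": "2", "url": "http://www.amazon.com/s?k=ssd&page=2"}
    #   {"page": "3", "url": "http://www.amazon.com/s?k=ssd&page=3"}
    #   {"page": "4", "url": "http://www.amazon.com/s?k=ssd&page=4"}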
    def visit_page(self, product_list_url, page_num):
        print "Page-{}, {}".format(page_num, product_list_url)
        response = urllib2.urlopen(product_list_url)
        # As in Example 1, compare status codes with == rather than "is".
        if response.code == self.HTTP_STATUS_OK:
            page = Page()
            page.set_data(response.read())
            page.set_url(product_list_url)
            page.set_out_file(self.output_filename + "-page-" + str(page_num) + "-")
            page.scrap()
            print "{} - Completed".format(product_list_url)
        # list_page.set_response(response)
        # products = self.page.scrap()
    def scrap_all_products(self):
        # total_links = len(self.page_links)
        # print "Total Links : " + str(total_links)
        #
        # chunks = total_links / self.THREAD_NUM
        # remaining = total_links % self.THREAD_NUM
        #
        # print "Loop Required : " + str(chunks)
        # print "Starting Thread Count :" + str(self.THREAD_NUM)
        # print "Remaining Count : " + str(remaining)
        #
        # for i in range(0, chunks):
        #     for idx in range(self.THREAD_NUM):
        #         # print idx
        #         link = self.page_links[idx]
        #         page_num = link["page"]
        #         page_url = link["url"]
        #
        #         print "Starting Thread for url: " + page_url
        #         # t = threading.Thread(target=self.visit_page, args=(page_url, page_num,))
        #         # self.threads.append(t)
        #         # t.start()
        #
        # start_index = self.THREAD_NUM * chunks
        #
        # ......... (part of the code omitted here) .........
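Since scrap_all_products appears only as commented-out fragments and the remainder is omitted, here is a hedged sketch of how the threaded fan-out it hints at could be completed. This is an illustration under stated assumptions, not the original author's code: it assumes visit_page has the signature shown above and simply runs batches of THREAD_NUM worker threads, joining each batch before starting the next:

import threading

def scrap_all_products(self):
    # Process the collected links in batches of THREAD_NUM threads.
    for start in range(0, len(self.page_links), self.THREAD_NUM):
        batch = self.page_links[start:start + self.THREAD_NUM]
        threads = []
        for link in batch:
            t = threading.Thread(target=self.visit_page,
                                 args=(link["url"], link["page"]))
            threads.append(t)
            t.start()
        # Wait for the whole batch before launching the next one.
        for t in threads:
            t.join()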