示例1: check_offsale_product
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def check_offsale_product(self, id, url):
    """Re-check an off-sale gilt product and flag it when it is buyable again.

    The product is looked up by key, its page is fetched, and the size
    selector is inspected: one entry whose CSS class contains ``for_sale``
    means the product is on sale again.

    :param id: key used for the ``Product`` lookup
    :param url: product page URL to fetch

    NOTE(review): the block structure and the early exits were reconstructed
    from an excerpt without indentation. No ``save()`` call is visible here,
    so persistence is presumably done elsewhere -- confirm.
    """
    prd = Product.objects(key=id).first()
    if prd is None:
        print('\n\ngilt {0}, {1}\n\n'.format(id, url))
        return

    ret = self.s.get(url, headers=self.headers)
    if ret.url == 'http://www.gilt.com/' or not ret.content:
        # Landing on the home page or getting an empty body is recorded as
        # sold out; there is no product page left to parse.
        prd.soldout = True
        prd.update_history.update({'soldout': datetime.utcnow()})
        return

    tree = self.get_correct_tree(ret.content)
    sizes = tree.cssselect('#sku-selection .sku-attribute .sku-attribute-values li')
    # An <li> without a class attribute yields None, hence the `or ''`.
    soldout = not any('for_sale' in (size.get('class') or '') for size in sizes)
    if soldout:
        return

    print('\n\ngilt product[{0}] on sale again.'.format(url))
    now = datetime.utcnow()
    if prd.soldout != soldout:
        prd.soldout = soldout
        prd.update_history.update({'soldout': now})
    # Give the re-listed product a fresh three day window.
    prd.products_end = now + timedelta(days=3)
    prd.update_history.update({'products_end': now})
    prd.on_again = True
示例2: check_onsale_product
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def check_onsale_product(self, id, url):
    """Compare a stored myhabit product against the data behind its jslink.

    Fetches ``prd.jslink``, extracts the JSON argument of the
    ``parse_asin_*(...)`` call and prints a message for every mismatch in
    list price, title or brand. Nothing is modified on the product.

    :param id: key used for the ``Product`` lookup
    :param url: only used in the "not found" message

    NOTE(review): the early exits were reconstructed from an excerpt without
    indentation -- confirm against the upstream file.
    """
    prd = Product.objects(key=id).first()
    if prd is None:
        print('\n\nmyhabit {0}, {1}\n\n'.format(id, url))
        return
    if not prd.jslink:
        print('myhabit product[{0}] has no jslink'.format(prd.combine_url))
        return

    ret = self.s.get(prd.jslink, headers=self.headers)
    match = re.search(r'parse_asin_\w+\((.*)\);$', ret.text)
    if match is None:
        # Unexpected payload: report it instead of failing on `.group`.
        print('myhabit product[{0}] jslink has no parse_asin data'.format(prd.combine_url))
        return
    detail = json.loads(match.group(1))['detailJSON']
    title = detail['title']
    brand = detail['brand']

    # 'ourPrice' is deliberately not compared: per the original author's
    # note, the value found there is not right.
    if 'listPrice' in detail and 'amount' in detail['listPrice']:
        listprice = float(detail['listPrice']['amount'])
        # A '-' in the stored value is skipped (presumably a price range).
        if '-' not in prd.listprice:
            stored = float(prd.listprice.replace('$', '').replace(',', ''))
            if listprice != stored:
                print('myhabit product[{0}] listprice error: {1} vs {2}'.format(prd.combine_url, prd.listprice, listprice))

    # Everything from the last '(' on is dropped from the stored title
    # before the case-insensitive comparison.
    if title.lower() != prd.title.rsplit('(', 1)[0].rstrip().lower():
        print('myhabit product[{0}] title error: [{1}] vs [{2}]'.format(prd.combine_url, prd.title.encode('utf-8'), title.encode('utf-8')))
    if brand != prd.brand:
        print('myhabit product[{0}] brand error: {1} vs {2}'.format(prd.combine_url, prd.brand, brand))
示例3: check_onsale_product
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def check_onsale_product(self, id, url):
    """Refresh list price, sale price and sold-out flag of a shopbop product.

    The page at ``url`` is fetched through ``self.fetch_page`` and parsed with
    lxml. Every field that differs from the stored value is overwritten and
    time-stamped in ``prd.update_history``.

    :param id: key used for the ``Product`` lookup
    :param url: product page URL

    NOTE(review): the early exits were reconstructed from an excerpt without
    indentation. No ``save()`` call is visible here -- confirm where the
    changes are persisted.
    """
    prd = Product.objects(key=id).first()
    if prd is None:
        print('\n\nshopbop {0}, {1}\n\n'.format(id, url))
        return

    ret = self.fetch_page(url)
    if isinstance(ret, int):
        # An int result is treated as a failed download (presumably an
        # HTTP status code); it must not be handed to the HTML parser.
        print("\n\nshopbop download product page error: {0}".format(url))
        return

    def _amount(node):
        # Strip thousands separators, the currency sign and whitespace.
        return node.text_content().replace(',', '').replace('$', '').strip()

    tree = lxml.html.fromstring(ret)
    listprice = price = None
    for price_node in tree.cssselect('div#productPrices div.priceBlock'):
        sale_nodes = price_node.cssselect('span.salePrice')
        if sale_nodes:
            price = _amount(sale_nodes[0])
            continue
        retail_nodes = price_node.cssselect('span.originalRetailPrice')
        if retail_nodes:
            listprice = _amount(retail_nodes[0])
    soldout = bool(tree.cssselect('img#soldOutImage'))

    if listprice and prd.listprice != listprice:
        prd.listprice = listprice
        prd.update_history.update({'listprice': datetime.utcnow()})
    # NOTE(review): unlike listprice, price is written even when no
    # salePrice node was found (price is None) -- confirm this is intended.
    if prd.price != price:
        prd.price = price
        prd.update_history.update({'price': datetime.utcnow()})
    if prd.soldout != soldout:
        prd.soldout = soldout
        prd.update_history.update({'soldout': datetime.utcnow()})
示例4: publish_old_stuff
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def publish_old_stuff(self):
    """Push the sold-out flag of every product whose published state is stale.

    A product is published when it has no publish time, when its update
    history has no 'soldout' entry, or when that entry is newer than the
    publish time. The publish time is then set to the current UTC time.
    """
    for prd in Product.objects():
        history = prd.update_history
        up_to_date = (
            prd.publish_time is not None
            and 'soldout' in history
            and not (prd.publish_time < history['soldout'])
        )
        if up_to_date:
            continue
        # The remote resource id is the last path segment of prd.muri.
        resource_id = prd.muri.strip('/').split('/')[-1]
        self.api.product(resource_id).patch({'sold_out': prd.soldout})
        prd.publish_time = datetime.utcnow()
示例5: crawl_listing
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
# crawl_listing: download the listing page at `url`, parse it with lxml and
# create or refresh one Product per link found under div#searchResults.
# The category is looked up by kwargs['key']; results are reported through the
# common_failed / common_saved signals with `ctx` as the sender.
# NOTE(review): this excerpt has lost its indentation, and some control lines
# seem to be missing (nothing visibly exits after the common_failed.send calls,
# and the final `if last_node:` has no body). Restore the structure from the
# upstream file before relying on this function.
def crawl_listing(self, url, ctx='', **kwargs):
res = requests.get(url)
tree = lxml.html.fromstring(res.content)
category = Category.objects(key=kwargs.get('key')).first()
# An unknown category is reported as a failure.
if not category:
common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
product_nodes = tree.cssselect('div#searchResults a')
for product_node in product_nodes:
price = None; listprice = None
# Sale price comes from '.price-6pm'; list price is the joined text of '.discount'.
price = product_node.cssselect('.price-6pm')[0].text
listprice_node = product_node.cssselect('.discount')
listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None
# eliminate products of no discount
# NOTE(review): the original comment ended in "IndexError:", which looks like
# the remains of an `except IndexError:` line fused in by the extraction.
if price is None or listprice is None:
# common_failed.send(sender=ctx, url=url, \
# reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
# The product key is the data-product-id attribute plus the last path segment of the href.
key = product_node.get('data-product-id')
if not key:
common_failed.send(sender=ctx, url=url, reason='listing product has no key')
combine_url = product_node.get('href')
key = '%s_%s' % (key, combine_url.split('/')[-1])
# An href without an http(s) URL in it gets HOST prepended.
match = re.search(r'https?://.+', combine_url)
if not match:
combine_url = '%s%s' % (HOST, combine_url)
brand = product_node.cssselect('.brandName')[0].text.strip()
title = product_node.cssselect('.productName')[0].text.strip()
is_new = False; is_updated = False
# Reuse the stored product with this key, or start a new one.
product = Product.objects(key=key).first()
if not product:
is_new = True
product = Product(key=key)
product.updated = False
product.event_type = False
# Copy each scraped field that is non-empty and differs from the stored value.
if title and title != product.title:
product.title = title
is_updated = True
if brand and brand != product.brand:
product.brand = brand
is_updated = True
if combine_url and combine_url != product.combine_url:
product.combine_url = combine_url
is_updated = True
if price and price != product.price:
product.price = price
is_updated = True
if listprice and listprice != product.listprice:
product.listprice = listprice
is_updated = True
# Merge the category's cats into the product's dept list.
if category.cats and set(category.cats).difference(product.dept):
product.dept = list(set(category.cats) | set(product.dept or []))
is_updated = True
# NOTE(review): only the flag is set here; no visible line adds category.key
# to product.category_key -- confirm against the upstream file.
if category.key not in product.category_key:
is_updated = True
if is_updated:
product.list_update_time = datetime.utcnow()
# To pick the product which fit our needs, such as a certain discount, brand, dept etc.
selected = Picker(site='6pm').pick(product)
if not selected:
product.hit_time = datetime.utcnow()
# is_updated is reported as False for a newly created product.
common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
is_new=is_new, is_updated=((not is_new) and is_updated) )
# Debug dump of the product's main fields.
print product.key; print product.brand; print product.title; \
print product.price, ' / ', product.listprice; print product.combine_url; \
print product.dept; print
# Go to the next page to keep on crawling.
next_page = None
page_node = tree.cssselect('div.pagination')
# NOTE(review): as written, page_node[0] is read right after the emptiness
# check; an exit line presumably followed `if not page_node:` -- confirm.
if not page_node:
last_node =page_node[0].cssselect('.last')
# NOTE(review): the body of this branch is not part of this excerpt.
if last_node:
示例6: crawl_product
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def crawl_product(self, url, ctx='', **kwargs):
    """Crawl one product page and refresh the stored ``Product`` from it.

    Collects image URLs, the description list, list/sale price and shipping
    text from the page, copies changed values onto the product and reports
    the outcome through the ``common_saved`` signal.

    :param url: product page URL
    :param ctx: passed on as ``sender`` to the signals
    :param kwargs: must contain ``key``, the key of an existing ``Product``

    NOTE(review): reconstructed from an excerpt without indentation. The exit
    after the "not exists" report and the storing of new image URLs were
    inferred. No ``save()`` call is visible here -- confirm.
    """
    key = kwargs.get('key')
    product = Product.objects(key=key).first()
    if not product:
        print('product not exists -> %s' % kwargs)
        common_failed.send(sender=ctx, url=url, reason='product not exists -> %s' % kwargs)
        return

    res = requests.get(url, params={'zfcTest': 'mat:1'})
    tree = lxml.html.fromstring(res.content)
    # This method never creates a product, so is_new stays False.
    is_new = False
    is_updated = False
    ready = False

    theater_node = tree.cssselect('div#theater')[0]
    stage_node = theater_node.cssselect('div#productStage')[0]

    # Original display/large images: the thumbnail link without the
    # '_THUMBNAILS' marker.
    for image_node in stage_node.cssselect('div#productImages > ul > li a'):
        image_url = re.sub('_THUMBNAILS', '', image_node.get('href'))
        if image_url not in product.image_urls:
            # Store the URL; otherwise every crawl reports the same image
            # as an update again.
            product.image_urls.append(image_url)
            is_updated = True

    # Description list: <sup>/<sub> tags are stripped before the text nodes
    # are collected, so their content stays inline.
    description_node = stage_node.cssselect('div#prdInfo > div#prdInfoText > div.description ul')[0]
    description_html = re.sub('</?sup>|</?sub>', '', lxml.html.tostring(description_node))
    text_nodes = lxml.html.fromstring(description_html).xpath('.//text()')
    list_info = [text.replace('\n', '') for text in text_nodes if text != '\n']

    # Original and sale price.
    sale_info_node = theater_node.cssselect('div#productForm form#prForm ul')[0]
    price_node = sale_info_node.cssselect('li#priceSlot')[0]
    listprice = price_node.cssselect('.oldPrice')[0].text.strip()
    price = price_node.cssselect('.price')[0].text.strip()

    # Shipping info.
    shipping = ''.join(sale_info_node.cssselect('li.shipping a')[0].xpath('.//text()'))

    # Update the product.
    if list_info and list_info != product.list_info:
        product.list_info = list_info
        is_updated = True
    if price and price != product.price:
        product.price = price
        is_updated = True
    # The list price is only filled in when none is stored yet.
    if listprice and not product.listprice:
        product.listprice = listprice
        is_updated = True
    if shipping != product.shipping:
        product.shipping = shipping
        is_updated = True

    if is_updated:
        # The first full update of a product marks it as ready.
        if not product.updated:
            ready = True
        product.updated = True
        product.full_update_time = datetime.utcnow()

    common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                      is_new=is_new, is_updated=((not is_new) and is_updated), ready=ready)

    # Debug dump, one value per line.
    for value in (product.dept, product.image_urls, product.brand, product.title,
                  product.listprice, product.price, product.shipping,
                  is_new, is_updated, ready, product.updated):
        print(value)
示例7: _parse_product
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def _parse_product(self, event_id, asins, cAsins, prefix_url, product_data, ctx):
    """Create or refresh the Product for one cAsin of an event.

    No video info, list_info or summary is collected here.

    :param event_id: this product belongs to the event's id
    :param asins: all asins info in this event
    :param cAsins: all casins info in this event (not read by this code)
    :param prefix_url: image and js prefix_url, probably 'http://z-ecx.images-amazon.com/images/I/'
    :param product_data: product data in this product
    :param ctx: passed on as ``sender`` to the ``common_saved`` signal
    :returns: the cAsin, which is also the product key

    NOTE(review): reconstructed from an excerpt without indentation. The
    ``else`` branches of the sold-out checks, and stopping at the first
    size that is still available, were inferred -- confirm upstream.
    """
    asin = product_data['asin']
    casin = product_data['cAsin']
    title = product_data['title'].encode('utf-8')  # color is in title
    if 'listPrice' in product_data:
        # or 'amount'; with isRange: True it is unknown what 'amount' holds
        listprice = product_data['listPrice']['display']
    else:
        listprice = ''
    price = product_data['ourPrice']['display']

    sizes = []
    if product_data['teenagers']:  # {} when the product has no sizes
        for variant in product_data['teenagers'].values():
            if variant['size'] not in sizes:
                sizes.append(variant['size'])

    # One soldout link covers all colors of this asin.
    soldout_link = 'http://www.myhabit.com/request/getBuyableAsinInfo?asin={0}&saleId={1}&flavor=parent&sid=177-4704555-7345351'.format(asin, event_id)
    ret = req.get(soldout_link)
    buyable = json.loads(ret.content)['buyableAsin']

    def _none_left(entry_key):
        # The source treats remaining.claimed == 0 as "sold out".
        return buyable[entry_key]['stats']['remaining']['claimed'] == 0

    len_sizes = len(sizes)
    soldout = False
    if len_sizes == 0:
        soldout = _none_left(casin)
    else:  # more than one size
        key_list = sorted(k for k in buyable if k not in ('asin', 'privateSaleID'))
        count = 0
        # Inspect len_sizes entries of the sorted key list, starting at this
        # cAsin; the product is sold out only if all of them are.
        for entry_key in key_list:
            if entry_key == casin or 0 < count < len_sizes:
                count += 1
                soldout = _none_left(entry_key)
                if not soldout:
                    break

    jslink = prefix_url + asins[asin]['url'] if asin in asins else ''
    combine_url = 'http://www.myhabit.com/homepage#page=d&sale={0}&asin={1}&cAsin={2}'.format(event_id, asin, casin)

    is_new, is_updated = False, False
    product = Product.objects(key=casin).first()
    if not product:
        is_new = True
        product = Product(key=casin)
        product.combine_url = combine_url
        product.asin = asin
        product.title = title
        product.listprice = listprice
        product.price = price
        product.sizes = sizes
        product.soldout = soldout
        product.updated = False

    # For a product created just above, none of these comparisons fires.
    # Only a switch to sold out counts as an update in the signal.
    if soldout and product.soldout != soldout:
        product.soldout = True
        is_updated = True
        product.update_history.update({'soldout': datetime.utcnow()})
    if product.title != title:
        product.title = title
        product.update_history.update({'title': datetime.utcnow()})
    if product.combine_url != combine_url:
        product.combine_url = combine_url
        product.update_history.update({'combine_url': datetime.utcnow()})
    if product.listprice != listprice:
        product.listprice = listprice
        product.update_history.update({'listprice': datetime.utcnow()})
    if product.price != price:
        product.price = price
        product.update_history.update({'price': datetime.utcnow()})
    if event_id not in product.event_id:
        product.event_id.append(event_id)
    product.jslink = jslink
    product.list_update_time = datetime.utcnow()

    common_saved.send(sender=ctx, obj_type='Product', key=casin, url=product.combine_url, is_new=is_new, is_updated=is_updated)
    return casin
示例8: get_product_abstract_by_url
# 需要导入模块: from models import Product [as 别名]
# 或者: from models.Product import objects [as 别名]
def get_product_abstract_by_url(self, url):
    """Return ``(key, abstract)`` for the gilt product page at ``url``.

    The product JSON embedded in the page is located with one of two
    patterns. ``new Gilt.Product(...)`` is tried first, because it is the
    value the original code ended up using; ``product.init(...)`` is the
    fallback, so a page carrying only one of the two forms still parses.

    :param url: product page URL containing ``/<digits>-``
    :returns: tuple of ``'gilt_<id>'`` and ``title + '\\n' + description``
    :raises AttributeError: when the URL or the page does not match the
        expected patterns (same exception type as the original code)
    """
    content = self.s.get(url, headers=self.headers).content
    product_id = re.compile(r'/(\d+)-').search(url).group(1)

    match = None
    for pattern in (r'new Gilt.Product\((.*?)[\)]+;', r'product.init\((.*?)[\)]+;'):
        match = re.search(pattern, content)
        if match:
            break
    if match is None:
        raise AttributeError('no product data found in page {0}'.format(url))

    product_info = json.loads(match.group(1))
    title = product_info['name'].encode('utf-8')
    description = product_info['description'].replace('<br>', '\n').encode('utf-8')
    return 'gilt_' + product_id, title + '\n' + description
if __name__ == '__main__':
    # Re-check every product whose sale window has ended, and every product
    # that has no sale window at all.
    check = CheckServer()

    expired = Product.objects(products_end__lt=datetime.utcnow()).timeout(False)
    print('have {0} off sale event products.'.format(expired.count()))

    undated = Product.objects(products_end__exists=False).timeout(False)
    print('have {0} off sale category products.'.format(undated.count()))

    for queryset in (expired, undated):
        for item in queryset:
            check.check_offsale_product(item.key, item.url())