# Populate Product objects from an Amazon item search restricted to one
# browse node.
#
# Parameters:
#   search_index    -- passed through as the first argument of api.item_search().
#   amazon_node_id  -- browse node id; sent as the BrowseNode parameter (as a
#                      string) and used to look up the matching Category.
#
# NOTE(review): this listing has lost its indentation and some lines (the
# calls on the next two code lines are never closed, and there is no `try:`
# for the `except` near the end). The comments below describe only what the
# visible lines do; confirm against the original file before relying on them.
# The `except AWSError, e:` form and `unicode()` are Python 2 only.
def fetch_category(search_index, amazon_node_id):
# NOTE(review): constructor arguments are not visible in this listing.
api = caching.ResponseCachingAPI(
# NOTE(review): the remaining item_search() arguments are not visible.
for root in api.item_search(search_index, BrowseNode=str(amazon_node_id),
for item in root.Items.Item:
product = Product()
# The Category lookup is repeated for every item with the same node id.
product.category = Category.objects.get(amazon_node_id=amazon_node_id)
product.asin = item.ASIN
product.title = unicode(item.ItemAttributes.Title)
product.detailpageurl = unicode(item.DetailPageURL)
# NOTE(review): when the attribute is absent, getattr() yields None and
# unicode(None) is the text u'None', so the field stores that string rather
# than None. The same applies to publisher, brand and description below.
product.manufacturer = unicode(getattr(item.ItemAttributes, 'Manufacturer', None))
product.publisher = unicode(getattr(item.ItemAttributes, 'Publisher', None))
product.brand = unicode(getattr(item.ItemAttributes, 'Brand', None))
# Items without a SalesRank get a popularity of 1000.
product.popularity = getattr(item, 'SalesRank', 1000)
if hasattr(item, 'MediumImage'):
product.medium_image = getattr(item.MediumImage, 'URL', None)
if hasattr(item, 'LargeImage'):
product.large_image = getattr(item.LargeImage, 'URL', None)
if hasattr(item, 'EditorialReviews'):
product.description = unicode(getattr(item.EditorialReviews.EditorialReview, 'Content', None))
# Price preference order: current offer price, then list price, then the
# lowest used price (prefixed with "used from").
if hasattr(item.Offers, 'Offer'):
product.price = item.Offers.Offer.OfferListing.Price.FormattedPrice.pyval
elif hasattr(item.ItemAttributes, 'ListPrice'):
product.price = item.ItemAttributes.ListPrice.FormattedPrice.pyval
elif hasattr(item.OfferSummary, 'LowestUsedPrice'):
product.price = u'used from %s' % item.OfferSummary.LowestUsedPrice.FormattedPrice.pyval
# NOTE(review): presumably the body of a dropped `else:` branch; as listed it
# would overwrite every price with None. No save call is visible either.
product.price = None
# NOTE(review): the `try:` that this handler belongs to is not visible.
except AWSError, e:
if e.code == 'AWS.ParameterOutOfRange':
pass # reached the api limit of 10 pages
# NOTE(review): presumably inside a dropped `else:` so that only other AWS
# errors are re-raised as ValidationError -- confirm.
raise ValidationError(message=e.msg)
Example 2: crawl_listing
# Required import: from models import Product [as alias]
# Or: from models.Product import brand [as alias]
# Crawl one listing page: read each product link under div#searchResults,
# create or update the matching Product, and emit common_saved /
# common_failed signals.
#
# Parameters:
#   url     -- listing page URL fetched with requests.get().
#   ctx     -- passed as `sender` to the common_failed / common_saved signals.
#   kwargs  -- kwargs['key'] is the Category key looked up in the database.
#
# NOTE(review): this listing has lost its indentation and some lines (several
# `if` guards have no visible body, and the function ends on an `if` with no
# body). The comments describe only the visible lines; confirm against the
# original file. The `print` statements are Python 2 syntax.
def crawl_listing(self, url, ctx='', **kwargs):
res = requests.get(url)
tree = lxml.html.fromstring(res.content)
category = Category.objects(key=kwargs.get('key')).first()
# NOTE(review): only the failure signal is visible here; an early `return`
# was presumably dropped, since `category` is dereferenced further down.
if not category:
common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
product_nodes = tree.cssselect('div#searchResults a')
for product_node in product_nodes:
price = None; listprice = None
# [0] raises IndexError when the node has no '.price-6pm' element.
price = product_node.cssselect('.price-6pm')[0].text
listprice_node = product_node.cssselect('.discount')
listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None
# NOTE(review): the trailing "IndexError:" in the next comment looks like
# residue of a dropped `except IndexError:` line -- confirm.
# eliminate products of no discountIndexError:
# NOTE(review): the only visible body of this `if` is commented out;
# presumably a `continue` followed it.
if price is None or listprice is None:
# common_failed.send(sender=ctx, url=url, \
# reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
key = product_node.get('data-product-id')
if not key:
common_failed.send(sender=ctx, url=url, reason='listing product has no key')
combine_url = product_node.get('href')
# The stored key is "<data-product-id>_<last path segment of the href>".
key = '%s_%s' % (key, combine_url.split('/')[-1])
# Hrefs without an http(s):// scheme are prefixed with HOST.
match = re.search(r'https?://.+', combine_url)
if not match:
combine_url = '%s%s' % (HOST, combine_url)
brand = product_node.cssselect('.brandName')[0].text.strip()
title = product_node.cssselect('.productName')[0].text.strip()
is_new = False; is_updated = False
product = Product.objects(key=key).first()
if not product:
is_new = True
product = Product(key=key)
product.updated = False
product.event_type = False
# Each field is overwritten only when the scraped value is non-empty and
# differs from the stored one; any change sets is_updated.
if title and title != product.title:
product.title = title
is_updated = True
if brand and brand != product.brand:
product.brand = brand
is_updated = True
if combine_url and combine_url != product.combine_url:
product.combine_url = combine_url
is_updated = True
if price and price != product.price:
product.price = price
is_updated = True
if listprice and listprice != product.listprice:
product.listprice = listprice
is_updated = True
# Merge the category's `cats` into product.dept (set union).
if category.cats and set(category.cats).difference(product.dept):
product.dept = list(set(category.cats) | set(product.dept or []))
is_updated = True
# NOTE(review): only the flag is set here; a line adding category.key to
# product.category_key was presumably dropped -- confirm.
if category.key not in product.category_key:
is_updated = True
if is_updated:
product.list_update_time = datetime.utcnow()
# To pick the product which fit our needs, such as a certain discount, brand, dept etc.
selected = Picker(site='6pm').pick(product)
# NOTE(review): no body is visible for this `if` (presumably `continue`),
# and no product.save() call is visible before the signal below.
if not selected:
product.hit_time = datetime.utcnow()
# is_updated is reported as False for newly created products.
common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
is_new=is_new, is_updated=((not is_new) and is_updated) )
# Debug output of the scraped fields (Python 2 print statements).
print product.key; print product.brand; print product.title; \
print product.price, ' / ', product.listprice; print product.combine_url; \
print product.dept; print
# Go to the next page to keep on crawling.
next_page = None
page_node = tree.cssselect('div.pagination')
# NOTE(review): as listed, page_node[0] is evaluated right after the check
# for an empty result and would raise IndexError; an early `return` was
# presumably dropped between these two lines.
if not page_node:
last_node =page_node[0].cssselect('.last')
# NOTE(review): the body of this `if` (and the rest of the pagination logic)
# is not visible in this listing.
if last_node:
Example 3: crawl_product
# Required import: from models import Product [as alias]
# Or: from models.Product import brand [as alias]
# Crawl one product detail payload and create or update the Product whose key
# is `casin`, then emit a common_saved signal.
#
# Parameters:
#   url     -- URL fetched with req.get(); the response text is expected to
#              end with a `parse_asin_<name>(<json>);` call.
#   casin   -- product key; also matched against the 'asin' field of the
#              entries in data['detailJSON']['asins'].
#   ctx     -- passed as `sender` to the common_saved signal.
#   kwargs  -- not used in the visible code.
#
# NOTE(review): this listing has lost its indentation and some lines (the
# image loops have no visible body, and an `else:` appears to be missing).
# The comments describe only the visible lines; confirm against the original.
def crawl_product(self, url, casin, ctx='', **kwargs):
r = req.get(url)
# Capture the argument of the trailing parse_asin_*() call. search() returns
# None when the pattern is absent, so .group(1) would raise AttributeError.
data = re.compile(r'parse_asin_\w+\((.*)\);$').search(r.text).group(1)
data = json.loads(data)
image_urls = []
# NOTE(review): the bodies of the two de-duplication checks below are not
# visible (presumably image_urls.append(...) of the zoomImage) -- confirm.
for i in data['detailJSON']['main']['altviews']:
if i['zoomImage'] not in image_urls:
# Fallback when the main altviews gave no images: use the altviews of the
# 'asins' entry whose asin equals casin.
if not image_urls:
for i in data['detailJSON']['asins']:
if i['asin'] == casin:
for j in i['altviews']:
if j['zoomImage'] not in image_urls:
# `asin` is not read again in the visible code.
asin = data['detailJSON']['asin']
summary = data['productDescription']['shortProdDesc']
if data['productDescription']['bullets']:
# NOTE(review): the replace() arguments look like HTML entities that were
# decoded when this listing was extracted; as shown the quoting is broken.
list_info = [i.replace('"', '"').replace(''', '\'') for i in data['productDescription']['bullets'][0]['bulletsList']]
# NOTE(review): presumably the body of a dropped `else:`; as listed it would
# always reset list_info to an empty list.
list_info = []
brand = data['detailJSON']['brand']
returned = data['detailJSON']['returnPolicy']
# Disabled: derive the shipping text from the 'intlShippable' flags.
# if 'intlShippable' in data['detailJSON']:
# shipping = 'international shipping' if data['detailJSON']['intlShippable'] == 1 else 'no international shipping'
# elif 'choices' in data['detailJSON']:
# for i in data['detailJSON']['choices']:
# if i['asin'] == casin:
# shipping = 'international shipping' if i['intlShippable'] == 1 else 'no international shipping'
# break
# shipping = shipping if shipping else ''
# First video URL of the 'asins' entry matching casin, or '' when it has none.
video = ''
for p in data['detailJSON']['asins']:
if p['asin'] == casin:
video = p['videos'][0]['url'] if p['videos'] else ''
# is_updated is never set to True in the visible code, so the signal below
# always reports is_updated=False.
is_new, is_updated = False, False
product = Product.objects(key=casin).first()
if not product:
is_new = True
product = Product(key=casin)
product.summary = summary
product.list_info = list_info
product.brand = brand
# Shipping text is a fixed string rather than being read from the payload.
product.shipping = 'FAST, FREE SHIPPING, FREE RETURN SHIPPING in the U.S.'
product.returned = returned
product.video = video
product.image_urls = image_urls
product.full_update_time = datetime.utcnow()
# `ready` is True only on the call that flips product.updated from False to
# True. NOTE(review): no product.save() call is visible in this listing.
if product.updated == False:
product.updated = True
ready = True
else: ready = False
common_saved.send(sender=ctx, obj_type='Product', key=casin, url=url, is_new=is_new, is_updated=is_updated, ready=ready)