本文整理汇总了Python中product_spiders.items.ProductLoader.load_item方法的典型用法代码示例。如果您正苦于以下问题：Python ProductLoader.load_item方法的具体用法？Python ProductLoader.load_item怎么用？Python ProductLoader.load_item使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类product_spiders.items.ProductLoader的用法示例。
在下文中一共展示了ProductLoader.load_item方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield one Product per row of the ``productListing`` table.

    The rows are first selected with the XPath selector.  When fewer than 20
    rows come back, the page is parsed again with BeautifulSoup (per the
    original author's note: the product list sometimes cannot be parsed
    using lxml).

    In the BeautifulSoup path, a missing table ends the callback and rows
    without the expected cells, link or numeric price are skipped, so one
    malformed row no longer aborts the whole page.

    Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # The first <tr> is dropped in both paths -- presumably the header row.
    rows = hxs.select('//table[@class="productListing"]/tr')[1:]
    if len(rows) >= 20:
        for row in rows:
            product_loader = ProductLoader(item=Product(), selector=row)
            product_loader.add_xpath('name', './td[position()=2]/a/text()')
            product_loader.add_xpath('url', './td[position()=2]/a/@href')
            product_loader.add_xpath('price', './td[position()=3]', re='\xa3(.*[0-9])')
            yield product_loader.load_item()
        return

    # if the product list can not be parsed using lxml, use BeautifulSoup
    soup = BeautifulSoup(response.body)
    table = soup.find('table', {'class': 'productListing'})
    if table is None:
        # No listing table on this page: nothing to yield.
        return
    for row in table.findAll('tr')[1:]:
        cells = row.findAll('td')
        if len(cells) < 3:
            continue
        link = cells[1].find('a')
        if link is None:
            continue
        amounts = re.findall(r'[0-9.]+', cells[2].text)
        if not amounts:
            continue
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', link.contents)
        product_loader.add_value('url', link['href'])
        product_loader.add_value('price', amounts[0])
        yield product_loader.load_item()
示例2: parse
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse(self, response):
    """Follow "next page" links and yield a Product per listing anchor.

    Every href found under the ``paggingBtnNext`` span is requested again
    with this same callback.  Each ``<a>`` inside the ``mainProducts`` div
    that has a ``productName`` span becomes one item; the price is read
    from the ``orange`` span and, when that is absent, from the ``gray``
    span.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    next_links = hxs.select("//div[@class='navArea']/div[@class='navAreaPagging fr']/span[@class='paggingBtnNext']/a/@href").extract()
    for href in next_links:
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)
    content = hxs.select("//div[@class='mainProducts']")
    for anchor in content.select(".//a"):
        name = anchor.select(".//ul/li/span[@class='productName']/text()").extract()
        if not name:
            # Anchors without a product name are not product tiles.
            continue
        url = anchor.select(".//@href").extract()
        price = anchor.select(".//ul//li/ul/li[1]/span[@class='orange']/text()").re(r'\xa3(.*)')
        if not price:
            price = anchor.select(".//ul/li/ul/li[1]/span[@class='gray']/text()").re(r'\xa3(.*)')
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        # load_item() is called once; the original called it twice and
        # discarded the first result.
        yield loader.load_item()
    # NOTE(review): the original listing continued here with an opening
    # triple quote followed by
    #     content = hxs.select("//div[@class='mainProducts']")
    # i.e. a commented-out block that the snippet cut off.  It is kept as a
    # comment because the unterminated string made the file unparseable.
示例3: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield products from a page with or without per-option price blocks.

    When ``//form/div[@id="price"]`` blocks exist, each block that has a
    ``price1`` paragraph yields one item named after its ``<h4>``.
    Otherwise the page is treated as a single product named after the
    ``<h1>`` of the ``product`` div; nothing is yielded when no price is
    found.  Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return
    selector = HtmlXPathSelector(response)
    # products
    price_blocks = selector.select(u'//form/div[@id="price"]')

    if price_blocks:
        for block in price_blocks:
            loader = ProductLoader(item=Product(), selector=block)
            loader.add_xpath('name', u'./h4/text()')
            loader.add_value('url', response.url)
            block_price = block.select(u'.//p[@class="price1"]/text()').re('\xa3(.*[0-9])')
            if block_price:
                loader.add_value('price', block_price)
                yield loader.load_item()
        return

    # Single-product page.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="product"]/h1/text()')
    single_price = selector.select(u'//div[@class="product"]//p[@class="price1"]/text()').re(u'\xa3(.*)')
    if single_price:
        loader.add_value('price', single_price)
        yield loader.load_item()
示例4: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield one Product for a simple page, or one per row of the option table.

    The product name comes from the ``product-name fn`` heading.  When the
    ``super-product-table`` has rows, every row yields an item named
    ``"<name> <first cell text>"``; otherwise a single item is yielded.

    Both the regular and the special price XPaths are fed to the loader;
    which value ends up in the item depends on ProductLoader's processors,
    which are not visible here.

    A page without a product name is logged and skipped, and table rows
    without text in their first cell are skipped, instead of raising
    IndexError.  Non-HTML responses are ignored.
    """
    import logging  # local import: the enclosing module's imports are not visible here

    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    names = hxs.select(u'//div[@class="product-name fn"]/h1/text()').extract()
    if not names:
        logging.error("NO NAME! %s" % response.url)
        return
    name = names[0]
    multiple_prices = hxs.select(u'//table[@id="super-product-table"]//tr')
    if not multiple_prices:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price', u'//div[@class="price-box"]/span[contains(@id,"product-price")]/span[@class="price"]/text()',
                                 re='\xa3(.*[0-9])')
        product_loader.add_xpath('price', u'//div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                                 re='\xa3(.*[0-9])')
        yield product_loader.load_item()
        return

    for name_and_price in multiple_prices:
        option_names = name_and_price.select(u'./td[position()=1]/text()').extract()
        if not option_names:
            # Row without a text cell in first position: not an option row.
            continue
        product_loader = ProductLoader(item=Product(), selector=name_and_price)
        product_loader.add_value('name', name + ' ' + option_names[0])
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/span[@class="regular-price"]/span[@class="price"]/text()',
                                 re=u'\xa3(.*)')
        product_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                                 re=u'\xa3(.*)')
        yield product_loader.load_item()
示例5: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield products from a multi-product table or a single product page.

    A page is treated as a multi-product page when a ``<td>`` whose text
    is ``Item#`` exists; each ``Multi-Child_Background`` row of that table
    then yields one item (sku from the first cell, name from the second,
    price from the ``<b>`` text found in the row's cells).

    Otherwise the ``v65-product-parent`` table is used, and an item is
    yielded only when a price node is present.

    Rows with fewer than two cells and pages without the product table are
    skipped instead of raising IndexError.
    """
    hxs = HtmlXPathSelector(response)
    # detect multiple product page
    table_header = hxs.select("//td[text()='Item#']")
    if table_header:
        for sub_product in table_header.select("../../tr[@class='Multi-Child_Background']"):
            cells = sub_product.select("td")
            if len(cells) < 2:
                continue
            loader = ProductLoader(Product(), sub_product)
            loader.add_value('sku', cells[0].select("text()").extract())
            loader.add_value('name', cells[1].select("text()").extract())
            loader.add_value('price', cells.select("b/text()").extract())
            loader.add_value('url', response.url)
            yield loader.load_item()
        return

    product_nodes = hxs.select('//table[@id="v65-product-parent"]')
    if not product_nodes:
        return
    product_node = product_nodes[0]
    price_node = product_node.select(".//font[@class='pricecolor colors_productprice']/text()")
    # Unavailable products are still online but have no price
    if not price_node:
        return
    loader = ProductLoader(selector=product_node, item=Product())
    loader.add_xpath('name', './/font[@class="productnamecolorLARGE colors_productname"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('price', price_node.extract())
    # The sku is looked up from the page-level selector, as in the original.
    sku = ''.join(hxs.select('.//span[@class="product_code"]/text()').extract()).strip()
    loader.add_value('sku', sku)
    yield loader.load_item()
示例6: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield one Product for a simple page, or one per option row.

    Option rows are the ``<tr>`` elements of the table inside
    ``spec-with-options`` with the first row dropped (presumably the
    header).  Each option is named ``"<title> <second cell text>"``.

    A page without a ``product-title`` is logged and skipped, and option
    rows without text in their second cell are skipped, instead of raising
    IndexError.  Non-HTML responses are ignored.
    """
    import logging  # local import: the enclosing module's imports are not visible here

    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    multiple_options = hxs.select(u'//div[@id="spec-with-options"]//table//tr')[1:]
    titles = hxs.select('//div[@id="product-title"]/text()').extract()
    if not titles:
        logging.error("NO NAME! %s" % response.url)
        return
    name = titles[0]
    if not multiple_options:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price', u'//div[@class="price-now"]/span[contains(@id,"product-price")]/text()',
                                 re='\xa3(.*)')
        yield product_loader.load_item()
        return

    for option in multiple_options:
        option_names = option.select('./td[position()=2]/text()').extract()
        if not option_names:
            continue
        product_loader = ProductLoader(item=Product(), selector=option)
        product_loader.add_value('name', name + ' ' + option_names[0])
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('price', './/div[@class="price-now"]/span/text()', re='\xa3(.*)')
        yield product_loader.load_item()
示例7: parse_item
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_item(self, response):
    """Yield the product, or one product per ``product_options`` choice.

    Each option text is split at the first ``(``: the part before it is
    appended to the product name, and when the part after it starts with
    ``+`` or ``-`` its digits are added to / subtracted from the base
    price.

    A page without a name or price is logged and skipped.  A ``+``/``-``
    modifier that contains no digits leaves the base price unchanged
    instead of raising ValueError from ``float('')``.
    """
    import logging  # local import: the enclosing module's imports are not visible here

    hxs = HtmlXPathSelector(response)
    names = hxs.select("//div[@class='details']//h1/text()").extract()
    prices = hxs.select("//div[@class='details']//span[@id='product_price']/text()").extract()
    url = response.url
    if not names or not prices:
        logging.error("NO NAME OR PRICE! %s" % url)
        return
    name = names[0]
    price = prices[0]

    def build_item(item_name, item_price):
        # Shared loader boilerplate for the option and no-option paths.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', item_name)
        loader.add_value('url', url)
        loader.add_value('price', item_price)
        return loader.load_item()

    options = hxs.select("//td[@class='property-value']//select[starts-with(@name, 'product_options')]/option/text()").extract()
    if not options:
        yield build_item(name, price)
        return

    for opt in options:
        label, _, modifier = opt.partition('(')
        opt_price = price
        if modifier and modifier[0] in ('-', '+'):
            addon = re.sub(r'[^0-9.]', '', modifier)
            if addon:
                # NOTE(review): float(price) still raises if the page price
                # is not a bare number -- unchanged from the original.
                if modifier[0] == '-':
                    opt_price = str(float(price) - float(addon))
                else:
                    opt_price = str(float(price) + float(addon))
        yield build_item(name + " - " + label.strip(), opt_price)
示例8: browse_and_parse
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def browse_and_parse(self, response):
    """Crawl category links and the next page, and yield listed products.

    * Every link under ``navColumnOne`` not yet in ``self.navig_url_set``
      is recorded there and requested with this same callback.
    * The first link in the top listing links whose text contains
      ``Neste`` is requested as the next page.  Its href is joined with
      the base URL, like the category links, so a relative href no longer
      produces an invalid Request.
    * Each ``productListing-even`` / ``productListing-odd`` row yields one
      item; a page carrying ``h2#productPrices`` also yields a
      single-product item.

    Rows or price texts that do not have the expected layout are skipped
    instead of raising IndexError.
    """
    def parse_price(text):
        # Take the part before the first "-", then the second
        # space-separated token; strip "." and turn "," into ".".
        # Returns None when that token does not exist.
        tokens = text.split("-")[0].split(" ")
        if len(tokens) < 2:
            return None
        return tokens[1].replace('.', '').replace(',', '.')

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url not in self.navig_url_set:
            self.navig_url_set.add(subsubcat_url)
            yield Request(subsubcat_url, callback=self.browse_and_parse)
    next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href")
    if next_page:
        next_url = urlparse.urljoin(base_url, next_page[0].extract())
        yield Request(next_url, callback=self.browse_and_parse)
    # parse product listing in this page, if any
    for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
        urls = tr.select(".//td[2]//a/@href").extract()
        names = tr.select(".//td[2]//a/text()").extract()
        price_texts = tr.select(".//td[3]/text()").extract()
        if not (urls and names and price_texts):
            continue
        price = parse_price(price_texts[0])
        if price is None:
            continue
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', urls[0])
        product_loader.add_value('name', names[0])
        product_loader.add_value('price', price)
        yield product_loader.load_item()
    # edge case: product listing page with a single product
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        price = parse_price(product_price[0])
        if price is not None:
            # this product listing page contains a single product
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', price)
            yield product_loader.load_item()
示例9: parse_item
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_item(self, response):
    """Yield one Product per ``outlet_size`` option, or the plain product.

    For every ``product-view hproduct`` div: when the page has an options
    ``<select>``, the JSON passed to ``spConfig...( ... )`` in the page's
    scripts is parsed, and each option of the attribute whose code is
    ``outlet_size`` yields an item priced from ``childProducts``.  The
    attribute codes seen are passed to ``self.save_attr``.  When no option
    item was produced, a single item with the regular price is yielded.

    If the config has no ``outlet_size`` attribute, the code now falls
    through to the single-product path.  The original reused the XPath
    option nodes as if they were JSON option dicts.  Items without a title
    or regular price are skipped instead of raising IndexError.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    for item in hxs.select("//div[@class='product-view hproduct']"):
        titles = item.select("form/div[@id='pageTitle']/h1/text()").extract()
        if not titles:
            continue
        title = titles[0]
        # check if there is options element
        option_nodes = item.select(
            "form/div[@id='pageTitle']/div[@id='addToCart']/"
            "fieldset[@id='product-options-wrapper']/dl/dd[1]/select/option")
        found_products = False
        if option_nodes:
            # find options content in JS; the last matching script wins
            product_config = None
            for script in hxs.select("//script"):
                m = re.search(r"spConfig.*?\((.*)\)", script.extract())
                if m:
                    product_config = m.group(1)
            if product_config:
                product_config = json.loads(product_config)
                child_products = product_config['childProducts']
                attr_codes = []
                size_options = []
                for attr in product_config['attributes'].values():
                    attr_codes.append(attr['code'])
                    if attr['code'] == 'outlet_size':
                        size_options = attr['options']
                # add products
                for option in size_options:
                    found_products = True
                    price = child_products[option['products'][0]]['price']
                    name = title + " " + option['label']
                    l = ProductLoader(item=Product(), response=response)
                    # NOTE(review): str() on a non-ASCII unicode name raises
                    # under Python 2 -- confirm names are ASCII.
                    l.add_value('identifier', str(name))
                    l.add_value('name', name)
                    l.add_value('url', url)
                    l.add_value('price', price)
                    yield l.load_item()
                self.save_attr(attr_codes)
        if not found_products:
            prices = item.select(
                "form/div[@id='pageTitle']/div[@id='addToCart']/"
                "div[@class='price-box']/span[@class='regular-price']/"
                "span[@class='price']/text()").extract()
            if not prices:
                continue
            l = ProductLoader(item=Product(), response=response)
            l.add_value('identifier', str(title))
            l.add_value('name', title)
            l.add_value('url', url)
            l.add_value('price', prices[0])
            yield l.load_item()
示例10: parse
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse(self, response):
    """Yield the product matching the request's ``sku`` / ``notes`` meta.

    When the page shows a main price, one item with the page headline as
    name is yielded.  Otherwise each ``dummies`` div is inspected and an
    item is yielded for every div whose gray-font number equals
    ``response.meta['notes']``.  Missing name, option name, option number
    or option price are logged and skipped.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url
    sku = response.meta['sku']
    sec_number = response.meta['notes']

    def build_item(item_name, item_price):
        # Loader boilerplate shared by both branches.
        loader = ProductLoader(item=Product(), response=response, selector=selector)
        loader.add_value('url', page_url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        loader.add_value('sku', sku)
        return loader.load_item()

    headline = selector.select("//h1[contains(@class, 'hheadline')]/text()").extract()
    if not headline:
        logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, page_url))
        return
    name = headline[0].strip()

    main_price = selector.select("//div[@class='orderblock']/form/div[@class='orderblock-big']/b/font/text()").extract()
    if main_price:
        yield build_item(name, main_price[0].strip())
        return

    for option in selector.select("//div[@class='dummies']"):
        add_name = option.select("strong[1]/text()").extract()
        if not add_name:
            logging.error('ERROR!! NO ADD NAME!! %s "%s"' % (sku, page_url))
            continue
        add_number = option.select("font[@color='gray']/text()").extract()
        if not add_number:
            logging.error('ERROR!! NO ADD NUMBER!! %s "%s"' % (sku, page_url))
            continue
        option_price = option.select("b[1]/text()").extract()
        if not option_price:
            logging.error('ERROR!! NO ADD PRICE!! %s "%s"' % (sku, page_url))
            continue
        if sec_number == add_number[0]:
            yield build_item(add_name[0], option_price[0])
示例11: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield one Product per price variant found in the ``pricetext`` div.

    The base name is the first ``<b>`` text of the div.  Three layouts
    are handled:

    * ``style1`` spans present: each span text is paired with the div's
      text nodes starting at index 1, giving ``"<base> <span text>"``.
    * no spans and more than two text nodes: each text node from index 1
      up to (not including) the last that looks like ``label: price``
      yields ``"<base><label>"``.
    * otherwise: the text node at index 1 is the price of the base product.

    Each variant name is built from the base name alone.  The original
    appended to a running ``name``, so later variants carried the labels
    of every earlier one.  Missing price text nodes end the pairing
    instead of raising IndexError.
    """
    hxs = HtmlXPathSelector(response)
    name = hxs.select("//div[@id='pricetext']/b/text()").extract()
    if not name:
        print("ERROR!! NO NAME!! %s" % response.url)
        logging.error("ERROR!! NO NAME!! %s, %s" % (name, response.url))
        return
    base_name = name[0].strip()
    url = response.url

    def build_item(item_name, item_price):
        # Loader boilerplate shared by all three layouts.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        loader.add_value('sku', '')
        return loader.load_item()

    names2 = hxs.select("//div[@id='pricetext']/span[@class='style1']/text()").extract()
    prices = hxs.select("//div[@id='pricetext']/text()").extract()
    price_position = 1
    if names2:
        # zip() stops at the shorter sequence, so a missing price ends the loop.
        for name2, price in zip(names2, prices[price_position:]):
            yield build_item("%s %s" % (base_name, name2.strip()), price)
        return

    if len(prices) > (price_position + 1):
        # The last text node is deliberately excluded, as in the original.
        for text in prices[price_position:-1]:
            m = re.search(r"(.*?):[\s]*(\$?[\d,.]+)$", text)
            if m:
                # NOTE(review): no separator between base name and label,
                # as in the original -- confirm this is intended.
                yield build_item(base_name + m.group(1).strip(), m.group(2))
    elif len(prices) > price_position:
        yield build_item(base_name, prices[price_position])
示例12: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield products from the ``item`` select or from the page headings.

    * With ``select#item`` options: each option text is matched against
      ``(.*)-.*\\xa3(.*)`` to split it into name and price.  Options that
      do not match are skipped; the original raised AttributeError on
      ``None.groups()``.
    * Without options: ``<h1>`` texts are paired with the
      ``largeheadblackcentred`` span texts (the first of each is dropped
      when there is more than one heading), and the part of the heading
      after its last ``-`` is removed.

    When nothing was loaded, the same URL is requested again, up to 3
    retries tracked in ``response.meta['retries']``.  Non-HTML responses
    are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loaded = False
    multiple_prices = hxs.select(u'//select[@id="item"]/option/text()').extract()
    if not multiple_prices:
        names = hxs.select(u'//td/h1/text()').extract()
        prices = hxs.select(u'//span[@class="largeheadblackcentred"]/text()').extract()
        if len(names) > 1:
            names = names[1:]
            prices = prices[1:]
        # zip() stops once either list runs out.
        for name, price in zip(names, prices):
            name = '-'.join(name.split('-')[:-1])
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name.strip())
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', price)
            loaded = True
            yield product_loader.load_item()
    else:
        for name_and_price in multiple_prices:
            match = re.match('(.*)-.*\xa3(.*)', name_and_price)
            if match is None:
                continue
            name, price = match.groups()
            # name_and_price is an extracted string, not a selector, so the
            # loader is built from the response instead.
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name.strip())
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', price)
            loaded = True
            yield product_loader.load_item()
    retries = response.meta.get('retries', 0)
    if not loaded and retries < 3:
        log.msg('Retrying %s' % response.url, level=log.WARNING)
        yield Request(response.url, meta={'retries': retries + 1},
                      dont_filter=True, callback=self.parse_product)
示例13: parse_item
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_item(self, response):
    """Yield the page's product, priced net of any eco tax shown.

    Name and price are read from the ``primary_block`` heading and the
    ``our_price_display`` span; a missing one is logged and the page is
    skipped.  When a ``price-ecotax`` span exists, its amount is
    subtracted from the price.  The name doubles as the item identifier.
    """
    page_url = response.url
    selector = HtmlXPathSelector(response)

    titles = selector.select("//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()").extract()
    if not titles:
        logging.error("NO NAME! %s" % page_url)
        return
    title = titles[0]

    raw_prices = selector.select("//p[@class='price']/span[@class='our_price_display']/span/text()").extract()
    if not raw_prices:
        logging.error("NO PRICE! %s" % page_url)
        return
    amount = Decimal(extract_price2uk(raw_prices[0]))

    eco_tax = selector.select("//p[@class='price-ecotax']/span/text()").extract()
    if eco_tax:
        # Non-ASCII characters are dropped before printing and parsing.
        eco_text = eco_tax[0].encode("ascii", "ignore")
        print("Found eco tax %s" % eco_text)
        amount = amount - Decimal(extract_price2uk(eco_text))

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value("identifier", str(title))
    loader.add_value("name", title)
    loader.add_value("url", page_url)
    loader.add_value("price", unicode(amount))
    yield loader.load_item()
示例14: parse_product
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_product(self, response):
    """Yield listed products whose manufacturer id matches the request's.

    Product containers are the grandparents of every ``<a>`` whose href
    contains ``ProductDetail`` (de-duplicated through a set).  For each
    container the name is the text of its second ``<font>``, the price
    the ``<nobr>`` text node containing ``$``, and the first ``<nobr>``
    text is compared case-insensitively with ``response.meta['mfrgid']``.
    Only matching containers yield an item.

    Changes from the original: the url is added to the loader once rather
    than twice, the unused XPath selector is no longer built, and
    containers without a second ``<font>`` or a ``<nobr>`` are skipped
    instead of raising.  Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return
    soup = BeautifulSoup(response.body)
    detail_href = re.compile('ProductDetail')
    links = soup.findAll('a', href=detail_href)
    containers = {link.parent.parent for link in links}
    for container in containers:
        fonts = container.findAll('font')
        first_nobr = container.find('nobr')
        if len(fonts) < 2 or first_nobr is None:
            continue
        name = fonts[1].text
        price = container.find('nobr', text=re.compile(r'\$'))
        link = container.find('a', href=detail_href)
        if link is not None:
            url = urljoin_rfc(get_base_url(response), link['href'])
        else:
            url = response.url
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        site_mfrgid = first_nobr.text
        if site_mfrgid:
            site_mfrgid = site_mfrgid.strip().lower()
        mfrgid = response.meta['mfrgid'].strip().lower()
        if site_mfrgid == mfrgid:
            yield product_loader.load_item()
示例15: parse_item
# 需要导入模块: from product_spiders.items import ProductLoader [as 别名]
# 或者: from product_spiders.items.ProductLoader import load_item [as 别名]
def parse_item(self, response):
    """Yield the product described by the request meta, priced from the page.

    ``sku``, ``notes`` and ``name`` come from ``response.meta``; the price
    is the ``offer_price`` span of the ``productDetail`` table.  A page
    without a price is logged and skipped.

    When parsing raises ``lxml.etree.XMLSyntaxError``, the same URL is
    requested again with the same meta.  The re-request is capped at 3
    attempts, tracked in ``meta['retries']``; the original re-requested
    without any limit and could loop forever on a permanently broken page.
    """
    max_retries = 3
    url = response.url
    sku = response.meta['sku']
    notes = response.meta['notes']
    name = response.meta['name'].encode('ascii', 'ignore')
    retries = response.meta.get('retries', 0)
    try:
        # Only the parsing steps are guarded.
        hxs = HtmlXPathSelector(response)
        price = hxs.select("//table[@class='productDetail']//span[@id='offer_price']/text()").extract()
    except lxml.etree.XMLSyntaxError:
        if retries >= max_retries:
            logging.error('ERROR!! GIVING UP AFTER %s RETRIES!! %s "%s" "%s"' % (retries, sku, name, url))
            return
        logging.error("Rerequesting")
        yield Request(
            url,
            callback=self.parse_item,
            meta={'sku': sku, 'notes': notes, 'name': name, 'retries': retries + 1},
            dont_filter=True
        )
        return
    if not price:
        logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, url))
        return
    price = price[0].strip()
    product = Product()
    loader = ProductLoader(item=product, response=response, selector=hxs)
    loader.add_value('identifier', sku)
    loader.add_value('url', url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('sku', sku)
    yield loader.load_item()